src/hg/makeDb/doc/mm9.txt 1.126

1.126 2010/02/23 03:37:21 rhead
Added note for next time about NIAGene track.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.125
retrieving revision 1.126
diff -b -B -U 1000000 -r1.125 -r1.126
--- src/hg/makeDb/doc/mm9.txt	16 Feb 2010 04:47:43 -0000	1.125
+++ src/hg/makeDb/doc/mm9.txt	23 Feb 2010 03:37:21 -0000	1.126
@@ -1,10214 +1,10217 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes browser build for the mouse
 # genome, April 2007, ncbi mouse_37 - Mm9
 #
 #	"$Id$"
 #
 
 #######################################################################
 # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2007-04-05 - Hiram)
 #
 #	Examine disk space issues, find some goodly amount of space
     ssh kkstore02
     mkdir /cluster/store5/mm9
     ln -s /cluster/store5/mm9 /cluster/data/mm9
     cd /cluster/data/mm9
     ## After testing with the pre-release below, the real thing begins here
     mkdir mouse_37
     cd mouse_37
     ## Ouch, the files are no longer delivered conveniently in a single
     ## directory.  They are in several locations now ...
     
 NCBI=ftp://ftp.ncbi.nih.gov/genomes
 MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/initial_release
 for F in README README_CURRENT_BUILD
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/${F}" -O ${F}
 done
 for F in allcontig.agp.gz seq_contig.md.gz ideogram.gz
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/${MAPVIEW}/${F}" -O ${F}
 done
     # survey the strains contained in seq_contig.md.gz
     zcat seq_contig.md.gz | awk '{print $9}' | sort | uniq -c | sort -rn
   13075 Celera
     360 C57BL/6J
     101 129/SvJ
      93 129/Sv
      79 unknown
      75 129/SvEvTac
      40 NOD
      26 129S7/SvEv
      14 129/Ola
       7 129
       6 Cast/Ei
       6 BALB/c
       3 SJL/J
       3 C3H
       3 B6/CBAF1J
       3 AKR/J
       3 A/J
       2 Spret/Ei
       1 group_label
       1 129/J
     # we will work on the C57BL/6J strain
 
 mkdir -p chrAgp
 cd chrAgp
 for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.agp.gz" \
         -O chr${C}.agp.gz
 done
 
 cd ..
 for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
 do
     zcat chrAgp/chr${C}.agp.gz | grep "^c"
 done > chrOnly.agp
 
 mkdir -p chrfasta
 cd chrfasta
 for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.fa.gz" \
         -O chr${C}.fa.gz
 done
 
 cd ..
 # fetch the unplaced (chrUn) and mitochondrial (chrM) sequence; the chrM
 # download keeps NCBI's mm_ref_chrMT name because the chrNamesFixed step
 # later reads chrM/mm_ref_chrMT.fa.gz
 mkdir chrUn
 mkdir chrM
 wget  --dont-remove-listing --timestamping \
 	"${NCBI}/M_musculus/CHR_Un/mm_ref_chrUn.fa.gz" -O chrUn/chrUn.fa.gz
 wget  --dont-remove-listing --timestamping \
 	"${NCBI}/M_musculus/CHR_MT/mm_ref_chrMT.fa.gz" \
 	-O chrM/mm_ref_chrMT.fa.gz
 
 mkdir contigFasta
 for C in 1 2 3 4 5 6 7 8 9
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/CHR_0${C}/mm_ref_chr${C}.fa.gz" \
         -O contigFasta/chr${C}.fa.gz
 done
 for C in 0 1 2 3 4 5 6 7 8 9
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/CHR_1${C}/mm_ref_chr1${C}.fa.gz" \
         -O contigFasta/chr1${C}.fa.gz
 done
 for C in X Y Un MT
 do
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/M_musculus/CHR_${C}/mm_ref_chr${C}.fa.gz" \
         -O contigFasta/chr${C}.fa.gz
 done
     mv contigFasta/chrMT.fa.gz contigFasta/chrM.fa.gz
     ## split up the contigFasta files into their individual contigs
     ## the sed fixes the fasta header name to just be the contig name
     mkdir splitContigs
 ## unpack every per-chromosome contig fasta into splitContigs/<chr>/,
 ## one file per contig, with the header trimmed to the bare contig name
 for gz in contigFasta/chr*.fa.gz; do
     gzName=$(basename ${gz})
     chrom=${gzName%.fa.gz}
     echo ${gz} ${gzName} ${chrom}
     echo -n "${chrom} working ... "
     outDir=splitContigs/${chrom}
     mkdir -p ${outDir}
     zcat ${gz} \
 	| sed -e "s/.*ref|/>/; s/|.*//" \
 	| faSplit byname stdin ${outDir}/
     echo "done"
 done
     ## create agp files for the randoms from seq_contig.md and allcontig.agp
     ## both fragment and contig agp files
     $HOME/kent/src/hg/mouseStuff/buildTools/seqContigToAgp.pl \
 	randomFragments.agp randomContigs.agp 2> randomContigs.err
     ## create contig agp file for non-randoms
     $HOME/kent/src/hg/mouseStuff/buildTools/mkContigAgp.pl allContigs.agp
     ## combine the two contig agp files
     cat allContigs.agp randomContigs.agp > mm9.contigs.agp
     ## separate the random contigs from the non-random contigs
     $HOME/kent/src/hg/mouseStuff/buildTools/sortRandoms.pl \
 	randomContigs.agp > mvRandoms.sh
     ## inspect mvRandoms.sh and then run it if it is OK
     chmod +x mvRandoms.sh
     ./mvRandoms.sh
     ## verify all contigs exist properly
     $HOME/kent/src/hg/mouseStuff/buildTools/checkContigs.pl mm9.contigs.agp
     ## create all contigs fasta file
     cd splitContigs
     find . -type f | xargs cat > ../mm9.contigs.fa
     ## create assembled sequence from these contigs and agp file
     cd ..
     agpToFa -simpleMulti mm9.contigs.agp all mm9.assembled.fa mm9.contigs.fa
     ## create fragments agp file
     cat chrOnly.agp randomFragments.agp > mm9.fragments.agp
     ## verify this agp too will work with the assembled fasta
     ## need 2bit file to avoid fasta file ordering difficulty
     faToTwoBit mm9.assembled.fa mm9.assembled.2bit
     checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
     ## it has a problem with chrY because it is supposed to end with:
 chrY    2902556 5902555 29      N       3000000 centromere      no
 chrY    5902556 15902555        30      N       10000000        contig  no
     ## edit mm9.contigs.agp to add these two lines, and repeat the agpToFa
     ## after that, this check fails on chrX_random
     ## this is supposed to be a gap, with N's
     ## chrX_random     300319  303472  46      N       3154  fragment  yes
 # Loop: chrX_random, dnaOffset=300318, seqSize=1785075
 # agpFrag->chromStart: 300318, agpFrag->chromEnd: 303472, dnaOffset: 300318
 # FASTA gap entry
 # Bad char a found at index 300349
 # Invalid Agp or Fasta file entry for sequence chrX_random
 # agpMatchesFaEntry failed; exiting
     ## this comes from the use of a single fragment in two parts,
     ## from allcontig.agp
 NT_165789.2     296206  300318  45      W       CAAA01187194.1  1       4113  +
 NT_165789.2     300319  300349  46      N       31      fragment        no      
 NT_165789.2     300350  303372  47      W       CAAA01187194.1  4145    7167
     ## which I processed into:
 chrX_random     296206  300318  45      W       CAAA01187194.1  1       4113  +
 chrX_random     300319  303472  46      N       3154    fragment        yes
     ## should have been
 chrX_random     296206  300318  45      W       CAAA01187194.1  1       4113  +
 chrX_random     300319  300349  46      N       31      fragment        yes
 chrX_random     300350  303372  47      W       CAAA01187194.1  4145    7167  +
 ### NCBI had this as a non-bridged fragment, a 'no' - I'm making it a yes
     ## so, edit the randomFragments.agp to fixup that line as indicated
     ## the chrOnly.agp file also needs an entry for chrM, add this
     ## line to chrOnly.agp:
 chrM    1       16299   1       F       NC_005089.1     1       16299   +
     ## now have successful business:
     checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
 # All AGP and FASTA entries agree - both files are valid
     ## let's get the sequence in order in the fasta file
     faSplit byname mm9.assembled.fa splitChr/
     cut -f1 mm9.fragments.agp | uniq -c
     ## using the order of this fragments.agp file
     for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y M \
 	13_random 16_random 17_random 1_random 3_random 4_random 5_random \
 	7_random 8_random 9_random Un_random X_random Y_random
 do
     cat splitChr/chr${C}.fa
 done > mm9.fragorder.assembled.fa
     ## now that fasta file should also be OK
     checkAgpAndFa mm9.fragments.agp mm9.fragorder.assembled.fa
 # All AGP and FASTA entries agree - both files are valid
     ## now ready to give this agp and fasta file off to makeGenomeDb.pl
 
     ## pre-release testing download sequence  ###############################
     mkdir ncbi
     cd ncbi
     cp -p /cluster/data/mm8/ncbi/.wgetrc .
     WGETRC=`pwd`/.wgetrc
     export WGETRC
 
     time nice -n +19 wget --timestamping --force-directories \
 	--directory-prefix=. --dont-remove-listing --recursive \
 	--level=4 --no-parent --no-host-directories --cut-dirs=1 \
 	ftp://ftp-private.ncbi.nih.gov/mouse_37
     #	Downloaded: 2,599,733,765 bytes in 196 files
 
     #	The pre-release sequence, April 5th:
     mkdir /cluster/data/mm9/pre_release
     cd /cluster/data/mm9/pre_release
     #	The .wgetrc is the anonymous user
     cat << '_EOF_' > .wgetrc
 login = anonymous
 passwd = <your email address>
 '_EOF_'
     # << happy emacs
     chmod 600 .wgetrc
     WGETRC=`pwd`/.wgetrc
     export WGETRC
     wget --timestamping --force-directories --directory-prefix=. \
 	--dont-remove-listing --recursive --level=4 --no-parent \
 	--no-host-directories --cut-dirs=3 \
 	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release
     ##	Ran a quick test build with that to see if it would work
 
     ### this procedure run for the pre_release and the mouse_37 sequence
     ### for pre_release the sed was:
     # zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>lcl|/>/; s/.fa.*//"
     mkdir chrNamesFixed
     for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
 do
     zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>gi.*/>chr${C}/" \
 	| gzip -c > chrNamesFixed/chr${C}.fa.gz
     echo chr${C} done
 done
     zcat chrM/mm_ref_chrMT.fa.gz | sed -e "s/^>gi.*/>chrM/" \
 	| gzip -c > chrNamesFixed/chrM.fa.gz
 
     ## later on, an error was discovered in the processing of chrY_random
     # a lot of gaps of size zero were inserted.  They didn't cause any
     # disruption to the assembly track, they only caused extra gap entries
     # that were useless.  So, to fix up, remove anything in the
     # chrY_random_gap table that has a size of zero:
     hgsql -e 'delete from chrY_random_gap where size<"1";' mm9
 
     ## And, fixing the one fragment on chrX_random
     ## the gap row uses the AGP start minus one (300319 -> 300318) and
     ## the AGP end unchanged (300349)
     hgsql -e 'INSERT chrX_random_gap VALUES("587", "chrX_random",
 "300318", "300349", "46", "N", "31", "fragment", "yes")' mm9
 
     ## replace the single gold row starting at 296205 with two rows,
     ## one for each side of the 31 base gap
     hgsql -e 'DELETE from chrX_random_gold where chromStart="296205";' mm9
     hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
 "296205", "300318", "45", "W", "CAAA01187194.1", "0", "4113", "+")' mm9
     ## NOTE(review): the corrected AGP lines earlier in this file number
     ## this second fragment 47, the ix value inserted here is "45" - confirm
     hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
 "300349", "303372", "45", "W", "CAAA01187194.1", "4144", "7167", "+")' mm9
 
 
 ##########################################################################
 ## final makeGenomeDb.pl (DONE - 2007-07-19 - Hiram)
     ## to make this go again, some things need to be removed or set-aside
 
     ssh hgwdev
     hgsql -e 'delete from dbDb where name="mm9";' hgcentraltest
     rm -fr /gbdb/mm9
 
     ssh kkstore06
     cd /cluster/data/mm9
     mv mm9.config.ra mm9.config.pre_release.ra
     mv bed bed.pre_release
     mv mm9.unmasked.2bit mm9.unmasked.2bit.pre_release
     mv mm9.agp mm9.agp.pre_release
     mv mm9.randoms.2bit mm9.randoms.2bit.pre_release
     mv mm9.rmsk.2bit mm9.rmsk.2bit.pre_release
     mv mm9.rmskTrf.2bit mm9.rmskTrf.2bit.pre_release
     rm mm9.2bit
     rm -fr ? ??
     mv dbDbInsert.sql dbDbInsert.sql.pre_release
     mv makeGenomeDb.out makeGenomeDb.out.pre_release
     mv chrom.lst chrom.lst.pre_release
     mv jkStuff jkStuff.pre_release
     ## ask cluster-admin to rename the existing mm9 db to be mm9prerelease
 
     cat << '_EOF_' > mm9.config.ra
 # Config parameters for makeGenomeDb.pl:
 db mm9
 scientificName Mus musculus
 commonName Mouse
 assemblyDate Jul. 2007
 assemblyLabel NCBI Build 37
 orderKey 121
 mitoAcc none
 fastaFiles /cluster/data/mm9/mouse_37/mm9.fragorder.assembled.fa
 agpFiles /cluster/data/mm9/mouse_37/mm9.fragments.agp
 # qualFiles /dev/null
 dbDbSpeciesDir mouse
 '_EOF_'
     # << happy emacs
     time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
     #	real    24m24.468s
 
     ssh hgwdev
     featureBits mm9 gold
 # 2620346158 bases of 2620346158 (100.000%) in intersection
     featureBits mm8 gold
 # 2567283971 bases of 2567283971 (100.000%) in intersection
     featureBits mm9 gap
 # 105419323 bases of 2620346158 (4.023%) in intersection
     featureBits mm8 gap
 # 97171117 bases of 2567283971 (3.785%) in intersection
 
     #	verify index is correct:
     hgsql mm9 -e "show index from gc5Base;"
     #	should see good numbers in Cardinality column
 
     #	Reset default position to be like Mm8
     hgsql -e \
 'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm9";' \
 	hgcentraltest
 
     # create initial symlink for 2bit sequence
     mkdir /gbdb/mm9
     mkdir /gbdb/mm9/html
     ln -s /cluster/data/mm9/mm9.unmasked.2bit /gbdb/mm9/mm9.2bit
 
     ## enter the trackDb business (was done in the pre-release test)
 
 ##########################################################################
 ## Initial pre-release makeGenomeDb.pl (DONE - 2007-04-05 - Hiram)
     ssh kkstore02
     cd /cluster/data/mm9
     cat << '_EOF_' > mm9.config.ra
 # Config parameters for makeGenomeDb.pl:
 db mm9
 scientificName Mus musculus
 commonName Mouse
 assemblyDate Apr. 2007
 assemblyLabel NCBI Build 37
 orderKey 121
 mitoAcc 33115104
 fastaFiles /cluster/data/mm9/pre_release/chrNamesFixed/chr*.fa.gz
 agpFiles /cluster/data/mm9/pre_release/chrOnly.agp
 # qualFiles /dev/null
 dbDbSpeciesDir mouse
 '_EOF_'
     # << happy emacs
     time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
     #	real    24m24.468s
 
 ##########################################################################
 ## Repeat masker (DONE - 2007-04-05 - Hiram)
 ## 	RE-DONE with final sequence 2007-07-19 - Hiram
     ssh kkstore06
     ## use screen for this
     mkdir /cluster/data/mm9/bed/RepeatMasker
     cd /cluster/data/mm9/bed/RepeatMasker
     time nice -n +19 doRepeatMasker.pl -bigClusterHub=kk \
 	-buildDir=/cluster/data/mm9/bed/RepeatMasker mm9 > do.out 2>&1 &
     #	real    1726m32.849s
 # Completed: 5467 of 5467 jobs
 # CPU time in finished jobs:   54774630s  912910.50m 15215.17h  633.97d  1.737 y
 # IO & Wait Time:                432302s    7205.04m   120.08h    5.00d  0.014 y
 # Average job time:               10098s     168.30m     2.81h    0.12d
 # Longest finished job:           20982s     349.70m     5.83h    0.24d
 # Submission to last job:        100294s    1671.57m    27.86h    1.16d
 
     ssh kkstore06
     cd /cluster/data/mm9
     twoBitToFa mm9.rmsk.2bit stdout | faSize stdin
 # 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
 #	1153701322 lower) in 35 sequences in 1 files
 # %42.33 masked total, %44.03 masked real
 
 ##############################################################################
 ## simpleRepeat masking (DONE - 2007-04-07 - Hiram)
 ##	RE-DONE with final sequence 2007-07-19 - Hiram
     ssh kolossus
     ## use screen for this
     mkdir /cluster/data/mm9/bed/simpleRepeat
     cd /cluster/data/mm9/bed/simpleRepeat
     time nice -n +19 twoBitToFa ../../mm9.unmasked.2bit stdout \
 	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
 		-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
     #	real    253m44.602s
     #	Appears to have an error on something:
 # sh: line 1: 18346 File size limit exceeded/cluster/bin/i386/trf /scratch/tmp/stdin_kolossus_3af1_fe9700.tf 2 7 7 80 10 50 2000 -m -d
 # Expecting 14 words line 4593 of /scratch/tmp/stdin_kolossus_3af1_fe9700.tf.2.7.7.80.10.50.2000.dat got 1
 
     #	Let's try running this on the kki kluster, by chrom
     ssh kkr1u00
     mkdir /iscratch/i/mus/mm9
     cd /iscratch/i/mus/mm9
     cp -p /cluster/data/mm9/mm9.unmasked.2bit .
     cp -p /cluster/data/mm9/chrom.sizes .
     cut -f1 chrom.sizes | while read C
 do
   twoBitToFa -noMask -seq=${C} mm9.unmasked.2bit stdout | gzip -c > ${C}.fa.gz
   echo ${C}
 done
 
     for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress /iscratch/i/mus/mm9/ kkr${R}u00:/iscratch/i/mus/mm9/
 done
 
     ssh kki
     mkdir /cluster/data/mm9/bed/simpleRepeat/trf
     cd /cluster/data/mm9/bed/simpleRepeat/trf
 
     cat << '_EOF_' > runTrf
 #!/bin/csh -fe 
 #
 set C = $1
 set GZ = /iscratch/i/mus/mm9/$C.fa.gz
 mkdir -p /scratch/tmp/$C
 zcat $GZ > /scratch/tmp/$C/$C.fa
 pushd /scratch/tmp/$C
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \
 	/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
 popd
 rm -f $C.bed
 cp -p /scratch/tmp/$C/$C.bed .
 rm -fr /scratch/tmp/$C
 '_EOF_'
     # << happy emacs
     chmod +x runTrf
 
     cat << '_EOF_' > template
 #LOOP
 ./runTrf $(path1) {check out line $(root1).bed}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     cut -f1 /iscratch/i/mus/mm9/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     para try ... check ... push ... etc ...
     ## none of these jobs had any trouble; comparing line counts of these
     ## result bed files with the previous failed run indicates they are identical
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:      14620s     243.66m     4.06h    0.17d  0.000 y
 # IO & Wait Time:                   272s       4.54m     0.08h    0.00d  0.000 y
 # Average job time:                 425s       7.09m     0.12h    0.00d
 # Longest finished job:            1386s      23.10m     0.39h    0.02d
 # Submission to last job:          1790s      29.83m     0.50h    0.02d
 
     cat *.bed > ../simpleRepeat.bed
     cd ..
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/simpleRepeat
     time nice -n +19 hgLoadBed mm9 simpleRepeat \
       simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     #	Loaded 1167619 elements of size 16
     #	real    0m33.312s
 
     nice -n +19 featureBits mm9 simpleRepeat
     #	80054947 bases of 2620346158 (3.055%) in intersection
 
     ## clean up the /iscratch/i/mus/mm9/ directory
     ## for downloads:
     mkdir trfMaskChrom
     cd trfMaskChrom
     ln -s ../trf/chr*.bed .
 
 ###########################################################################
 # CREATE MICROSAT TRACK (DONE - 2007-07-20 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/microsat
     cd /cluster/data/mm9/bed/microsat
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
 	../simpleRepeat/simpleRepeat.bed > microsat.bed 
     hgLoadBed mm9 microsat microsat.bed
     #	Loaded 195688 elements of size 4
 
     featureBits mm9 microsat
 # 8713212 bases of 2620346158 (0.333%) in intersection
     featureBits mm8 microsat
 # 8570611 bases of 2567283971 (0.334%) in intersection
 
 #############################################################################
 # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2007-07-21 - Hiram)
 
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
     ssh kkstore06
     cd /cluster/data/mm9/bed/simpleRepeat
     ## write a filtered copy of each trf/chr*.bed into trfMask/ under the
     ## same file name, keeping only the lines whose field 5 is <= 12
     mkdir trfMask
     for bed in trf/chr*.bed; do
 	name=${bed#trf/}
 	echo "${bed} -> ${name}"
 	awk '$5 <= 12' ${bed} > trfMask/${name}
     done
 
     ## Add trfMask to repeat masked sequence
     ssh kkstore06
     cd /cluster/data/mm9
     cat  << '_EOF_' > addTrf.csh
 #!/bin/csh -efx
 # This script will fail if any of its commands fail.
 
 set DB = mm9
 set WORK_DIR = /cluster/data/${DB}
 cd ${WORK_DIR}
 set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
 set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
 cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
         | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
 twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
 '_EOF_'
     # << happy emacs
     chmod +x ./addTrf.csh
     time ./addTrf.csh
     cat faSize.mm9.rmskTrf.txt
 # 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
 #	1155308080 lower) in 35 sequences in 1 files
 # %42.38 masked total, %44.09 masked real
 
     ln -s mm9.rmskTrf.2bit mm9.2bit
     # fixup /gbdb/mm9/mm9.2bit symlink to this newly masked sequence
 
     ## copy to san for genbank kluster run
     cd /cluster/data/mm9
     cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2007-04-09 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm9", "blat14", "17790", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm9", "blat14", "17791", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ########################################################################
 ##  CYTOBAND - ideogram track (DONE - 2007-08-15 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/cytoBand
     cd /cluster/data/mm9/bed/cytoBand
 
     # Create bed file
     # (this script fixed up to eliminate one of the lines from ideogram file)
     $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ../../mouse_37/ideogram
     ### doesn't work, the ideogram file is corrupted, use the one fetched below
     ## as so:
     $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ideogram
     ## can now verify before load:
     $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
     #	everything checks out OK on 21 chroms
     # Load the bed file
     hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
 	mm9 cytoBand cytoBand.bed
     # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
     hgsql -e "drop table cytoBandIdeo;" mm9
     hgsql mm9 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"
 
     ## fetch updated ideogram.gz file that has been fixed by NCBI
     NCBI=ftp://ftp.ncbi.nih.gov/genomes
     MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/updates
     wget  --dont-remove-listing --timestamping \
         "${NCBI}/${MAPVIEW}/ideogram.gz" -O ideogram.gz
     ## run through the createNcbiCytoBand.pl process above, and then load
     ## can now verify before load:
     $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
     #	everything checks out OK on 21 chroms
 
 ##########################################################################
 ## GENBANK alignments (DONE - 2007-08-03 - Hiram)
     ## next time:  don't forget to make the 11.ooc file, see below
     ## generate a lift file that specifies segments separated by non-bridged
     ## gaps
     ## make the ooc file
     ssh kolossus
     cd /cluster/data/mm9
     time blat mm9.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
     #	real    2m29.455s
     cp -p 11.ooc /san/sanvol1/scratch/mm9
     cp -p 11.ooc jkStuff
     ## also setup /iscratch/i/mus/mm9/ with these files for
     ## other kluster runs:
     #	-rw-rw-r--  1 712923274 Jul 21 13:31 mm9.2bit
     #	-rw-rw-r--  1     17179 Jul 23 16:18 nonBridgedGap.lft
     #	-rw-rw-r--  1    122352 Jul 24 11:32 11.ooc
 
     ssh hgwdev
     cd /cluster/data/mm9/jkStuff
     gapToLift mm9 nonBridgedGap.lft
 # WARNING: gap at end of chromosome at chrY:5902555-15902555
 # WARNING: overlapping gap at chrY:2902555-5902555 and chrY:5902555-15902555
     ## These warnings are true, chrY has two gaps next to each other, and
     ## the second one is actually the end of the chrom.  This is the way the
     ## NCBI supplied AGP file is.  (this seems to be normal in hg18 too ...)
     cp -p nonBridgedGap.lft /san/sanvol1/scratch/mm9
     cd ..
     cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit
     ## The genbank.conf entry looks like:
 # mm9
 mm9.serverGenome = /cluster/data/mm9/mm9.2bit
 mm9.clusterGenome = /san/sanvol1/scratch/mm9/mm9.2bit
 mm9.ooc = /cluster/data/mm9/11.ooc
 mm9.align.unplacedChroms = *
 mm9.lift = /cluster/data/mm9/jkStuff/nonBridgedGap.lft
 mm9.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 mm9.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 mm9.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 mm9.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 mm9.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 mm9.downloadDir = mm9
 mm9.refseq.mrna.xeno.load  = yes
 mm9.refseq.mrna.xeno.loadDesc = yes
 mm9.mgcTables.default = full
 mm9.mgcTables.mgc = all
 
     ssh kkstore02
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial mm9 &
     ##	var/build/logs/2007.07.26-21:57:22.mm9.initalign.log
 
     ## logFile: var/build/logs/2007.07.23-16:44:31.mm9.initalign.log
     #	real    771m12.978s
     #  a couple of failed jobs, finish off the align step manually
     ssh kk
     cd /cluster/bluearc/genbank/work/initial.mm9/align
     para time
 # Completed: 50580 of 50580 jobs
 # CPU time in finished jobs:   14556484s  242608.06m  4043.47h  168.48d  0.462 y
 # IO & Wait Time:                988518s   16475.30m   274.59h   11.44d  0.031 y
 # Average job time:                 307s       5.12m     0.09h    0.00d
 # Longest finished job:            1815s      30.25m     0.50h    0.02d
 # Submission to last job:         40513s     675.22m    11.25h    0.47d
 
     ## after recovery of the alignments jobs
     ssh kkstore02
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -continue=finish -initial mm9 &
     #	var/build/logs/2007.07.27-11:02:00.mm9.initalign.log
     #	real    169m53.124s
 
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm9
     #	var/dbload/hgwdev/logs/2007.07.27-14:10:22.dbload.log
     #	real    54m55.707s
 
     ## each pair of measurements below is from two different runs of genbank:
     ## the first configured as "ordered", the second configured as "finished"
     featureBits mm9 refGene:cds
     #	30105171 bases of 2620346127 (1.149%) in intersection
     #	30113840 bases of 2620346127 (1.149%) in intersection
     featureBits mm9 refGene
     #	51164928 bases of 2620346127 (1.953%) in intersection
     #	51175624 bases of 2620346127 (1.953%) in intersection
     featureBits mm9 mrna
     #	135379415 bases of 2620346127 (5.166%) in intersection
     #	137195240 bases of 2620346127 (5.236%) in intersection
     featureBits mm9 mgcGenes
     #	33676155 bases of 2620346127 (1.285%) in intersection
     #	34012201 bases of 2620346127 (1.298%) in intersection
     featureBits mm9 est
     #	184121510 bases of 2620346127 (7.027%) in intersection
     #	188799620 bases of 2620346127 (7.205%) in intersection
     featureBits mm9 intronEst
     #	52305179 bases of 2620346127 (1.996%) in intersection
     #	52812173 bases of 2620346127 (2.015%) in intersection
     featureBits mm9 xenoMrna
     #	46119254 bases of 2620346127 (1.760%) in intersection
     #	51438566 bases of 2620346127 (1.963%) in intersection
     featureBits mm9 xenoRefGene
     #	40378885 bases of 2620346127 (1.541%) in intersection
     #	44298281 bases of 2620346127 (1.691%) in intersection
 
     # enable daily alignment and update of hgwdev (DONE - 2007-08-03 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add mm9 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added mm9 - Mus musculus" etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 #########################################################################
 # MAP CONTIGS TRACK (DONE - 2007-07-23 - Hiram)
     ## can take contig information directly from previously created
     ## mm9.contigs.agp
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/ctgPos
     cd /cluster/data/mm9/bed/ctgPos
     grep CONTIG ../../mouse_37/mm9.contigs.agp \
 	| awk '{printf "%s\t%d\t%s\t%d\t%d\n", $6, $8, $1, $2-1, $3}' \
 	> mm9.ctgPos.tab
 
     hgsql mm9 < ~/kent/src/hg/lib/ctgPos.sql
     hgsql mm9 -e 'load data local infile "mm9.ctgPos.tab" into table ctgPos;'
 
     featureBits -countGaps mm9 ctgPos
     #	2623952781 bases of 2725765481 (96.265%) in intersection
     featureBits -countGaps mm8 ctgPos
     #	2573322222 bases of 2664455088 (96.580%) in intersection
 
 #########################################################################
 ## Create downloads directory (DONE - 2007-07-25 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
     cd /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
     ln -s ../trf/chr*.bed .
 
     cd /cluster/data/mm9
     time nice -n +19 /cluster/bin/scripts/makeDownloads.pl mm9 \
 	> do.downloads.out 2>&1
     #	real    41m18.282s
     ## failed during jkStuff/doInstall.csh:
 # foreach size ( 1000 2000 5000 )
 # echo 1000
 # featureBits mm9 refGene:upstream:1000 -fa=stdout
 # setpriority: Permission denied.
 # Error writing 50 bytes: Operation not permitted
     ## remove the "nice" statements from the csh, and finish it off
     ## edit the README files to indicate correct information
 
 
 ##########################################################################
 # MGI LIFTOVER FROM MM8 (DONE 2007-07-26 angie)
     ssh kolossus
     mkdir /cluster/data/mm9/bed/jaxLiftOver
     cd /cluster/data/mm9/bed/jaxLiftOver
     ldHgGene -out=stdout -nobin placeholder placeholder \
       /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscript.gff \
     | liftOver stdin -minBlocks=0.5 \
       /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
       -genePred jaxRepTranscriptLift.{gp,unmapped}
 #Read 31587 transcripts in 232925 lines in 1 files
     wc -l jaxRepTranscriptLift.{gp,unmapped}
 #  31470 jaxRepTranscriptLift.gp
 #    234 jaxRepTranscriptLift.unmapped
     liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxAllele.bed \
       /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
       -bedPlus=12 jaxAlleleLift.{bed,unmapped}
     wc -l jaxAlleleLift.{bed,unmapped}
 #  12372 jaxAlleleLift.bed
 #      2 jaxAlleleLift.unmapped
     liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxPhenotype.bed \
       /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
       -bedPlus=12 -tab jaxPhenotypeLift.{bed,unmapped}
     wc -l jaxPhenotypeLift.{bed,unmapped}
 #  23806 jaxPhenotypeLift.bed
 #      0 jaxPhenotypeLift.unmapped
     liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxQtl.bed \
       /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
       -bedPlus=6 -tab jaxQtlLift.{bed,unmapped}
     wc -l jaxQtlLift.{bed,unmapped}
 #  1539 jaxQtlLift.bed
 #    12 jaxQtlLift.unmapped
 
     # Load lifted track tables and original auxiliary tables:
     ssh hgwdev
     # the lifted .gp/.bed files loaded below were written to the mm9
     # jaxLiftOver directory created at the start of this section
     cd /cluster/data/mm9/bed/jaxLiftOver
     # jaxRepTranscriptLift
     ldHgGene -predTab mm9 jaxRepTranscriptLift jaxRepTranscriptLift.gp
 #31470 gene predictions
     sed -e 's/jaxRepTranscript/jaxRepTranscriptLift/g' \
       /cluster/data/mm8/bed/jax/2007_07/fixJaxRepTranscript.sql \
       > fixJaxRepTranscriptLift.sql
     hgsql mm9 < fixJaxRepTranscriptLift.sql
     hgLoadSqlTab mm9 jaxRepTranscriptAlias \
       /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.sql \
       /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.tab
     hgsql mm9 -e 'rename table jaxRepTranscriptAlias to jaxRepTranscriptLiftAlias;'
     # jaxAlleleLift
     sed -e 's/bed12Source/jaxAlleleLift/g' \
       $HOME/kent/src/hg/lib/bed12Source.sql > jaxAlleleLift.sql
     hgLoadBed -sqlTable=jaxAlleleLift.sql mm9 jaxAlleleLift jaxAlleleLift.bed
 #Loaded 12372 elements of size 13
     sed -e 's/jaxAllele/jaxAlleleLift/g' \
       /cluster/data/mm8/bed/jax/2007_07/fixJaxAllele.sql > fixJaxAlleleLift.sql
     # empty file, but just in case it has something in the future...
     hgsql mm9 < fixJaxAlleleLift.sql
     hgLoadSqlTab mm9 jaxAlleleInfo \
       ~/kent/src/hg/lib/jaxAlleleInfo.sql \
       /cluster/data/mm8/bed/jax/2007_07/jaxAlleleInfo.tab
     # jaxPhenotypeLift
     sed -e 's/bed12Source/jaxPhenotypeLift/g' \
       ~/kent/src/hg/lib/bed12Source.sql > jaxPhenotypeLift.sql
     hgLoadBed -tab -sqlTable=jaxPhenotypeLift.sql mm9 jaxPhenotypeLift \
       jaxPhenotypeLift.bed
 #Loaded 23806 elements of size 13
     sed -e 's/jaxPhenotype/jaxPhenotypeLift/g' \
       /cluster/data/mm8/bed/jax/2007_07/fixJaxPhenotype.sql \
       > fixJaxPhenotypeLift.sql
     # empty file, but just in case it has something in the future...
     hgsql mm9 < fixJaxPhenotypeLift.sql
     hgLoadSqlTab mm9 jaxPhenotypeAlias \
       /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.sql \
       /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.tab
     hgsql mm9 -e 'rename table jaxPhenotypeAlias to jaxPhenotypeLiftAlias;'
     # phenotype-allele relationships
     hgLoadSqlTab mm9 jaxAllelePheno \
       ~/kent/src/hg/lib/jaxAllelePheno.sql \
       /cluster/data/mm8/bed/jax/2007_07/jaxAllelePheno.tab
     # jaxQTLLift
     sed -e 's/jaxQTL/jaxQTLLift/g'\
       ~/kent/src/hg/lib/jaxQTL.sql  > jaxQTLLift.sql
     hgLoadBed -tab -notItemRgb -noBin \
       -sqlTable=jaxQTLLift.sql \
       mm9 jaxQTLLift jaxQtlLift.bed
 #Loaded 1539 elements of size 10
     # Add row to mm9.grp for Phenotype and Allele track group:
     hgsql mm9 -e 'insert into grp values("phenoAllele", "Phenotype and Allele", 4.5);'
 
 
 ##########################################################################
 ## Creating pushQ (DONE - 2007-07-26 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/pushQ
     cd /cluster/data/mm9/pushQ
     /cluster/bin/scripts/makePushQSql.pl mm9 > mm9.sql 2> stderr.out
     ## check the stderr.out for anything that needs to be fixed
     ## copy mm9.sql to hgwbeta:/tmp
     scp mm9.sql hgwbeta:/tmp
     ## then on hgwbeta
     ssh hgwbeta
     cd /tmp
     hgsql qapushq < mm9.sql
 
 #############################################################################
 # STS MARKERS DATA DOWNLOAD (DONE - 2007-07-26 - Hiram)
     ssh kkstore06
     mkdir -p /cluster/data/mm9/bed/STSmarkers/downloads
     cd /cluster/data/mm9/bed/STSmarkers/downloads
     # these files appear to be new almost every day
     time nice -n +19 wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
     time nice -n +19 wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
 
     #	The new feature in the .aliases file this time is names with
     #	spaces in them!  This changes our parsing business below;
     #	hopefully the spaces in the names won't cause trouble elsewhere.
 
     time nice -n +19 wget --timestamping \
 ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
 
     # these reports from jax.org appear to be changing daily
     time nice -n +19 wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
     time nice -n +19 wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
     time nice -n +19 wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
     ls -ogrt
 #	-rw-rw-r--  1      676 Mar 11  2004 README
 #	-rw-rw-r--  1   396858 Jan 28  2005 10090.MGI.txt
 #	-rw-rw-r--  1   390139 Mar 16  2005 10090.WI_MRC_RH.txt
 #	-rw-rw-r--  1   240688 Mar 16  2005 10090.WI-YAC.txt
 #	-rw-rw-r--  1   173344 Mar 16  2005 10090.WI-Genetic.txt
 #	-rw-rw-r--  1 25691253 Jan 13  2006 UniSTS.aliases
 #	-rw-rw-r--  1  4582158 Jul  5 11:40 UniSTS_mouse.sts
 #	-rw-rw-r--  1  2841773 Jul 26 03:13 PRB_PrimerSeq.rpt
 #	-rw-rw-r--  1  5149790 Jul 26 03:13 MRK_Sequence.rpt
 #	-rw-rw-r--  1  5697140 Jul 26 03:13 MRK_Dump2.rpt
 
     #	 I note the UniSTS.aliases file is over twice as big as was in
     #	 Mm7 build.  I wonder what got into it ...
     #	What got into it was that it was completely broken.  It appeared
     #	to have a vast section of itself duplicated again in the file.
     #	It was cleaned up via:
     echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
     grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
     mv UniSTS.aliases UniSTS.aliases.broken
     mv uniqueSTS.aliases UniSTS.aliases
 
     # back to our work area, update the bed file
     #	to do this we need a new UniSTS_mouse.alias file
     # it is created by a combination of information from several
     # of the above files ! AND ! the previous stsInfoMouse.bed file
     # the db reference here is to the previous build
     time nice -n +19 ~/kent/src/hg/stsMarkers/fetchAllAliases.csh mm8
 
     #	Here is a normal set of errors:
 # processing UniSTS_mouse.sts to find aliases
 # #       ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
 # #       2384
 # processing MGI.aliases
 # fetching existing aliases from previous stsInfoMouse.bed file
 # found 27648 potential errors in
 #	/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed
 # to see the errors: grep ERROR stsInfoAliases.txt
 # verify those stsInfoMouse.bed aliases with UniSTS.aliases
 
     #	those errors in the previous stsInfoMouse.bed file are an
     #	accumulation of errors from a long long time ago in this chain
     #	of processing.  Some day it might be nice to fix them, but they
     #	don't seem to bother anything, so they continue to be carried
     #	forward, and a couple of new ones are added with each assembly.
 
 ####################################################################
 ##  STS markers data processing track (DONE - 2007-07-26 - Hiram)
     ssh hgwdev
     cd /cluster/data/mm9/bed/STSmarkers
     # create a new stsInfoMouse.bed file:
     #	Update the m m 8 directory name here to m m 9
     #	for the next build of m m 10,  ...etc... and so forth
     time ~/kent/src/hg/stsMarkers/updateBed.pl \
         /cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed \
         downloads/MRK_Dump2.rpt \
 	downloads/PRB_PrimerSeq.rpt \
         downloads/MRK_Sequence.rpt \
 	downloads/UniSTS_mouse.alias \
         downloads/UniSTS_mouse.sts \
         -g downloads/10090.WI-Genetic.txt \
         -r downloads/10090.WI_MRC_RH.txt \
         -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile
 
     ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
         | sed -e "s/\t*$//" > mm9.stsInfoMouse.bed
 
     # copy the stsInfoMouse.bed file from working dir to the marker
     #	info storage folder.  added 2 new steps by Yontao
     #	be wary of the archive name here, check the directory and get
     #	the name right here.
     mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
     cp -p mm9.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
 
     # comparing to previous, numbers increase slightly each time
     wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
     #	66782 /cluster/store5/mouseMarker/stsInfoMouse.bed
     #	60631 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
     #	59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
     #	58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
     #	58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
 
     # and from that, create new primer fa, epcr, etc:
     time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
 	mm9.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
     # the mouseC.fa file will be empty; the mouseP.* line counts should
     # be more than last time
     wc -l mouse?.*
 
     #       0 mouseC.fa
     #  359647 mouseP.fa
     #   41247 mouseP.info
 
     #	the equivalent Mm8 files:
     #	     0 mouseC.fa
     #	308384 mouseP.fa
     #	 34666 mouseP.info
 
     #	copy the primers over to some filesystem close to the klusters
     #	and split them up to have a small number of sequences in one file
 
     mkdir /cluster/bluearc/mm9/stsMarkers
     cp -p mouseP.fa /cluster/bluearc/mm9/stsMarkers
     cd /cluster/bluearc/mm9/stsMarkers
     cp -p /cluster/data/mm9/11.ooc .
     mkdir split
     #	356 files for 41,247 sequences, == about 116 sequences per file
     faSplit sequence mouseP.fa 400 split/mm_
 
     # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
     #	This process could convert to a modern version of blat with the
     #	filters as described, for example, in the STS markers build in Hg18
 
     #  CLUSTER RUN FOR THE STS PRIMERS
     ssh kk
     cd /cluster/data/mm9/bed/STSmarkers
     mkdir primer
     mkdir ePCR
     cd primer
     mkdir out
 
     #	interestingly, this blat2.2 binary did not function correctly
     #	when given nib files.  It finds only about 1/4th of the number of
     #	alignments that it gets when given fa files for the target
     #	sequence.
 
     ls -1S /cluster/bluearc/mm9/stsMarkers/split > primers.list
     #	will fetch chrom sequences from the 2bit file
     cut -f1 /cluster/data/mm9/chrom.sizes > chr.list
 
     ## next time, make this script produce its results in /scratch/tmp
     ## then move result file to output instead of writing result
     ## to output
     cat << '_EOF_' > runBlat2
 #!/bin/csh -fe
 # usage: runBlat2 <primerFile> <seqName> <resultPsl>
 #	$1 - primer fasta file name inside the split/ directory
 #	$2 - name of the sequence to extract from mm9.2bit
 #	$3 - path the finished psl is copied to
 # All work happens in a /scratch/tmp work directory; only result.psl
 # is copied back to $3 and the work directory is removed afterwards.
 set primer = /cluster/bluearc/mm9/stsMarkers/split/$1
 set root1 = $1:r
 set fa = $root1.$2.fa
 set ooc = /cluster/bluearc/mm9/stsMarkers/11.ooc
 set root2 = $2:r
 set tmpDir = /scratch/tmp/$root1.$root2
 # this script runs with -e, so mkdir would abort the job if a killed
 # or crashed earlier attempt left the work directory behind; clear
 # any such leftover first
 rm -fr $tmpDir
 mkdir $tmpDir
 mkdir -p out/${root2}
 set out = $3
 pushd $tmpDir
 twoBitToFa -seq=$2 /iscratch/i/mus/mm9/mm9.2bit ${fa}
 cp -p ${primer} primer.fa
 cp -p ${ooc} 11.ooc
 
 /cluster/bin/i386/blat.2 ${fa} primer.fa -ooc=11.ooc \
         -minMatch=1 -minScore=0 -minIdentity=80 -oneOff result.psl
 popd
 cp -p ${tmpDir}/result.psl ${out}
 rm -fr ${tmpDir}
 '_EOF_'
     #	<< happy emacs
     chmod +x runBlat2
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat2 $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 primers.list chr.list template jobList
     para create jobList
     para try ... check ... push ... etc ...
 # Completed: 12425 of 12425 jobs
 # CPU time in finished jobs:    1438098s   23968.31m   399.47h   16.64d  0.046 y
 # IO & Wait Time:                237582s    3959.69m    65.99h    2.75d  0.008 y
 # Average job time:                 135s       2.25m     0.04h    0.00d
 # Longest finished job:            2150s      35.83m     0.60h    0.02d
 # Submission to last job:          4736s      78.93m     1.32h    0.05d
 
     # on the file server
     ssh kkstore06
     cd /cluster/data/mm9/bed/STSmarkers/primer
     time nice -n +19 pslSort dirs primers.raw.psl temp out/chr*
     #	real    1m34.193s
     #	-rw-rw-r--   1 700293557 Aug  6 10:22 primers.raw.psl
 
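     #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
     #	should not be more than 100 bases different.
     #	In this run (see the wc -l counts below) this filters out
     #	1,157,473 of 6,498,150 psl lines, or
     #	%17.8 = 100.0 * 1157473 / 6498150
     #	(the figures previously noted here were 948,260 alignments, or
     #	%17.4 = 100.0 * 948260 / 5462936)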
     time nice -n +19 pslSort dirs stdout temp out/chr* | awk -F"\t" '
 { if (((($13 - $12) - ($17 - $16)) > -100) &&
 	((($13 - $12) - ($17 - $16)) < 100)) {print}
 }
 ' > primers.100.psl
 
     rmdir temp
 
     wc -l *.psl
     #	5340677 primers.100.psl
     #	6498150 primers.raw.psl
 
     echo "6498150-5340677" | bc -q
     #	1157473 difference
 
     # a rough comparison with previous results:
 
     wc -l primers.100.psl \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100
 # 5340677 primers.100.psl
 # 4514676 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100
 
     # another kluster run for the ePCR
     ssh pk
     cd /cluster/data/mm9/bed/STSmarkers/ePCR
     cut -f1 /cluster/data/mm9/chrom.sizes > chr.list
 
     #	Using previously fetched e-PCR source from
     #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
     #	version 2.3.1 11 Feb 2005
     #	Had to add the following to both re-PCR_main.cpp and
     #	e-PCR_main.cpp to get them to compile on kolossus:
 // max and min Copied from /usr/include/mysql/my_global.h
 #define max(a, b)       ((a) >? (b))
 #define min(a, b)       ((a) <? (b))
 
     mkdir out
     cat << '_EOF_' > runPCR
 #!/bin/csh -fe
 # usage: runPCR <seqName> <resultFile>
 #	$1 - name of the sequence to extract from mm9.2bit
 #	$2 - path the e-PCR output is copied to
 # The sequence is extracted into a /scratch/tmp work directory, e-PCR
 # is run there against mouseP.info, and the result is copied to $2.
 set chr = $1
 set out = $2
 set wrkdir = /scratch/tmp/epcr.mm9.$chr
 set fa = $chr.fa
 set tmpResult = $chr.result.epcr
 # this script runs with -e, so mkdir would abort the job if a killed
 # or crashed earlier attempt left the work directory behind; clear
 # any such leftover first
 rm -fr $wrkdir
 mkdir $wrkdir
 twoBitToFa -seq=$chr /san/sanvol1/scratch/mm9/mm9.2bit $wrkdir/$fa
 pushd $wrkdir
 /cluster/bin/x86_64/e-PCR \
     /cluster/data/mm9/bed/STSmarkers/mouseP.info $fa N=1 M=50 W=5 > $tmpResult
 popd
 cp -p $wrkdir/$tmpResult $out
 # remove only what this job created; rmdir then fails loudly if
 # anything unexpected is left in the work directory
 rm $wrkdir/$tmpResult
 rm $wrkdir/$fa
 rmdir $wrkdir
 '_EOF_'
     # << happy emacs
     chmod +x runPCR
 
     cat << '_EOF_' > template
 #LOOP
 ./runPCR $(path1) {check out line+ out/$(root1).epcr}
 #ENDLOOP
 '_EOF_'
     # << the mouseP.info was created above
     gensub2 chr.list single template jobList
     para create jobList
     para try
     para check
     para push
     ... etc ...
     ## two of those produce zero results:
     #	-rw-rw-r--  1      0 Aug  6 12:53 chr3_random.epcr
     #	-rw-rw-r--  1      0 Aug  6 12:53 chr16_random.epcr
     ## hence, the two crashed jobs in the check display:
 # Completed: 33 of 35 jobs
 # Crashed: 2 jobs
 # CPU time in finished jobs:      80940s    1349.01m    22.48h    0.94d  0.003 y
 # IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 # Average job time:                2327s      38.78m     0.65h    0.03d
 # Longest finished job:            6980s     116.33m     1.94h    0.08d
 # Submission to last job:         15589s     259.82m     4.33h    0.18d
 
     ssh kkstore06
     cd /cluster/data/mm9/bed/STSmarkers/ePCR
     # all those results become all.epcr
     cat out/*.epcr > all.epcr
 
     # comparing to previous results, should have more with new results:
     wc -l all.epcr /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr
     #	87623 all.epcr
     #	58162 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr
 
     cd /cluster/data/mm9/bed/STSmarkers/primer
 
     ~/kent/src/hg/stsMarkers/filterSTSPrimers \
     -mouse ../mm9.stsInfoMouse.bed primers.100.psl \
         ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
 
     #  The output should show an increasing count:
     #	Reading name info from: ../mm9.stsInfoMouse.bed
     #	Reading primer info from: ../mouseP.info
     #	Reading ePCR info from: ../ePCR/all.epcr
     #	Reading alignment results from: primers.100.psl
     #	100000
     #	200000
     #	...
     #	5200000
     #	5300000
     #	Determining ePCR not found from ePCR results
     #	Out of 26332 ePCR alignments examined, not found: 527
 
     ## compare with previous build results
     wc -l primers.psl.filter.blat \
 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat
 # 35537 primers.psl.filter.blat
 # 34043 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat
 
     ## ouch, mm9 does not have lift files for contigs to chroms
     ## let's make a contig lift file
     cd /cluster/data/mm9/jkStuff
     cp -p /cluster/data/cb3/jkStuff/agpToLift.pl .
     grep CONTIG ../mouse_37/mm9.contigs.agp \
 	| ./agpToLift.pl /dev/stdin > mm9.contigs.lift
     awk '{if (! match($5,"N")) print}' ../mouse_37/mm9.fragments.agp \
 	| /cluster/data/rn3/jkStuff/agpToLift.pl ../chrom.sizes /dev/stdin \
 	> mm9.fragments.lift
     cd ..
     mkdir ctgLifts
     splitFileByColumn -col=4 jkStuff/mm9.contigs.lift ctgLifts
     mkdir fragmentLifts
     splitFileByColumn -col=4 jkStuff/mm9.fragments.lift fragmentLifts
     
     ## lay the per-chromosome contig lift files out in the old-style
     ## <chrom>/lift/{ordered,random}.lft hierarchy; each chromosome
     ## starts from a freshly emptied lift directory
     for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M
 do
     rm -fr  ${C}/lift
     mkdir -p ${C}/lift
     # a chromosome may lack either the ordered or the random contig lift
     [ -f ctgLifts/chr${C}.contigs.lift ] \
 	&& cp -p ctgLifts/chr${C}.contigs.lift ${C}/lift/ordered.lft
     [ -f ctgLifts/chr${C}_random.contigs.lift ] \
 	&& cp -p ctgLifts/chr${C}_random.contigs.lift ${C}/lift/random.lft
 done
     ## not the fragments
 #    for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M
 #do
 #    mkdir -p ${C}/lift
 #    if [ -f fragmentLifts/chr${C}.fragments.lift ]; then
 #	cp -p fragmentLifts/chr${C}.fragments.lift ${C}/lift/ordered.lft
 #    fi
 #    if [ -f fragmentLifts/chr${C}_random.fragments.lift ]; then
 #	cp -p fragmentLifts/chr${C}_random.fragments.lift ${C}/lift/random.lft
 #    fi
 #done
 
     ## now, after that side trip, back to the primer business
     # create file accession_info.rdb
     touch empty_sequence.inf
     ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
 	/cluster/data/mm9 empty_sequence.inf
     #	20363 processed
     mv accession_info.rdb accession_info.rdb.tmp
     ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
 	< accession_info.rdb.tmp > accession_info.rdb
     #	The -x prints the debug statement:
     #	sort arg:  -t"  " +0 -1 +1 -2g +2 -3g
     rm accession_info.rdb.tmp
 
     # comparing results to previous
     #	Continuing the trend that began with Mm7, the numbers in
     #	accession_info.rdb continue to decrease.  Even Mm8 has far fewer
     #	fragments than mm7 did:
     #	e.g.:
     [hiram@kkstore06 /cluster/data] wc -l mm9/?/chr*.agp mm9/??/chr*.agp | tail -1
     #	21699 total
     [hiram@kkstore06 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
     #	21910 total
     [hiram@kkstore06 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
     #	70125 total
     [hiram@kkstore06 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
     #	170812 total
 
     # compare the accession_info.rdb line count with the mm8 build
     wc -l accession_info.rdb \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb
     #	20333 accession_info.rdb
     #	20385 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb
 
     # creates epcr.not.found.nomatch and epcr.not.found.psl
     ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
 	epcr.not.found ../mouseP.info \
 	accession_info.rdb /cluster/data/mm9/mm9.2bit 2> dbg.epcrToPsl
     #	the dbg.epcrToPsl has a number of lines complaining about bad
     #	primers in ../mouseP.info - and indeed they are bad primers,
     #	they do not have a second primer.
 
     # Comparing results to previous:
     wc -l epcr* \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr*
 # 527 epcr.not.found
 # 0 epcr.not.found.nomatch
 # 527 epcr.not.found.psl
 # 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found
 # 0 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.nomatch
 # 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.psl
 
     # Mm7 wc epcr*
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
     #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
     #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
     #	1106 total
 
     cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
     wc -l primers.psl.filter \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
     #	36064 primers.psl.filter
     #	34563 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
     # create primers.psl.filter.initial
     #	The PATH setting allows extractPslInfo to find other programs that it
     #	is going to use.
     PATH=~/kent/src/hg/stsMarkers:$PATH \
 	~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter
 
     wc -l *.initial \
 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial
 # 36040 primers.psl.filter.initial
 # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial
 
     # create primers.psl.filter.initial.acc
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/findAccession -agp \
 	-mouse primers.psl.filter.initial /cluster/data/mm9
     wc -l *.initial.acc /cluster/data/mm8/bed\
 /STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc
 # 36040 primers.psl.filter.initial.acc
 # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc
 
 
     # this needs to be -rat as that specifies how to scan the
     # stsInfoMouse.bed file and it does not work if you use -mouse
     # it is not clear what -mouse would mean to this script, perhaps
     # some file format other than the stsInfoMouse.bed format.
     ~/kent/src/hg/stsMarkers/getStsId -rat \
 	../mm9.stsInfoMouse.bed  primers.psl.filter.initial.acc \
 	| sort -k4,4n > primers.final
     wc -l primers.final \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final
 # 36040 primers.final
 # 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final
 
     cd /cluster/data/mm9/bed/STSmarkers
     # stsMarkers.final is empty for mouse
     touch stsMarkers.final dummy
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
 	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
     wc -l stsMarkers_pos.rdb \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb
 # 34232 stsMarkers_pos.rdb
 # 33048 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb
 
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/createStsBed \
 	mm9.stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
 	| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
     #	The sed removes unneeded blanks
     #	verify score profile remains similar
     awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
     #	   591 500
     #	  1774 750
     #	 28529 1000
     awk -F'\t' '{print $5}' \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed \
 	| sort -n | uniq -c
     #	  546 500
     #	 1650 750
     #	27705 1000
 
     wc -l stsMapMouse.bed \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
 # 30894 stsMapMouse.bed
 # 29901 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
     ## check the names, look for odd ones
     ##  the bogus names "-" were fixed for mm9
     awk -F'\t' '{print $4}' stsMapMouse.bed | sort | head
     awk -F'\t' '{print $4}' stsMapMouse.bed | sort | tail
 
     #  loading STS markers tables
     ssh hgwdev
     cd /cluster/data/mm9/bed/STSmarkers
     ~/kent/src/hg/stsMarkers/ucscAlias.pl \
 	mm9.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
     #	this does leave messages in ucscStsAlias.warnings but they seem
     #	to be very similar to Mm6 with just a few new ones
      
     wc -l ucscStsAlias.tab \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab
 # 146359 ucscStsAlias.tab
 # 146767 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab
 
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/STSmarkers
     ## when reloading:
     hgsql -e "drop table stsAlias;" mm9
     hgsql -e "drop table stsMapMouseNew;" mm9
     hgsql -e "drop table stsInfoMouseNew;" mm9
 
     hgsql mm9 < ~/kent/src/hg/lib/stsAlias.sql
     hgsql -e \
 	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm9
     hgsql mm9 < ~/kent/src/hg/lib/stsMapMouseNew.sql
     hgsql -e \
 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm9
     hgsql mm9 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
     hgsql -e \
      'load data local infile "mm9.stsInfoMouse.bed" into table stsInfoMouseNew;' mm9
 
     hgsql -e "drop table all_sts_primer;" mm9
     hgLoadPsl -nobin -table=all_sts_primer mm9 primer/primers.psl.filter
 # load of all_sts_primer did not go as planned: 36064 record(s),
 #	0 row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
     #	After warnings, checkTableCoords to find problems:
     checkTableCoords -verboseBlocks mm9 all_sts_primer
 mm9.all_sts_primer item 61999 chr10:62485403-62485439: blocks 0 and 1 overlap.
 mm9.all_sts_primer has 1 records with overlapping blocks.
     #	Strip the offending item from the load
     hgsql -e 'delete from all_sts_primer where tName="chr10" AND tStart=62485403 AND tEnd=62485439;' mm9
 
     # load primer sequences	
     mkdir /gbdb/mm9/stsMarker
     ln -s /cluster/data/mm9/bed/STSmarkers/mouseP.fa \
 	/gbdb/mm9/stsMarker/mouseP.fa
     # PLEASE NOTE: if you are going to reload this business, use the
     #	-replace option on this hgLoadSeq
     #	hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa
     # otherwise there will be a problem that the seq and extFile tables 
     # will be out of sync. 
     hgLoadSeq -replace  mm9 /gbdb/mm9/stsMarker/mouseP.fa
     #  Adding /gbdb/mm9/stsMarker/mouseP.fa
     #	41247 sequences
     #	Warning: load of seq did not go as planned: 41330 record(s), 0 row(s)
     #	skipped, 1 warning(s) loading ./seq.tab
 
     ## joinerCheck should be clean:
     joinerCheck -keys -identifier=mouseStsTrueName -database=mm9 all.joiner
 # Checking keys on database mm9
 #  mm9.stsAlias.trueName - hits 146350 of 146359 ok
 #  mm9.all_sts_primer.qName - hits 35537 of 36063 ok
 #  mm9.stsMapMouseNew.name - hits 30894 of 30894 ok
 
     featureBits mm9 all_sts_primer
     #	3795229 bases of 2620346127 (0.145%) in intersection
     featureBits mm8 all_sts_primer
     #	3700897 bases of 2567283971 (0.144%) in intersection
     featureBits mm9 stsMapMouseNew
     #	4884563 bases of 2620346127 (0.186%) in intersection
     featureBits mm8 stsMapMouseNew
     #	4812616 bases of 2567283971 (0.187%) in intersection
 
     hgsql -N mm9 -e "select count(*) from stsAlias;"
     #	146359
     hgsql -N mm8 -e "select count(*) from stsAlias;"
     #	146767
     hgsql -N mm9 -e "select count(*) from stsInfoMouseNew;"
     #	66782
     hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
     #	60631
 
     #	compare old and new name lists, not much difference:
     awk '{print $4}' stsMapMouse.bed | sort -u > mm9.nameList
     #	in common with previous version:
     comm -12 \
 /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
 	mm9.nameList | wc -l
     #	28596
     #	unique to previous version:
     comm -23 \
 /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
 	mm9.nameList | wc -l
     #	111
     #	unique to this new set:
     comm -13 \
 /cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
 	mm9.nameList | wc -l
     #	1017
 
 ###########################################################################
 #	Reset default position to be same area as Mm8, 2007-08-02 - Hiram
     hgsql -e \
 'update dbDb set defaultPos="chr12:57795963-57815592" where name="mm9";' \
 	hgcentraltest
 
 ##############################################################################
 # CLONE ENDS - BACEND TRACK (DONE - 2007-08-02 - 2007-08-03 - Hiram)
     ssh kkstore06
     cd /cluster/data/mm9
     # check disk space: 1.2T free
     df -h .
 # Filesystem            Size  Used Avail Use% Mounted on
 # /export/cluster/store4
 #			2.3T  997G  1.2T  46% /cluster/store4
     mkdir -p bed/cloneend/ncbi
     cd bed/cloneend/ncbi
 
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*
 
     cd /cluster/data/mm9/bed/cloneend
     # seems like the *.mfa files were split just for convenience
     # concatenate, and convert the title line of the fasta sequences
     cat << '_EOF_' > convert.pl
 #!/usr/bin/env perl
 
 use strict;
 use warnings;
 
 # Filter NCBI clone end fasta from the files named on the command line
 # (or stdin) to stdout.  Sequence lines pass through unchanged.  Each
 # '>' title line is split on '|' and replaced by ">accession", where the
 # accession is the field following the first "gb" or "dbj" tag, cut at
 # its first '.' (the version suffix).
 # Dies on a title line that has no such tag, or no accession after it.
 while (my $line = <>) {
     if ($line !~ m/^>/) {
         print $line;
         next;
     }
     my @fields = split('\|', $line);
     my $name;
     # stop one short of the last field: a tag found there has no
     # accession field following it
     for my $i (0 .. $#fields - 1) {
         next unless ($fields[$i] eq "gb" || $fields[$i] eq "dbj");
         ($name) = split(/\./, $fields[$i + 1]);
         last;
     }
     # a missing or blank accession would otherwise write an unnamed
     # '>' record into the output fasta
     die("Failed for $line\n") if (!defined($name) || $name !~ m/\S/);
     print ">$name\n";
 }
 '_EOF_'
     # << happy emacs
     chmod +x convert.pl
     for F in ncbi/*.mfa.gz
     do
 	zcat ${F}
     done | ./convert.pl | gzip > cloneEnds.fa.gz
 
     #	make sure nothing got broken:
     faSize ncbi/*.mfa.gz
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper
 #	176421214 lower) in 789466 sequences in 44 files
 
     faSize cloneEnds.fa.gz
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper
 #	176421214 lower) in 789466 sequences in 1 files
     #	identical numbers, curiously, these are exactly the same numbers
     #	as were seen during the build of Mm7.  Do these things not
     #	change with time ?
 
     # concatenate the text files, too
     for F in ncbi/*.txt.gz
     do
 	zcat ${F}
     done | gzip > all.txt.gz
 
     # generate cloneEndPairs.txt and cloneEndSingles.txt
     zcat all.txt.gz | ~/kent/src/hg/utils/cloneEndParse.pl /dev/stdin
     #	Reading in end info
     #	Writing out pair info
     #	Writing out singleton info
     #	354485 pairs and 78423 singles
 
 
     #	faSplit does not function correctly if given a .gz source file
     #	AND, we need the unzipped file for sequence loading below
     gunzip cloneEnds.fa.gz
     # split
     mkdir split
     cd split
     ## adjust split size based on previous kluster performance, see below
     faSplit sequence ../cloneEnds.fa 500 cloneEnds
     #	Check to ensure no breakage:
     faSize c*.fa
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
 # lower) in 789466 sequences in 98 files
 # %35.41 masked total, %36.65 masked real
     #	same numbers as before
 
     #	Copy to san for cluster runs
     mkdir /san/sanvol1/scratch/mm9/cloneEnds
     rsync -a --progress --stats ./ /san/sanvol1/scratch/mm9/cloneEnds/
     rm *
     cd ..
     rmdir split
     #	may as well remove the previous assembly copy:
     rm -fr /san/sanvol1/scratch/mm8/cloneEnds
 
     # load sequences
     ssh hgwdev
     mkdir /gbdb/mm9/cloneend
     cd /gbdb/mm9/cloneend
     ln -s /cluster/data/mm9/bed/cloneend/cloneEnds.fa .
     cd /tmp
     hgLoadSeq mm9 /gbdb/mm9/cloneend/cloneEnds.fa
     #  Advisory lock created
     # Creating .tab file
     # Adding /gbdb/mm9/cloneend/cloneEnds.fa
     # 789466 sequences
     # Updating seq table
     # Advisory lock has been released
     # All done
     ## clean up garbage
     rm seq.tab
 
 ############################################################################
 # BACEND SEQUENCE ALIGNMENTS (DONE - 2007-08-06 - Hiram)
     ssh kkstore06
     mkdir /cluster/data/mm9/noMask
     cd /cluster/data/mm9/
     #	Need an unmasked sequence for this work
     for C in `cut -f1 chrom.sizes`
 do
     echo twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa
     twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa
 done
     # verify nothing broken
     faSize noMask/c*.fa
 # 2725765481 bases (105419509 N's 2620345972 real 2620345972 upper 0 lower) in
 # 35 sequences in 35 files
     # note, this was the same as long ago when the mm9.2bit was measured:
 # 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
 #	1153701322 lower) in 35 sequences in 1 files
 
     # copy to san for kluster run
     mkdir /san/sanvol1/scratch/mm9/noMask
     rsync -a --progress --stats noMask/ /san/sanvol1/scratch/mm9/noMask/
 
     # 11.ooc file is already there from the genbank build
     
     #	and now for the kluster run
     ssh pk
     mkdir /cluster/data/mm9/bed/bacends
     cd /cluster/data/mm9/bed/bacends
     mkdir out
 
     # Write the per-job cluster script runBlat (csh).  Arguments:
     #	$1 = root name of a target file in noMask/ (${root1}.fa)
     #	$2 = root name of a query file in cloneEnds/ (${root2}.fa)
     #	$3 = path of the psl result file to produce
     # The script lets blat run politely in a private work directory,
     # /scratch/tmp/${root1}_${root2}, with local copies of 11.ooc and
     # both fasta files, then copies the psl output to the result path
     # and removes the work directory:
     cat << '_EOF_' > runBlat
 #!/bin/csh -fe 
 set root1 = $1
 set root2 = $2
 set result = $3
 rm -fr /scratch/tmp/${root1}_${root2}
 mkdir /scratch/tmp/${root1}_${root2}
 cp -p /san/sanvol1/scratch/mm9/11.ooc /scratch/tmp/${root1}_${root2}
 cp -p /san/sanvol1/scratch/mm9/noMask/${root1}.fa \
 	/scratch/tmp/${root1}_${root2}
 cp -p /san/sanvol1/scratch/mm9/cloneEnds/${root2}.fa \
 	/scratch/tmp/${root1}_${root2}
 pushd /scratch/tmp/${root1}_${root2}
 /cluster/bin/x86_64/blat ${root1}.fa ${root2}.fa \
 	-ooc=11.ooc ${root1}.${root2}.psl
 popd
 mkdir -p out/${root2}
 rm -f ${result}
 cp -p /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
 rm -fr /scratch/tmp/${root1}_${root2}
 '_EOF_'
     #	<< happy emacs
     chmod +x runBlat
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << emacs happy
 
     ls -1S /san/sanvol1/scratch/mm9/cloneEnds/cloneEnds*.fa > bacEnds.lst
     ls -1S /san/sanvol1/scratch/mm9/noMask/chr*.fa > chrom.lst
     gensub2 chrom.lst bacEnds.lst template jobList
     para create jobList
     # 17150 jobs written to batch
     para try, check, push, etc ...
 # Completed: 17150 of 17150 jobs
 # CPU time in finished jobs:     698826s   11647.09m   194.12h    8.09d  0.022 y
 # IO & Wait Time:                262556s    4375.94m    72.93h    3.04d  0.008 y
 # Average job time:                  56s       0.93m     0.02h    0.00d
 # Longest finished job:             332s       5.53m     0.09h    0.00d
 # Submission to last job:        250536s    4175.60m    69.59h    2.90d
 
     ssh kkstore06
     cd /cluster/data/mm9/bed/bacends
     screen
 
     mkdir temp
     time nice -n +19 pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
     #	real    22m4.019s
     #	-rw-rw-r--    1 8423154460 Aug  6 13:40 raw.psl
 
     time nice -n +19 pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
 	-noIntrons raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
     #	real    6m1.174s
     #	-rw-rw-r--    1 1236810588 Aug  6 13:51 bacEnds.psl
 
     #	split this large psl file into pieces with 100,000 lines each
     #	to prepare for a sort
     time nice -n +19 ~/kent/src/hg/pslSplitOnTarget/pslSplitLineCount.pl \
 	100000 bacEnds.psl split/bacends
     #	real    0m15.389s
 
     #	save original file, then sort
     mv bacEnds.psl bacEnds.psl.save
     time pslSort dirs bacEnds.psl temp split
     #	real    2m19.131s
     #	-rw-rw-r--    1 1236810588 Aug  6 14:38 bacEnds.psl
 
     ## compare to previous results
     wc -l bacEnds.psl /cluster/data/mm8/bed/bacends/bacEnds.psl
     #	10294737 bacEnds.psl
     #	10229750 /cluster/data/mm8/bed/bacends/bacEnds.psl
 
     ## work at top-level directory after this
     mkdir /cluster/data/mm9/bacends
     cp -p bacEnds.psl /cluster/data/mm9/bacends
 
 ############################################################################
 # BACEND PAIRS TRACK (DONE - 2007-08-06 - Hiram)
 
     ssh kolossus
     cd /cluster/data/mm9/bacends
 
     time nice -n +19 pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
 	-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 	-mismatch -verbose bacEnds.psl \
 	../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
     #	real    0m49.120s
     ## produces files:
     #	-rw-rw-r--  1     199185 Aug  6 14:46 bacEnds.slop
     #	-rw-rw-r--  1     144486 Aug  6 14:46 bacEnds.short
     #	-rw-rw-r--  1   24399410 Aug  6 14:46 bacEnds.pairs
     #	-rw-rw-r--  1   25421100 Aug  6 14:46 bacEnds.orphan
     #	-rw-rw-r--  1     201794 Aug  6 14:46 bacEnds.mismatch
     #	-rw-rw-r--  1      15928 Aug  6 14:46 bacEnds.long
 
     # create header required by "rdb" tools
     echo -e \
 "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
     echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header
 
     cat header bacEnds.pairs | \
 	/cluster/bin/scripts/row score ge 300 | \
 	/cluster/bin/scripts/sorttbl chr start | \
 	/cluster/bin/scripts/headchg -del > bacEndPairs.bed
     #	-rw-rw-r--  1   24201067 Aug  6 14:49 bacEndPairs.bed
 
     cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
 	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
 	/cluster/bin/scripts/sorttbl chr start | \
 	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
     #	-rw-rw-r--  1    6888559 Aug  6 14:49 bacEndPairsBad.bed
 
     /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
 	bacEndPairsBad.bed >j1.out
     #	-rw-rw-r--  1  989173324 Aug  6 14:52 j1.out
     cat j1.out | /cluster/bin/scripts/sorttbl tname tstart >j2.out
     #	-rw-rw-r--  1  989173324 Aug  6 15:07 j2.out
     cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
     #	-rw-rw-r--  1  989173165 Aug  6 15:08 bacEnds.load.psl
 
     rm j1.out j2.out
 
     #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
     awk '{print $5}' bacEndPairs.bed | sort -u
     #	result should be the scores, no extraneous strings:
 #	1000
 #	300
 #	375
 #	500
 #	750
     #	edit the file and fix it if it has a bad name.
     wc -l bacEnds.load.psl /cluster/data/mm8/bacends/bacEnds.load.psl
     #	8167555 bacEnds.load.psl
     #	8132116 /cluster/data/mm8/bacends/bacEnds.load.psl
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm9/bacends
     hgLoadBed -notItemRgb mm9 bacEndPairs bacEndPairs.bed \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     #	Loaded 239101 elements of size 11
 
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed -notItemRgb mm9 bacEndPairsBad bacEndPairsBad.bed \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     #	Loaded 84679 elements of size 11
 
     # NOTE: truncates file to 0 if -nobin is used
     time hgLoadPsl mm9 -table=all_bacends bacEnds.load.psl
 # load of all_bacends did not go as planned: 8167555 record(s), 0 row(s)
 # skipped, 2 warning(s) loading psl.tab
 #	real    4m1.142s
     ## to find out what the warnings are about:
     ## first, on hgwdev, dump the loaded table
     hgsql -N -e "select qName from all_bacends;" mm9 \
 	| sort -u > all_bacends.qName.txt
     ## then on kkstore06 compare the resulting load with the requested load file
     diff psl.tab mm9.all_bacends.txt
     ## this diff shows two markers had their qBaseInsert count changed from
     ##	a negative number to a zero since that field is unsigned
     ## AG326808 and AG609381
 
     ## joinerCheck should be clean:
     joinerCheck -keys -identifier=bacEndNames -database=mm9 all.joiner
 # Checking keys on database mm9
 #  mm9.bacEndPairs.lfNames - hits 478202 of 478202 ok
 
     featureBits mm9 all_bacends
 # 349085662 bases of 2620346127 (13.322%) in intersection
     featureBits mm8 all_bacends
 # 327086559 bases of 2567283971 (12.741%) in intersection
     featureBits mm7 all_bacends
 # 334161740 bases of 2583394090 (12.935%) in intersection
     featureBits mm6 all_bacends
 # 336981828 bases of 2597150411 (12.975%) in intersection
     featureBits mm5 all_bacends
 # 268502414 bases of 2615483787 (10.266%) in intersection
     featureBits mm4 all_bacends
 # 243096171 bases of 2627444668 (9.252%) in intersection
 
     featureBits mm9 bacEndPairs
 # 209909804 bases of 2620346127 (8.011%) in intersection
     featureBits mm8 bacEndPairs
 # 2572527283 bases of 2567283971 (100.204%) in intersection
     featureBits mm7 bacEndPairs
 # 2578837424 bases of 2583394090 (99.824%) in intersection
     featureBits mm6 bacEndPairs
 # 2570768812 bases of 2597150411 (98.984%) in intersection
     featureBits mm5 bacEndPairs
 # 2567958504 bases of 2615483787 (98.183%) in intersection
     featureBits mm4 bacEndPairs
 # 2549945356 bases of 2627444668 (97.050%) in intersection
 
     featureBits mm9 bacEndPairsBad
 # 48850302 bases of 2620346127 (1.864%) in intersection
 
 #######################################################################
 #  Special one-off bacEnds added (DONE - 2008-01-09 - Hiram)
     ssh hgwdev
     # BAC RP23-473N24 was reported missing
     #	its two ends are AZ095043 and AZ095046
     #	end AZ095046 maps just fine to the correct location on chr7
     #	the end AZ095043 does not map correctly when using the -ooc
     #	option to blat.  Run the blat without ooc and it does the
     #	correct thing.  From the genbank record:
     cd /cluster/data/mm9/bed/bacends
     cat << '_EOF_' > AZ095043.fa
 >AZ095043
 TTTATCATGAATGGGTGTTGTATCTTGTCGAAGCTTTTTCCGCATCTAACGAGATGATCATGTGGTTTTT
 GTCTTTGAGTTTGTTTATATAATGGATTACATTGATGGATTTTCATATATTAAACCATCCCTGCATCCCT
 GGAATAAAACCTACTTGGTCAGGATGGATGACTGCCAAGGCGGACCGGG
 '_EOF_'
     blat /san/sanvol1/scratch/mm9/noMask/chr7.fa AZ095043.fa AZ095043.raw.psl
     pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
 	-noIntrons AZ095043.raw.psl AZ095043.psl /dev/null
     #	before adding this one item:
     hgsql -e "select count(*) from all_bacends;" mm9
     #	8167555
     hgLoadPsl -table=all_bacends -append mm9 AZ095043.psl
     #	verify one row added
     hgsql -e "select count(*) from all_bacends;" mm9
     #	8167556
     #	Using the Mm6 records from all_bacends and bacEndPairs as a guide
     #	The bed record for this BAC is therefore:
     cat << '_EOF_' > RP23-473N24.bed
 chr7 150015932 150193247 RP23-473N24 1000 - all_bacends 2 150015932,150192880 172,367 AZ095043,AZ095046
 '_EOF_'
     #	verify rows before adding this one new row
     hgsql -e "select count(*) from bacEndPairs;" mm9
     #	239101
     #	YOW !  The -oldTable option didn't work !  I'm guessing that with
     #	the -sqlTable argument it became confused
     hgLoadBed -oldTable -notItemRgb mm9 bacEndPairs RP23-473N24.bed \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     #	reload everything:
     cat ../../bacends/bacEndPairs.bed RP23-473N24.bed \
 	| hgLoadBed -notItemRgb mm9 bacEndPairs stdin \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
 
 #######################################################################
 ## create random contigs for genscan and other alignment tasks
 ## DONE - 2007-08-07 - Hiram
     ssh kkstore06
     mkdir randomContigs
     for L in ?/lift/random.lft ??/lift/random.lft
 do
     D=${L/\/lift*}
     echo $L $D
     ~/kent/src/hg/utils/lft2BitToFa.pl mm9.2bit ${L} \
 	> randomContigs/chr${D}_random.ctg.fa
 done
     #
     #	Verify these *.ctg.fa files have the same bases as the ordinary
     #	chr*_random.fa files:
     ## don't have these fasta files yet, extract them from the 2bit
     grep random chrom.sizes | cut -f1 | sed -e "s/^chr//; s/_random//" \
 	| while read C
 do
     echo "twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa"
     twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa
 done
     ## now we can measure them
     faSize ?/chr?_random.fa ??/chr??_random.fa
     #	70853964 bases (9033771 N's 61820193 real 26427973 upper
     #	35392220 lower) in 13 sequences in 13 files
 
     ## and our contig versions
     faSize randomContigs/*.ctg.fa
     #	62053964 bases (233771 N's 61820193 real 26427973 upper
     #	35392220 lower) in 189 sequences in 13 files
     ## note, same number of real, upper and lower, only different N's
 
     ## it would be nice to have the actual chroms too
     grep -v random chrom.sizes | cut -f1 | sed -e "s/^chr//" \
 	| while read C
 do
     echo "twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa"
     twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa
 done
     # measure that result
     faSize ?/chr?.fa ??/chr??.fa
     #	2654911517 bases (96385738 N's 2558525779 real 1438609919
     #	upper 1119915860 lower) in 22 sequences in 22 files
     ## is this the amount of sequence specified in chrom.sizes ?
     grep -v random chrom.sizes | ave -col=2 stdin | grep total
     #	total 2654911517.000000
     ## same number, nothing lost
 
 #########################################################################
 # GENSCAN PREDICTIONS (DONE - 2007-08-07 - 2007-08-10 - Hiram)
     ssh kkstore06
     #	Create a 2bit file with the full chrom sequences and the
     #	random contigs, all hard masked
     ## later it was found that chr16_random.ctg.fa should not be in
     ##	this genscan run.  So, it was temporarily taken out of this directory
     ## and this sequence was rerun to avoid it.
     cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
 	| maskOutFa stdin hard stdout \
 	    | faToTwoBit stdin mm9Chroms_RandomContigs.hard.2bit
     #  with chr16_random removed:
     #	2716961487 bases (1251923595 N's 1465037892 real 1465037892 upper 0
     #	lower) in 210 sequences in 1 files
 
     #	make sure it still has all the unmasked sequence in it: (incl 16)
     twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
 	| faSize stdin
     # 2716965481 bases (1251927589 N's 1465037892 real 1465037892 upper
     #	0 lower) in 211 sequences in 1 files
     twoBitToFa mm9.2bit stdout | faSize stdin
     # 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
     # 1155308080 lower) in 35 sequences in 1 files
     #	note the upper bases are the same, the lowers have become N's
     #	lower 1155308080 + upper 1465037892 = 2620345972 real
     #	N's 1251927589 - N's 105419509  = 1146508080 ==
     #		N's in gaps between contigs
 
     #	And, make sure there aren't any sequences in this lot that have
     #	become all N's with no sequence left in them.  This drives genscan nuts
     twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
 	| faCount stdin > chroms_randoms.faCount
     #	the lowest three are:
     egrep -v "^#|^total" chroms_randoms.faCount \
 	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
     #	NT_166474.1 75
     #	NT_166461.1 66
     #	NT_166481.1 39
     #	NT_166325.1 0
     ## This last one is the entire chr16_random and it is only 3,994 bases
     ## long and is marked entirely by RepeatMasker as a line.  It needs
     ## to be skipped during the run of genscan.  Go back to the 2bit creation
     ## and do not include chr16_random
 
     #	creating 4,000,000 sized chunks, the chroms stay together as
     #	single pieces.  The contigs get grouped together into 4,000,000
     #	sized fasta files.  You don't want to break these things up
     #	because genscan will be doing its own internal 2.4 million
     #	window on these pieces, and the gene names are going to be
     #	constructed from the sequence name in these fasta files.  The
     #	gene names are much better when they are this simple chrN.M
     #	numbering scheme, or in the case of a contig: contig_name.M
     #	where the M is a sequence number that genscan will assign to
     #	each gene it discovers.
     mkdir hardChunks
     twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
 	| faSplit about stdin 4000000 hardChunks/c_
     ssh kkr1u00
     mkdir /iscratch/i/mus/mm9/hardChunks
     cd /iscratch/i/mus/mm9/hardChunks
     rsync -a --progress /cluster/data/mm9/hardChunks/ .
     for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/hardChunks/
 done
 
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/genscan
     cd /cluster/data/mm9/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
 
     # Run on small cluster (more mem than big cluster).
     ssh kki
     cd /cluster/data/mm9/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     #	Since we split on gaps, we have no chunks like that.  You can
     #	verify with faCount on the chunks.
     ls -1Sr /iscratch/i/mus/mm9/hardChunks/c_*.fa > genome.list
 
     ## for next time, this isn't a parasol safe method of operation.
     ## if genscan is writing answers to gtf/ pep/ and subopt/ during
     ##	its operation and it fails, parasol wouldn't be able to verify that
     ##	it was complete merely by file existence check.  This should work
     ##	in scratch/tmp entirely, then copy results back after it is done.
     # Create template file, for gensub2.  For example (3-line file):
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 genome.list single template jobList
     para create jobList
     para try, check, push, check, ...
 # Completed: 35 of 36 jobs
 # CPU time in finished jobs:     279581s    4659.68m    77.66h    3.24d  0.009 y
 # IO & Wait Time:                  3390s      56.50m     0.94h    0.04d  0.000 y
 # Average job time:                8085s     134.75m     2.25h    0.09d
 # Longest finished job:           32422s     540.37m     9.01h    0.38d
 # Submission to last job:        122301s    2038.35m    33.97h    1.42d
 
     #	There was a failed job, going to kolossus and running it again,
     #	it takes a very long time, and fails with this cryptic error:
     #	No overlap between a and b in mergeTwo
     ssh kolossus
     cd /cluster/data/mm9/bed/genscan
     time /cluster/bin/x86_64/gsBig /iscratch/i/mus/mm9/hardChunks/c_06.fa \
         gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
         -exe=hg3rdParty/genscanlinux/genscan \
         -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
         -window=2400000
     #	real    922m2.382s
     #	run it with a reduced window size to see if it will complete
     time nice -n +19 /cluster/bin/x86_64/gsBig \
 	/iscratch/i/mus/mm9/hardChunks/c_06.fa \
         gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
         -exe=hg3rdParty/genscanlinux/genscan \
         -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
         -window=2000000
     #	real    648m24.682s
     ## that one failed too, with an error:
 # /scratch/tmp/temp_gsBig_10943_chr7_38.genscan is not a GENSCAN output file
     ## and the contents of that file said:
 # Insufficient memory error: results may be unreliable.
 # Try running program an a portion of sequence.
     #	Let's try splitting up this chr7 on the gaps, which there are plenty
     #	of in this hard masked sequence.  Ended up breaking the chr7 sequence
     #	with the non bridged lift file.  See the lft2BitToFa.pl file in
     #	the chr7_split directory.
     #	on kkstore06
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/genscan/chr7_split
     cd /cluster/data/mm9/bed/genscan/chr7_split
     ./lft2BitToFa.pl ../../../mm9.2bit *.lft > chr7.contigs.hard.fa
 
     mkdir /cluster/data/mm9/bed/genscan/chr7_run
     cd /cluster/data/mm9/bed/genscan/chr7_run
     mkdir split
     faSplit sequence ../chr7_split/chr7.contigs.hard.fa 100 split/chr7_
 
     ## Now, on the small kluster
     ssh kki
     cd /cluster/data/mm9/bed/genscan/chr7_run
     mkdir gtf pep subopt
 
     # Create template file, for gensub2.  For example (3-line file):
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../hg3rdParty/genscanlinux/genscan -par=../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     ls -1S split/chr7_*.fa > chr7.list
     gensub2 chr7.list single template jobList
     para create jobList
     para try ... check ... push ... etc...
 # Completed: 15 of 15 jobs
 # CPU time in finished jobs:       4226s      70.43m     1.17h    0.05d  0.000 y
 # IO & Wait Time:                   215s       3.59m     0.06h    0.00d  0.000 y
 # Average job time:                 296s       4.93m     0.08h    0.00d
 # Longest finished job:             861s      14.35m     0.24h    0.01d
 # Submission to last job:           861s      14.35m     0.24h    0.01d
     # lift these chr7 results into a single file,
     #	fixup the gene names with the sed to remove the lift name effect
     ssh kkstore06
     cd /cluster/data/mm9/bed/genscan/chr7_run
     cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout \
 	../chr7_split/nonBridgedChr7.lft error stdin \
 	| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.gtf
     cat subopt/chr7_*.bed | liftUp -type=.bed stdout \
 	../chr7_split/nonBridgedChr7.lft error stdin \
 	| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.subopt.bed
     cat pep/chr7_*.pep | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.pep
     ## these results become the c_06 results in the main run
     cp -p chr7.pep ../pep/c_06.pep
     cp -p chr7.subopt.bed ../subopt/c_06.bed
     cp -p chr7.gtf ../gtf/c_06.gtf
 
     ## after the chr7 business above, back to the mainline processing
     # cat and lift the results into single files
     ssh kkstore06
     cd /cluster/data/mm9/bed/genscan
     cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
 	../../jkStuff/mm9.contigs.lift carry stdin
     cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
 	../../jkStuff/mm9.contigs.lift carry stdin
     cat pep/c_*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/mm9/bed/genscan
     ldHgGene mm9 -gtf genscan genscan.gtf
     #	Read 45189 transcripts in 324075 lines in 1 files
     #	45189 groups 34 seqs 1 sources 1 feature types
     #	45189 gene predictions
 
     hgPepPred mm9 generic genscanPep genscan.pep
     hgLoadBed mm9 genscanSubopt genscanSubopt.bed
     #	Loaded 525904 elements of size 6
 
     #	check the numbers
     time nice -n +19 featureBits mm9 genscan
     #	55293837 bases of 2620346127 (2.110%) in intersection
     time nice -n +19 featureBits mm8 genscan
     #	54455852 bases of 2567283971 (2.121%) in intersection
     time nice -n +19 featureBits mm8 knownGene:cds
     #	28459053 bases of 2567283971 (1.109%) in intersection
     featureBits mm7 genscan
     #	54864694 bases of 2583394090 (2.124%) in intersection
     time nice -n +19 featureBits mm7 knownGene:cds
     #	27531524 bases of 2583394090 (1.066%) in intersection
 
     featureBits mm9 genscanSubopt
     #	57044145 bases of 2620346127 (2.177%) in intersection
     featureBits mm8 genscanSubopt
     #	57048581 bases of 2567283971 (2.222%) in intersection
     featureBits mm7 genscanSubopt
     #	57512333 bases of 2583394090 (2.226%) in intersection
     featureBits mm6 genscanSubopt
     #	57856316 bases of 2597150411 (2.228%) in intersection
     featureBits mm5 genscanSubopt
     #	58474899 bases of 2615483787 (2.236%) in intersection
     featureBits mm4 genscanSubopt
     #	59601009 bases of 2627444668 (2.268%) in intersection
     featureBits mm3 genscanSubopt
     #	56085184 bases of 2505900260 (2.238%) in intersection
 
 #############################################################################
 # BLASTZ SELF (DONE - 2007-08-07 - 2007-08-31 - Hiram)
 #	using chain min score of 10,000 to cut down on volume of data
 #  trying a two pass sequence, chroms with chroms, then randoms to chroms
 #  swap the randoms, then combine the three results into a final set
     ssh kkstore06
     cd /cluster/data/mm9
     time nice -n +19 faToTwoBit ?/chr?.fa ??/chr??.fa mm9.chroms.2bit
     time nice -n +19 faToTwoBit randomContigs/chr*.ctg.fa mm9.randomContigs.2bit
 
     ssh kkr1u00
     cd /iscratch/i/mus/mm9
     cp -p /cluster/data/mm9/mm9.chroms.2bit .
     cp -p /cluster/data/mm9/mm9.randomContigs.2bit .
     twoBitInfo mm9.chroms.2bit mm9.chroms.sizes
     twoBitInfo mm9.randomContigs.2bit mm9.randomContgs.sizes
     for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
 done
 
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
 
     #	DEF file for the chroms vs. chroms self alignment.  The SEQ1_*
     #	settings describe the target and the SEQ2_* settings the query;
     #	both point at the same chroms-only 2bit and sizes file.  The
     #	query sizes must be given as SEQ2_LEN (a second SEQ1_LEN here
     #	would leave the query length file undefined).
     cat << '_EOF_' > DEF
 # mouse vs mouse
 BLASTZ_H=2000
 BLASTZ_M=200
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
 SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse Mm9
 SEQ2_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
 SEQ2_LEN=/cluster/data/mm9/mm9.chroms.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     ## run this in a screen on kkstore06
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
 	-stop=load `pwd`/DEF > blastz.out 2>&1 &
     #	This was a tricky one to complete.  A situation was fixed in the
     #	blastz-run-ucsc script which may have helped, but then there were
     #	32 jobs that would only complete on the kki kluster.  The kk nodes
     #	complained about running out of memory.  After a completed run was
     #	finished, and verified:
     ssh kkstore06
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/psl
     find . -type f | wc -l
     #	77284
     wc -l ../run.blastz/jobList
     #	wc -l ../run.blastz.jobList
     #	finished the rest by continuing at the 'cat' step:
     time doBlastzChainNet.pl -verbose=2 \
 	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
 	-continue=cat -stop=load `pwd`/DEF > cat.out 2>&1 &
     #	real    285m33.094s
     #  failed during the load because of the SEQ?_LEN specification pointing
     #	to /iscratch/i which is not available on hgwdev.  So, only use
     #	the primary /cluster/data/mm9/chrom.sizes for the DEF file in the future
     #  ran the load step manually to complete with the loadUp.csh fixed.
 
     ssh kolossus
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
     time nice -n +19 featureBits mm9 chainSelfLink \
 	> fb.mm9.chainSelfLink.noRandoms.txt 2>&1
     #	real    24m54.883s
     cat fb.mm9.chainSelfLink.noRandoms.txt
     #	323062218 bases of 2620346127 (12.329%) in intersection
 
     cd /cluster/data/mm9/bed
     ln -s blastzSelf.2007-08-07 blastz.mm9
     ## prepare 2bit file of only the randoms
     ssh kkstore06
     cd /cluster/data/mm9
     faToTwoBit ?/chr?_random.fa ??/chr??_random.fa mm9.randoms.2bit
     # and the sizes files
     twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes
     twoBitInfo mm9.randoms.2bit mm9.randoms.sizes
     # a cluster run for just these bits of sequence
     mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
     cat << '_EOF_' > DEF
 # mouse vs mouse randoms
 PATH=/cluster/bin/penn/x86_64:/cluster/bin/penn:/cluster/bin/scripts:/cluster/bin/x86_64:/bin:/usr/bin
 
 BLASTZ_H=2000
 BLASTZ_M=200
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
 SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse Mm9 randoms only
 SEQ2_DIR=/cluster/data/mm9/mm9.randoms.2bit
 SEQ2_LEN=/cluster/data/mm9/mm9.randoms.sizes
 SEQ2_CTGDIR=/cluster/data/mm9/mm9.randomContigs.2bit
 SEQ2_CTGLEN=/cluster/data/mm9/mm9.randomContigs.sizes
 SEQ2_LIFT=/cluster/data/mm9/jkStuff/mm9.contigs.lift
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=20
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 -ignoreSelf \
 	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
 	-stop=net `pwd`/DEF > blastz.out 2>&1 &
     #	now swap the primary chroms back to the randoms
     mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
     chainSwap ../randomsOnly/axtChain/mm9.mm9.all.chain.gz stdout \
         | nice chainSort stdin stdout | nice gzip -c \
         > mm9.mm9.all.chain.gz
 
     #	And then combine all three sets together
     mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
     cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
     chainSplit chain ../axtChain/mm9.mm9.all.chain.gz \
         ../randomsOnly/axtChain/mm9.mm9.all.chain.gz \
         ../randomsSwap/mm9.mm9.all.chain.gz
     #	get them sorted by score correctly
     mkdir chainSort
 for F in `(cd chain; ls *.chain)`
 do
     echo $F
     chainSort chain/$F chainSort/$F
 done
     #	re-number the chains consistently
     chainMergeSort chainSort/*.chain | nice gzip -c > mm9.mm9.all.chain.gz
     rm -fr chain
     mv chainSort chain
     #  and for loading, split this consistently numbered set
     rm -fr chain
     time nice -n +19 chainSplit chain mm9.mm9.all.chain.gz
     #	real    5m0.666s
     ## using a manually fixed up netChains.csh script:
     time nice -n +19 ./netChains.csh > netChains.out 2>&1
     #	real    147m53.147s
     ssh hgwdev
     ## using a manually fixed up loadUp.csh script:
     #		(from ../axtChain/loadUp.csh)
     time nice -n +19 ./loadUp.csh > loadUp.out 2>&1 &
     #	real    99m17.895s
     time nice -n +19 featureBits mm9 chainSelfLink > fb.mm9.chainSelfLink 2>&1
     #	real    30m3.402s
     #	378849408 bases of 2620346127 (14.458%) in intersection
     cat /cluster/data/mm8/bed/blastzSelf.2006-03-20/fb.mm8.chainSelfLink
     #	362483673 bases of 2567283971 (14.119%) in intersection
 
     # finish off the nets
     time nice -n +19 netClass -verbose=0 -noAr noClass.net mm9 mm9 mm9.mm9.net
     #	real    1m9.538s
     # load nets (not needed for the RR, but useful on genome-test)
     time nice -n +19 netFilter -minGap=10 mm9.mm9.net \
 	| hgLoadNet -verbose=0 mm9 netSelf stdin
     #	real    0m40.709s
 
     ## We don't deliver this track to the RR, so downloads are not necessary
 
 #############################################################################
 # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2007-08-07 - Hiram)
     ssh kkr1u00
     mkdir /iscratch/i/mus/mm9/rmsk
     cd /cluster/data/mm9
     cp -p */chr*.fa.out /iscratch/i/mus/mm9/rmsk
     cd /iscratch/i/mus/mm9
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm9/ kkr${R}u00:/iscratch/i/mm9/
     done
     cd rmsk
 
     ssh kki
     mkdir /cluster/data/mm9/linSpecRep
     cd /cluster/data/mm9/linSpecRep
     ls -1S /iscratch/i/mus/mm9/rmsk > fa.list
     
     # Write the per-job cluster script mkLSR (csh).  $1 is the name of
     # a file in /iscratch/i/mus/mm9/rmsk (presumably one of the
     # chr*.fa.out files listed in fa.list -- confirm).  The script
     # removes any stale DateRepeats result for $1, runs DateRepeats
     # there with mouse as the query compared against human, rat, dog
     # and cow, copies the
     # $1_homo-sapiens_rattus_canis-familiaris_bos-taurus result into
     # the current directory, and deletes the copy left in rmsk.
     cat << '_EOF_' > mkLSR
 #!/bin/csh -fe
 pushd /iscratch/i/mus/mm9/rmsk
 rm -f $1_homo-sapiens_rattus_canis-familiaris_bos-taurus
 /cluster/bluearc/RepeatMasker070517/DateRepeats \
     $1 -query mouse -comp human -comp rat -comp dog -comp cow
 popd
 /bin/cp -p \
   /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus .
 rm -f /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus
 '_EOF_'
     #	<< happy emacs
     chmod +x mkLSR
 
     cat << '_EOF_' > template
 #LOOP
 ./mkLSR $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 fa.list single template jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:       1498s      24.96m     0.42h    0.02d  0.000 y
 # IO & Wait Time:                   193s       3.22m     0.05h    0.00d  0.000 y
 # Average job time:                  48s       0.81m     0.01h    0.00d
 # Longest finished job:             102s       1.70m     0.03h    0.00d
 # Submission to last job:          3399s      56.65m     0.94h    0.04d
 
     ssh kkstore06
     cd /cluster/data/mm9/linSpecRep
     mkdir notInHuman notInRat notInDog notInCow notInRabbit
     # Split each DateRepeats result into one .out.spec file per comparison
     # species, written to the matching notIn* directory.
     # NOTE(review): the numeric argument to extractRepeats presumably selects
     # the Nth -comp species in the DateRepeats order (human, rat, dog, cow),
     # as the output directory names suggest -- confirm against the script.
     for F in chr*.out_homo-sapiens*
     do
 	# drop everything from ".fa.out" onward, leaving the chrom name
 	B=${F/.fa.out*/}
 	echo $B 
         /cluster/bin/scripts/extractRepeats 1 ${F} > \
 		notInHuman/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 2 ${F} > \
 		notInRat/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 3 ${F} > \
 		notInDog/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 4 ${F} > \
 		notInCow/${B}.out.spec
     done
 
     #	the notInHuman, notInDog, and notInCow ended up being
     #	identical.  Only the notInRat was different than them
     #	To check identical
     #	Print "<sum -r output> <path>" for every .out.spec file, order by
     #	checksum, then re-sort on the third "/"-separated field (the file
     #	name) so the copies of each chrom file from the different notIn*
     #	directories are listed next to each other for comparison.
     find . -name "*.out.spec" | \
 	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
 	| sort -k1,1n | sort -t"/" -k3,3
     #	Copy to iscratch for use in kluster runs
     ssh kkr1u00
     mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInRat
     mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInOthers
     cd /iscratch/i/mus/mm9/linSpecRep/notInRat
     cp -p /cluster/data/mm9/linSpecRep/notInRat/* .
     cd /iscratch/i/mus/mm9/linSpecRep/notInOthers
     cp -p /cluster/data/mm9/linSpecRep/notInHuman/* .
     #	copy this directory to the other Iservers
     cd /iscratch/i/mus/mm9
     for R in 2 3 4 5 6 7 8
 do
     rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
 done
 
     #	and we can do the Iservers simply:
     ssh kkr1u00
     cd /iscratch/i/mm9
     #	no longer need these two directories
     rm -fr fa rmsk
     rsync -a --progress /cluster/bluearc/scratch/hg/mm9/ .
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm9/ kkr${R}u00:/iscratch/i/mm9/
     done
 
     # We also need the nibs for blastz runs with lineage specific repeats
     ssh kkstore06
     mkdir /cluster/data/mm9/nib
     cd /cluster/data/mm9
     # Convert every chrom fasta in the one- and two-character chrom
     # directories to a soft-masked nib file in nib/.
     for FA in ?/chr*.fa ??/chr*.fa
 do
     # strip the leading directory, then the .fa suffix, to get the chrom name
     F=${FA/*\//}
     F=${F/.fa/}
     # show the command in the output before running it
     echo faToNib -softMask ${FA} nib/${F}.nib
     faToNib -softMask ${FA} nib/${F}.nib
 done
     #  copied to /cluster/bluearc/scratch/data/mm9/nib/
     #  and everything else we will need for kluster runs into
     #	/cluster/bluearc/scratch/data/mm9/
     # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes
 
 #########################################################################
 # BLASTZ RAT Rn4 (DONE - 2007-08-09 - 2007-08-15 - Hiram)
 #  re-run a second time with tighter parameters, see below for second run
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-09
     cd /cluster/data/mm9/bed/blastzRn4.2007-08-09
     #	Started this before the rsync to /scratch/data/mm9/ had completed,
     #	hence the /cluster/bluearc/scratch/data/mm9/ location is used
     #	here.
 
     cat << '_EOF_' > DEF
 # mouse vs rat
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib
 SEQ1_SMSK=/cluster/bluearc/scratch/data/mm9/notInRat
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/iscratch/i/rn4/nib
 SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
 SEQ2_LEN=/cluster/data/rn4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-09
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     ## had to fix the blast-run-ucsc script to get these to complete.
     # the chr16_random sequence was causing problems because it has no usable
     # sequence in it for blastz to work with.  And finally, two jobs needed to
     # be run manually on kolossus, don't know what happened with them,
     # although their output was immense:
 # -rw-rw-r--  1 15054644 Aug 14 10:22 chr2.nib:chr2:80000000-90010000_chr7.nib:chr7:0-10000000.psl
 # -rw-rw-r--  1 18992595 Aug 14 11:02 chr2.nib:chr2:80000000-90010000_chr3.nib:chr3:70000000-80000000.psl
     #	I suspect there is something going on with large results and running on
     #	the kk nodes.  I'm getting the same trouble with the self blastz.
 
     #  then, continuing with the cat
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     # real    239m51.356s
     cat fb.mm9.chainRn4Link.txt
     #	1791195056 bases of 2620346127 (68.357%) in intersection
     cat /cluster/data/mm8/bed/blastz.rn4/fb.mm8.chainRn4Link
     #	1770319811 bases of 2567283971 (68.957%) in intersection
     cd /cluster/data/mm9/bed
     ln -s blastzRn4.2007-08-09 blastz.rn4
 
     mkdir /cluster/data/rn4/bed/blastz.mm9.swap
     cd /cluster/data/rn4/bed/blastz.mm9.swap
     time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap /cluster/data/mm9/bed/blastzRn4.2007-08-09/DEF > swap.out 2>&1 &
     #	real    209m11.032s
 
     cd /cluster/data/rn4/bed
     ln -s blastz.mm9.swap blastz.mm9
     cat /cluster/data/rn4/bed/blastz.mm9/fb.rn4.chainMm9Link.txt
     #	1788261968 bases of 2571531505 (69.541%) in intersection
     cat  /cluster/data/rn4/bed/blastz.mm8/fb.rn4.chainMm8Link.txt
     #	1791093685 bases of 2571531505 (69.651%) in intersection
 
 #########################################################################
 ## multiple alignment preparation stats
 #	The following table will keep track of the pairwise alignments
 #	completed.  (The % NN.Nxx mean not done yet)
 #                         featureBits chainLink measures
 #                                           chainMm9Link   chain   linearGap
 #    distance                       on Mm9      on other   minScore
 #  1  0.1587 - rat rn4            (% 68.357)  (% 69.541)   3000     medium
 #  2  0.4677 - human hg18         (% 38.499)  (% 35.201)   3000     medium
 
 #  3  0.4686 - chimp panTro2      (% 37.5xx)  (% 33.6xx)   3000     medium
 #  4  0.4960 - macaque rheMac2    (% 34.7xx)  (% 33.1xx)   3000     medium
 #  5  0.5131 - rabbit oryCun1     (% 19.3xx)  (no swap )   3000     medium
 #  6  0.6142 - armadillo dasNov1  (% 16.8xx)  (no swap )   3000     medium
 #  7  0.6230 - dog canFam2        (% 32.2xx)  (% 34.2xx)   3000     medium
 #  8  0.6256 - elephant loxAfr1   (% 18.3xx)  (no swap )   3000     medium
 #  9  0.6344 - cow bosTau2        (% 26.8xx)  (% 24.2xx)   3000     medium
 # 10  0.7805 - tenrec echTel1     (% 11.4xx)  (no swap )   5000     loose
 # 11  1.0698 - opossum monDom4    (%  8.2xx)  (%  6.0xx)   5000     loose
 # 12  1.3425 - chicken galGal2    (%  2.5xx)  (%  5.4xx)   5000     loose
 # 13  1.7936 - frog xenTro2       (%  2.6xx)  (%  5.3xx)   5000     loose
 # 14  2.0157 - tetraodon tetNig1  (%  1.9xx)  (% 13.7xx)   5000     loose
 # 15  2.0562 - fugu fr1           (%  1.9xx)  (% 13.5xx)   5000     loose
 # 16  2.1059 - zebrafish danRer5  (%  2.1xx)  (%  3.5xx)   5000     loose
 
 ##########################################################################
 ## BLASTZ SWAP from Hg18 to Mm9 (DONE - 2007-08-15 - Hiram)
     #	also in hg18.txt
     cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt
     #	1014323175 bases of 2881515245 (35.201%) in intersection
 
     #	Then to swap over to Mm9
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastz.hg18.swap
     cd /cluster/data/mm9/bed/blastz.hg18.swap
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \
 	-chainLinearGap=medium \
 	/cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 &
     #	real    67m21.146s
     cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt
     #	1008812599 bases of 2620346127 (38.499%) in intersection
     cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link
     #	984380268 bases of 2567283971 (38.343%) in intersection
 
     cd /cluster/data/mm9/bed
     ln -s blastz.hg18.swap blastz.hg18
 
     ## make swapped syntenic net
     cd /cluster/data/mm9/bed/blastz.hg18.swap
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
 	-swap -syntenicNet -chainLinearGap=medium -continue=syntenicNet \
 	/cluster/data/hg18/bed/blastz.mm9/DEF > syntenic.out 2>&1 &
     ##	real    20m49.712s
 
 #########################################################################
 # BLASTZ RAT Rn4 (DONE - 2007-08-30 - Hiram)
 #  re-run this second time with tighter parameters
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-30
     cd /cluster/data/mm9/bed/blastzRn4.2007-08-30
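     #	NOTE: the original comment here was apparently carried over from the
     #	first (2007-08-09) run, which was started before the rsync to
     #	/scratch/data/mm9/ had completed and so used the
     #	/cluster/bluearc/scratch/data/mm9/ location.  The DEF below uses
     #	/scratch/data/mm9/mm9.2bit instead.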
 
     cat << '_EOF_' > DEF
 # mouse vs rat
 # Specially tuned blastz parameters from Webb Miller
 
 BLASTZ_ABRIDGE_REPEATS=0
 BLASTZ_O=600
 BLASTZ_E=150
 BLASTZ_Y=15000
 BLASTZ_T=2
 BLASTZ_K=4500
 BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
 SEQ2_LEN=/cluster/data/rn4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-30
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-stop=net \
 	`pwd`/DEF > blastz.out 2>&1 &
     #  this runs much faster than the usual blastz run
     #	failed when it got to the kki run since /scratch/hg/rn4/ was not
     #	complete on the Iservers.  Fixup that, then, continue:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-continue=chainMerge -stop=net \
 	`pwd`/DEF > chainMerge.out 2>&1 &
     #	And then, kolossus had no /scratch/data/ directory, go there and
     #	make this a symlink to /iscratch/data/
     #	and run the axtChain/netChains.csh script manually on kolossus
 
 #########################################################################
 # BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-08-31 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/mm9/bed/blastzOryLat1.2007-08-30
     cd /cluster/data/mm9/bed/blastzOryLat1.2007-08-30
 
     cat << '_EOF_' > DEF
 # mouse vs medaka
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
 #       chrUn in Scaffolds for this alignment run
 SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzOryLat1.2007-08-30
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    512m56.909s
     #  had a single failed kk job, finished manually, then:
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
     #	real    11m5.508s
     ## typical failure:
     #	HgStepManager: executing step 'net' Fri Aug 31 10:02:51 2007.
     #	netChains: looks like previous stage was not successful (can't find [mm9.oryLat1.]all.chain[.gz]).
     # continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
     #	real    21m33.501s
     cat fb.mm9.chainOryLat1Link.txt
     #	50650171 bases of 2620346127 (1.933%) in intersection
 
     # and the swap
     mkdir /cluster/data/oryLat1/bed/blastz.mm9.swap
     cd /cluster/data/oryLat1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl \
 	/cluster/data/mm9/bed/blastzOryLat1.2007-08-30/DEF \
 	-chainMinScore=5000 -qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     cat fb.oryLat1.chainMm9Link.txt
     #	45488232 bases of 700386597 (6.495%) in intersection
 
 #########################################################################
 # LOAD ACEMBLY (DONE 9/17/07 angie)
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/acembly
     cd /cluster/data/mm9/bed/acembly
     wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.genes_gff.tar.gz
     wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.good_proteins_fasta.tar.gz
     tar xvzf AceView.mm_37.genes_gff.tar.gz
     tar xvzf AceView.mm_37.good_proteins_fasta.tar.gz
 
     cd AceView.mm_37.genes_gff
     # If the result of this command is > 0, then some lines have end < start 
     # and need to be fixed:
     awk '$5 < $4 {print;}' *.gff | wc -l
 #0
 
     # Add "chr" prefix:
     sed -e 's/^/chr/;' x1*.gff > acembly.gff
 
     # Extract annotation types from original gff:
     perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
                  s/Main$/main/ || s/Putative$/putative/ || \
                    die "Unrecognized class:\n$_\n";' *.gff \
     | sort -u \
       > acemblyClass.tab
 
     # Keep tabs on the transcript names that end in -unspliced --
     # the first time around, had to add that suffix to some protein names
     # in order to get all of them to match.  runJoiner is the real test.
     grep unspliced acemblyClass.tab | wc -l
 #54774
 
     # Pare down proteins to just the ones that we have transcripts for:
     cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
     awk '{print $1;}' ../AceView.mm_37.genes_gff/acemblyClass.tab \
       > transcriptNames.txt
     cat *.fasta \
     | faSomeRecords stdin transcriptNames.txt acemblyPep.fa
     grep unspliced acemblyPep.fa | wc -l
 #45033
     # Danielle Thierry-Mieg explained that noncoding genes are included so
     # the number of proteins can be smaller than the number of transcripts.
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/mm9/bed/acembly/AceView.mm_37.genes_gff
     ldHgGene -gtf mm9 acembly acembly.gff
 #Read 173008 transcripts in 2366104 lines in 1 files
 #  173008 groups 21 seqs 1 sources 5 feature types
     hgLoadSqlTab mm9 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
       acemblyClass.tab
     cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
     hgPepPred mm9 generic acemblyPep acemblyPep.fa
     rm acemblyPep.tab
     runJoiner.csh mm9 acembly
 # mm9.acemblyPep.name - hits 149560 of 149560 ok
 # mm9.acemblyClass.name - hits 173008 of 173008 ok
 
 
 #########################################################################
 # BLASTZ RAT Rn4 (DONE - 2007-08-30 - 2007-09-11 - Hiram)
 #  re-run this third time with a special matrix from Bob Harris/Webb Miller
     cat /cluster/data/blastz/mouse_rat.q
 #   A    C    G    T
 #   56 -109  -45 -137
 # -109  100 -103  -45
 #  -45 -103  100 -109
 # -137  -45 -109   56
 # O=600 E=55
 
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-31
     cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
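     #	NOTE: the original comment here was apparently carried over from the
     #	first (2007-08-09) run, which was started before the rsync to
     #	/scratch/data/mm9/ had completed and so used the
     #	/cluster/bluearc/scratch/data/mm9/ location.  The DEF below uses
     #	/scratch/data/mm9/mm9.2bit instead.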
 
     cat << '_EOF_' > DEF
 # mouse vs rat
 # Specially tuned blastz parameters from Webb Miller
 
 BLASTZ_ABRIDGE_REPEATS=0
 BLASTZ_O=600
 BLASTZ_E=55
 BLASTZ_Y=15000
 BLASTZ_T=2
 BLASTZ_K=4500
 BLASTZ_Q=/cluster/data/blastz/mouse_rat.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
 SEQ2_LEN=/cluster/data/rn4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-31
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-stop=net `pwd`/DEF > blastz.out 2>&1 &
     #	real    243m51.078s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-continue=download -stop=download `pwd`/DEF > download.out 2>&1 &
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-continue=cleanup -syntenicNet `pwd`/DEF > syntenicNet.out 2>&1 &
     cat fb.mm9.chainRn4Link.txt
     #	1713186474 bases of 2620346127 (65.380%) in intersection
 
     #	and the swap
     mkdir /cluster/data/rn4/bed/blastz.mm9.swap
     cd /cluster/data/rn4/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
 	-swap -syntenicNet > swap.out 2>&1 &
     #	real    314m59.840s
     cat  fb.rn4.chainMm9Link.txt
     #	1711034941 bases of 2571531505 (66.538%) in intersection
 
 #########################################################################
 # EXONIPHY MM9, lifted from hg18 (DONE - 2007-09-05 - Hiram)
 #	needed for ucscGenes10 building
     # create a syntenic liftOver chain file
     ssh kolossus
     cd /cluster/data/hg18/bed/blastz.mm9/axtChain
     time nice -n +19 netFilter -syn hg18.mm9.net.gz \
 	| netChainSubset -verbose=0 stdin hg18.mm9.all.chain.gz stdout \
 	| chainStitchId stdin stdout | gzip -c > hg18.mm9.syn.chain.gz
     #	real    5m55.575s
     #	slightly smaller than the ordinary liftOver chain file:
 # -rw-rw-r--  1  77849682 Aug 14 16:49 hg18.mm9.over.chain.gz
 # -rw-rw-r--  1  73972671 Sep  5 15:27 hg18.mm9.syn.chain.gz
 
     # exoniphyMm9.gp is prepared as follows
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/exoniphy
     cd /cluster/data/mm9/bed/exoniphy
     hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
     time nice -n +19 liftOver -genePred exoniphyHg18.gp \
 	/cluster/data/hg18/bed/blastz.mm9/axtChain/hg18.mm9.syn.chain.gz \
 	    exoniphyMm9.gp unmapped
     #	real    52m0.335s
     wc -l *
     #	178162 exoniphyHg18.gp
     #	172859 exoniphyMm9.gp
     #	 10606 unmapped
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/exoniphy
     nice -n +19 hgLoadGenePred -genePredExt mm9 exoniphy exoniphyMm9.gp
     nice -n +19 featureBits mm9 exoniphy
     #	25931742 bases of 2620346127 (0.990%) in intersection
     nice -n +19 featureBits mm8 exoniphy
     #	25952211 bases of 2567283971 (1.011%) in intersection
 
 #########################################################################
 # BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
     ssh kkstore06
     # establish a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
     cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
 
     cat << '_EOF_' > DEF
 # mouse vs dog
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_SMSK=/scratch/data/mm9/notInOthers
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Dog CanFam2
 SEQ2_DIR=/scratch/hg/canFam2/nib
 SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
 SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzCanFam2.2007-09-04
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	real    871m24.249s
     cat fb.mm9.chainCanFam2Link.txt
     #	848004408 bases of 2620346127 (32.362%) in intersection
 
     mkdir /cluster/data/canFam2/bed/blastz.mm9.swap
     cd /cluster/data/canFam2/bed/blastz.mm9.swap
 
     time /cluster/bin/scripts/doBlastzChainNet.pl \
 	/cluster/data/mm9/bed/blastzCanFam2.2007-09-04/DEF \
 	-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
 	-chainLinearGap=medium -swap > swap.out 2>&1 &
     #	real    57m59.126s
     cat fb.canFam2.chainMm9Link.txt
     #	832145360 bases of 2384996543 (34.891%) in intersection
 
     #	need syntenic net for the multiz
     cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-syntenicNet -continue=syntenicNet `pwd`/DEF > synNet.out 2>&1 &
     #	real    19m1.302s
 
 #########################################################################
 # BLASTZ/CHAIN/NET RHEMAC2 (DONE - 2007-09-05 - Hiram)
     # Won't put this in Conservation -- special request for ancestor recon.
     ssh kkstore06
     #	use a screen to control this job
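     # XXX note for next time, naming convention is different here than all the
     # others, there is a missing TMPDIR in the DEF file, and the DEF sets
     # SEQ1_CHUNK twice (the second time in the QUERY section) while
     # SEQ2_CHUNK is never set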
     screen
     mkdir /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
     cd /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
 
 
     cat << '_EOF_' > DEF
 # Mouse vs. macaque
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_SMSK=/scratch/data/mm9/notInOthers
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Macaque (rheMac2)
 SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
 SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent
 SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
       -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
       -syntenicNet `pwd`/DEF > do.log 2>&1 &
     #	real    1017m13.247s
     # some kk kluster difficulties, fixup and complete manually
 # Completed: 87616 of 87616 jobs
 # CPU time in finished jobs:   26547195s  442453.25m  7374.22h  307.26d  0.842 y
 # IO & Wait Time:               3384143s   56402.38m   940.04h   39.17d  0.107 y
 # Average job time:                 342s       5.69m     0.09h    0.00d
 # Longest finished job:            3159s      52.65m     0.88h    0.04d
 # Submission to last job:         65814s    1096.90m    18.28h    0.76d
     #	then, continuing
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
       -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
       -continue=cat -syntenicNet `pwd`/DEF > cat.log 2>&1 &
     #	real    255m52.382s
     cat fb.mm9.chainRheMac2Link.txt
     #	998017006 bases of 2620346127 (38.087%) in intersection
     mkdir /cluster/data/rheMac2/bed/blastz.mm9.swap
     cd /cluster/data/rheMac2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05/DEF \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap -syntenicNet > swap.log 2>&1 &
     #	real    178m31.911s
     cat fb.rheMac2.chainMm9Link.txt
     #	1094006509 bases of 2646704109 (41.335%) in intersection
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET Orangutan ponAbe1 (DONE - 2007-09-05 - Hiram)
     ssh kkstore01
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
     cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
 
     #	next time, have SEQ2_CHUNK at 30000000 and SEQ2_LIMIT at 100
     #	this caused over 500,000 pk jobs, that is too many
     cat << '_EOF_' > DEF
 # mouse vs orangutan
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Orangutan ponAbe1
 SEQ2_DIR=/scratch/data/ponAbe1/ponAbe1.2bit
 SEQ2_LEN=/cluster/data/ponAbe1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
     #	real    897m58.156s
     #	some pk kluster difficulties, fixup and complete manually
 # Completed: 511290 of 511290 jobs
 # CPU time in finished jobs:   11448015s  190800.24m  3180.00h  132.50d  0.363 y
 # IO & Wait Time:               1852197s   30869.96m   514.50h   21.44d  0.059 y
 # Average job time:                  26s       0.43m     0.01h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             349s       5.82m     0.10h    0.00d
 # Submission to last job:         54771s     912.85m    15.21h    0.63d
     #	then, continuing
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=cat -stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
     #	ran into trouble on the kki chain run with stuff missing
     #	from the Iservers /scratch/data/ - rsync them up and get
     #	the run done manually
 # Completed: 24 of 24 jobs
 # CPU time in finished jobs:      17718s     295.30m     4.92h    0.21d  0.001 y
 # IO & Wait Time:                   203s       3.38m     0.06h    0.00d  0.000 y
 # Average job time:                 747s      12.45m     0.21h    0.01d
 # Longest finished job:            3673s      61.22m     1.02h    0.04d
 # Submission to last job:          3886s      64.77m     1.08h    0.04d
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=chainMerge -stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > chainMerge.log 2>&1 &
     #	real    55m27.522s
     cat fb.mm9.chainPonAbe1Link.txt
     #	913843325 bases of 2620346127 (34.875%) in intersection
 
     mkdir /cluster/data/ponAbe1/bed/blastz.mm9.swap
     cd /cluster/data/ponAbe1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05/DEF \
 	-stop=load -chainMinScore=3000 \
 	-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
 
     # create the syntenic maf nets:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=download -syntenicNet -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
     #	real 20m55.024s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 ponAbe1 \
 	> rbest.log 2>&1 &
     #	real    53m43.377s
 
 #########################################################################
 # BLASTZ/CHAIN/NET Marmoset calJac1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
     ssh kkstore06
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
     cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
 
     #	next time, try SEQ2_CHUNK at 40000000, SEQ2_LIMIT at 75
     #	this created 285,570 kluster jobs, that is too many
     cat << '_EOF_' > DEF
 # mouse vs marmoset
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Marmoset calJac1
 SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
 SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzCalJac1.2007-09-06
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
     #	real    897m58.156s
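     #	some pk kluster difficulties, fixup and complete manually
     #	NOTE: the elapsed time above and the para stats below are identical to
     #	those recorded for the ponAbe1 run and disagree with the 285,570 job
     #	count noted for this DEF; they appear to be a copy/paste left-over
     #	rather than the actual calJac1 numbers.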
 # Completed: 511290 of 511290 jobs
 # CPU time in finished jobs:   11448015s  190800.24m  3180.00h  132.50d  0.363 y
 # IO & Wait Time:               1852197s   30869.96m   514.50h   21.44d  0.059 y
 # Average job time:                  26s       0.43m     0.01h    0.00d
 # Longest finished job:             349s       5.82m     0.10h    0.00d
 # Submission to last job:         54771s     912.85m    15.21h    0.63d
     #	then, continuing
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=cat -stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
     #	real    669m34.473s
     cat fb.mm9.chainCalJac1Link.txt
     #	863961573 bases of 2620346127 (32.971%) in intersection
 
     mkdir /cluster/data/calJac1/bed/blastz.mm9.swap
     cd /cluster/data/calJac1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
 	-stop=load -chainMinScore=3000 \
 	-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
     #	real    217m10.835s
     cat fb.calJac1.chainMm9Link.txt
     #	887586922 bases of 2929139385 (30.302%) in intersection
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
 	-continue=download -chainMinScore=3000 \
 	-swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 &
     #	real    1m9.876s
 
     #	run the syntenic nets
     time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \
 	-continue=download -chainMinScore=3000 \
 	-syntenicNet -chainLinearGap=medium -bigClusterHub=pk \
 	> syntenicNet.log 2>&1 &
     #	real 22m51.080s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 calJac1 \
 	> rbest.log 2>&1 &
     #	real    47m18.467s
 
 #########################################################################
 # BLASTZ/CHAIN/NET Fugu fr2 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
     ssh kkstore02
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzFr2.2007-09-06
     cd /cluster/data/mm9/bed/blastzFr2.2007-09-06
 
     cat << '_EOF_' > DEF
 # mouse vs fugu
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Fugu fr2
 #       Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
 SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzFr2.2007-09-06
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    156m55.151s
     # fixup broken kluster jobs, complete manually
 # Completed: 70395 of 70395 jobs
 # CPU time in finished jobs:    4339015s   72316.91m  1205.28h   50.22d  0.138 y
 # IO & Wait Time:                486414s    8106.90m   135.12h    5.63d  0.015 y
 # Average job time:                  69s       1.14m     0.02h    0.00d
 # Longest finished job:            1098s      18.30m     0.30h    0.01d
 # Submission to last job:         18352s     305.87m     5.10h    0.21d
     # and then continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
     #	real    5m43.977s
 
     #	Still, the typical failure
 # HgStepManager: executing step 'net' Thu Sep  6 16:04:56 2007.
 # netChains: looks like previous stage was not successful (can't find [mm9.fr2.]all.chain[.gz]).
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
     #	real    178m15.798s
      cat fb.mm9.chainFr2Link.txt
     #	47018710 bases of 2620346127 (1.794%) in intersection
 
     mkdir /cluster/data/fr2/bed/blastz.mm9.swap
     cd /cluster/data/fr2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
 	/cluster/data/mm9/bed/blastzFr2.2007-09-06/DEF \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     #	real    15m32.368s
     cat fb.fr2.chainMm9Link.txt
     #	42413565 bases of 393312790 (10.784%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET Tetraodon tetNig1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
     ssh kkstore01
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
     cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
 
     cat << '_EOF_' > DEF
 # mouse vs tetraodon
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Tetraodon tetNig1
 #       Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzTetNig1.2007-09-06
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    535m2.474s
     #	Typical failure
 # HgStepManager: executing step 'net' Fri Sep  7 01:13:06 2007.
 # netChains: looks like previous stage was not successful (can't find [mm9.tetNig1.]all.chain[.gz]).
     # continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
     cat fb.mm9.chainTetNig1Link.txt
     #	46206292 bases of 2620346127 (1.763%) in intersection
 
     mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap
     cd /cluster/data/tetNig1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \
 	-chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-swap -bigClusterHub=kk > swap.log 2>&1 &
     #	real    19m58.885s
     cat fb.tetNig1.chainMm9Link.txt
     #	42256263 bases of 342403326 (12.341%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
     ssh kkstore01
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
     cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
 
     cat << '_EOF_' > DEF
 # mouse vs stickleback
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: stickleback gasAcu1
 SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
 SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
 SEQ2_CHUNK=35000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
 # Completed: 52725 of 52725 jobs
 # CPU time in finished jobs:    4110432s   68507.19m  1141.79h   47.57d  0.130 y
 # IO & Wait Time:                413069s    6884.49m   114.74h    4.78d  0.013 y
 # Average job time:                  86s       1.43m     0.02h    0.00d
 # Longest finished job:            1140s      19.00m     0.32h    0.01d
 # Submission to last job:         71194s    1186.57m    19.78h    0.82d
     #	had some jobs fail on the kk run, finish manually, then continuing:
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
     #	real    120m36.209s
     # failed kki chain job due to san outage on kkr7u00, finished manually:
 # Completed: 24 of 24 jobs
 # CPU time in finished jobs:       1807s      30.12m     0.50h    0.02d  0.000 y
 # IO & Wait Time:                   258s       4.29m     0.07h    0.00d  0.000 y
 # Average job time:                  86s       1.43m     0.02h    0.00d
 # Longest finished job:             257s       4.28m     0.07h    0.00d
 # Submission to last job:          9851s     164.18m     2.74h    0.11d
     #	continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 -verbose=2 \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-continue=chainMerge -bigClusterHub=kk > chainMerge.log 2>&1 &
     #	real    21m7.089s
     cat fb.mm9.chainGasAcu1Link.txt
     #	48448585 bases of 2620346127 (1.849%) in intersection
 
     mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap
     cd /cluster/data/gasAcu1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
 	/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \
 	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
 	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     cat fb.gasAcu1.chainMm9Link.txt
     #	43730193 bases of 446627861 (9.791%) in intersection
 
 
 #########################################################################
 # BLASTZ Zebrafish danRer5 (DONE - 2007-09-11 - 2007-09-12 - Hiram)
 #	re-run a second time with BLASTZ_Q, see below
     ssh kkstore06
     screen	# use screen to manage this job
     mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
     cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
 
     cat << '_EOF_' > DEF
 # Mouse (mm9) vs zebrafish (danRer5)
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - zebrafish (danRer5)
 SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
 SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-11
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
     #	real    222m47.787s
     cat fb.mm9.chainDanRer5Link.txt
     #	48497464 bases of 2620346127 (1.851%) in intersection
 
     mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
     cd /cluster/data/danRer5/bed/blastz.mm9.swap
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-chainMinScore=5000 \
 	/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
 	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> swap.log 2>&1 &
     #	real    9m47.163s
     cat fb.danRer5.chainMm9Link.txt
     #	34017483 bases of 1435609608 (2.370%) in intersection
 
 #########################################################################
 # BLASTZ Zebrafish danRer5 (DONE - 2007-09-13 - Hiram)
 #	second time, forgot to include BLASTZ_Q the first time
     ssh kkstore06
     screen	# use screen to manage this job
     mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-13
     cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-13
 
     #	This is the wrong way overlap, but it seems to work
     cat << '_EOF_' > DEF
 # Mouse (mm9) vs zebrafish (danRer5)
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - zebrafish (danRer5)
 SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
 SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
     #	real    369m16.947s
     cat fb.mm9.chainDanRer5Link.txt
     #	84513268 bases of 2620346127 (3.225%) in intersection
 
     mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
     cd /cluster/data/danRer5/bed/blastz.mm9.swap
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-chainMinScore=5000 \
 	/cluster/data/mm9/bed/blastzDanRer5.2007-09-13/DEF \
 	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> swap.log 2>&1 &
     #	real    21m44.784s
     cat fb.danRer5.chainMm9Link.txt
     #	66400782 bases of 1435609608 (4.625%) in intersection
 
 #########################################################################
 # BLASTZ/CHAIN/NET Guinea Pig cavPor2 (DONE - 2007-09-19 - kate)
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/blastzCavPor2.2007-09-19
     cd /cluster/data/mm9/bed/blastzCavPor2.2007-09-19
 
     cat << '_EOF_' > DEF
 # mouse vs guinea pig
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Guinea pig cavPor2
 SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes
 
 # chunking similar to cat (similar number of scaffolds)
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=500
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzCavPor2.2007-09-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
         -bigClusterHub=pk >& do.log  &
 
     # load nets manually -- automated loading fails as classification info 
     #  not available (no database)
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastz.cavPor2/axtChain
     netFilter -minGap=10 noClass.net | hgLoadNet -warn mm9 netCavPor2 stdin
     netFilter -minGap=10 mm9.cavPor2.rbest.net.gz |  \
         hgLoadNet -warn mm9 netRBestCavPor2 stdin
 
     doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
         -continue=download >& do2.log &
 
     # reciprocal best net mafs for multiz
     ~/kent/src/hg/utils/automation/doRecipBest.pl mm9 cavPor2 >&! rbest.log &
 
     time nice -n +19 featureBits mm9 chainCavPor2Link \
 	> fb.mm9.chainCavPor2Link.txt 2>&1
     cat fb.mm9.chainCavPor2Link.txt
     #	480194223 bases of 2620346127 (18.326%) in intersection
 
     #	create the syntenic maf nets (these are unneeded):
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
 	-chainLinearGap=medium -continue=syntenicNet -syntenicNet \
 	-bigClusterHub=pk > syntenicNet.log 2>&1
 
 #########################################################################
 ## 4-Way Multiz (DONE - 2007-09-07 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/multiz4way
     cd /cluster/data/mm9/bed/multiz4way
 
     ln -s ../multiz30way/mm9.guess.30way.nh ./30way.nh
 
     # leave mm9, rn4, canFam2 and hg18
     /cluster/bin/phast/tree_doctor \
 	--prune panTro2,ponAbe1,rheMac2,calJac1,otoGar1,tupBel1,cavPor2,oryCun1,sorAra1,eriEur1,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer4  30way.nh
 
     # this leaves us with:
     
     cat << '_EOF_' > 4way.nh
 ((hg18:0.126901,
 	(rn4:0.084383,mm9:0.076274):0.249544):0.019763,canFam2:0.187963);
 '_EOF_'
     # << happy emacs
 
     #	Use this specification in the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to obtain a gif image for htdocs/images/phylo/mm9_4way.gif
 
     /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
     #	Use this output to create the table below
     grep -y mm9 4way.distances.txt | sort -k3,3n
 #
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 #
 #                         featureBits chainLink measures
 #                                        chainOryLat1Link   chain    linearGap
 #    distance                      on mm9    on other   minScore
 #  1  0.160657 - rat rn4       (% 65.380) (% xx.xxx)       5000     medium
 #  2  0.452719 - human hg18    (% 38.499) (% 35.201)       3000     medium
 #  3  0.533544 - dog canFam2   (% 32.362) (% 34.891)       3000     medium
 
     #	using the syntenic nets
     cd /cluster/data/mm9/bed/multiz4way
     mkdir mafLinks
     mkdir mafLinks/rn4
     cd mafLinks/rn4
     ln -s ../../../blastzRn4.2007-08-31/mafSynNet/*.maf.gz .
     mkdir ../hg18
     cd ../hg18
     ln -s ../../../blastz.hg18/mafSynNet/*.maf.gz .
     mkdir ../canFam2
     cd ../canFam2
     ln -s ../../../blastz.canFam2/mafSynNet/*.maf.gz .
 
     #	Copy MAFs to some appropriate NFS server for kluster run
     mkdir /san/sanvol1/scratch/mm9/multiz4way
     cd /san/sanvol1/scratch/mm9/multiz4way
     time nice -n +19 rsync -a --copy-links --progress \
 	/cluster/data/mm9/bed/multiz4way/mafLinks/ .
     #	1 minute to copy 2.4 Gb
 
     #	determine what is the newest version of multiz and use that
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn
 
     # the autoMultiz cluster run
     ssh pk
     cd /cluster/data/mm9/bed/multiz4way
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	4way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     mkdir run maf
     cd run
 
     #	NOTE: you need to set the db and multiz dirname properly in this script
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = mm9
 set c = $1
 set maf = $2
 set binDir = /san/sanvol1/scratch/$db/multiz4way/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz4way
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz4way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 35 jobs
     para try ... check ... push ... etc ...
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:      27901s     465.02m     7.75h    0.32d  0.001 y
 # IO & Wait Time:                   562s       9.37m     0.16h    0.01d  0.000 y
 # Average job time:                 813s      13.55m     0.23h    0.01d
 # Longest finished job:            2222s      37.03m     0.62h    0.03d
 # Submission to last job:          2222s      37.03m     0.62h    0.03d
 
     #	combine results into a single file for loading and gbdb reference
     ssh kkstore06
     cd /cluster/data/mm9/bed/multiz4way
     time nice -n +19 catDir maf > multiz4way.maf
     #	real    2m43.409s
 
     #	makes a 6.5 Gb file:
     #	-rw-rw-r--  1 6883356263 Sep  7 11:00 multiz4way.maf
 
     #	Create per-chrom individual maf files for downloads
     #	NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS
     ssh kkstore04
     cd /cluster/data/mm9/bed/multiz4way
     mkdir mafDownloads
     time for M in maf/chr*.maf
     do
 	B=`basename $M`
 	cp -p ${M} mafDownloads/${B}
 	gzip mafDownloads/${B}
 	echo ${B} done
     done
     #	real    5m9.273
 
     #	deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH
     #		THE ANNOTATED MAFS
     ssh hgwdev
     ln -s /cluster/data/mm9/bed/multiz4way/mafDownloads \
 	/usr/local/apache/htdocs/goldenPath/mm9/multiz4way
 
     # Load into database
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz4way
     mkdir /gbdb/mm9/multiz4way
     ln -s /cluster/data/mm9/bed/multiz4way/multiz4way.maf \
 	/gbdb/mm9/multiz4way
     time nice -n +19 hgLoadMaf mm9 multiz4way
     #	Loaded 5072051 mafs in 1 files from /gbdb/mm9/multiz4way
     #	real    2m33.680s
 
     time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 	-maxSize=50000 mm9 multiz4waySummary multiz4way.maf
     #	Created 1330454 summary blocks from 9893113 components
     #	and 5068764 mafs from multiz4way.maf
     #	real    3m27.620s
 
     #	Create tree image for details page
     #	You can get a better image from the phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     # with mm9 on top:
 (((mouse_mm9:0.076274,rat_rn4:0.084383):0.249544,human_hg18:0.126901):0.019763,
 dog_canFam2:0.187963);
 
 #########################################################################
 ### GNF ATLAS 2 - required for UCSC Gene/Gene Sorter build
 #	(DONE - 2007-09-10 - Hiram)
     # Align probes from GNF1M chip.
     ssh pk
     mkdir -p /cluster/data/mm9/bed/geneAtlas2/run/psl
     cd /cluster/data/mm9/bed/geneAtlas2/run
 
     cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
 
     ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > probe.list
 
     cat << '_EOF_' > template
 #LOOP
 blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 genome.list probe.list template jobList
     para create jobList
     para try ... check ... push ... etc.
     para time
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:      14865s     247.75m     4.13h    0.17d  0.000 y
 # IO & Wait Time:                   160s       2.66m     0.04h    0.00d  0.000 y
 # Average job time:                 429s       7.15m     0.12h    0.00d
 # Longest finished job:            1151s      19.18m     0.32h    0.01d
 # Submission to last job:          1166s      19.43m     0.32h    0.01d
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyGnf1m.psl.
     pslSort dirs raw.psl tmp psl
     pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
 	../affyGnf1m.psl /dev/null
 
     # Load probes and alignments from GNF1M into database.
     ssh hgwdev
     cd /cluster/data/mm9/bed/geneAtlas2
 #    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
     hgLoadPsl mm9 affyGnf1m.psl
     hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/gnf1m.fa
     #	31309 sequences
 
     # Load up track
     hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
     	affyGnf1m.psl
     #Loaded 34863 rows of expression data from hgFixed.gnfMouseAtlas2MedianRatio
     #	Mapped 30117,  multiply-mapped 1723, missed 882, unmapped 4746
 
     # Note that the unmapped 5000 records are from all-N sequences.
     hgLoadBed mm9 gnfAtlas2 gnfAtlas2.bed
     #	Loaded 31840 elements of size 15
     featureBits mm9 gnfAtlas2
     #	12921627 bases of 2620346127 (0.493%) in intersection
     featureBits mm8 gnfAtlas2
     #	12858280 bases of 2567283971 (0.501%) in intersection
 
     #	during the build of UCSC genes, this sequence takes place:
     hgMapToGene mm9 affyGnf1m knownGene knownToGnf1m
     hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
 	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
     #	this hgExpDistance command takes some time, maybe an hour or so ?
     #	Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
     #	Got 31145 unique elements in hgFixed.gnfMouseAtlas2MedianRatio
     hgMapToGene mm9 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
 
 ############################################################################
 ### affyU74 TRACK - needed for the Gene Sorter (DONE - 2007-09-10 - Hiram)
 #                              
 # MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
 # target sequences. Recalculate alignments and load data
 #
 #	The affy data has previously been loaded to iscratch in:
 #	/iscratch/i/affy
 # It originates from:
 # /projects/compbio/data/microarray/affyGnfMouse/sequences/
 
     # Run cluster job to do alignments
     ssh kk
     mkdir -p /cluster/data/mm9/bed/affyU74/run/psl
     cd /cluster/data/mm9/bed/affyU74/run
     cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
     ls -1 /iscratch/i/affy/U74*consensus.fa > affy.list
     cat << '_EOF_' > template
 #LOOP
 blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 genome.list affy.list template jobList
     para create jobList
     para try ... check ... push ... etc.
     para time
 # Completed: 105 of 105 jobs
 # CPU time in finished jobs:       5891s      98.18m     1.64h    0.07d  0.000 y
 # IO & Wait Time:                   738s      12.31m     0.21h    0.01d  0.000 y
 # Average job time:                  63s       1.05m     0.02h    0.00d
 # Longest finished job:             199s       3.32m     0.06h    0.00d
 # Submission to last job:           215s       3.58m     0.06h    0.00d
 
 # Do sort, best in genome filter, and convert to chromosome coordinates
 # to create affyU74.psl.
     pslSort dirs raw.psl tmp psl
 
 # change filter parameters for these sequences. only use alignments that
 # cover 30% of sequence and have at least minAli = 0.95.
 # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
 #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
     pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl \
 	../all_affyU74.psl /dev/null
     #	Processed 40512 alignments
 
 # Sort by chromosome and load into database.
     ssh hgwdev
     cd /cluster/data/mm9/bed/affyU74
     pslSortAcc nohead chrom temp all_affyU74.psl
     #	Processed 30609 lines into 1 temp files
     cat chrom/*.psl > affyU74.psl
 
 # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 # and reload data into table
 
     mv affyU74.psl affyU74.psl.orig
 
     cut -f 1-9 affyU74.psl.orig >j1.tmp
     cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
     cut -f 11-21 affyU74.psl.orig >j3.tmp
     paste j1.tmp j2.tmp j3.tmp >affyU74.psl
 
     hgLoadPsl mm9 affyU74.psl
     rm -rf chrom temp run j?.tmp
 
     #	creating the gene sorter tables runs the following:
     hgMapToGene mm9 affyU74  knownGene knownToU74
 
 ############################################################################
 ##   MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
 # Make bed files and load consensus sequences for Affy U74 chip set.
 
     #This needs to be done after affyU74 is already made.
     ssh hgwdev
     mkdir -p /cluster/data/mm9/bed/affyGnf
     cd /cluster/data/mm9/bed/affyGnf
 #	may need to build this command in src/hg/affyGnf
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
 	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
 # 89 experiments
 # 10043 rows of expression data
 # 30609 records in ../affyU74/affyU74.psl
 # 10309 records written to affyGnfU74A.bed
 
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
 	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
 # 20 experiments
 # 12477 rows of expression data
 # 30609 records in ../affyU74/affyU74.psl
 # 11324 records written to affyGnfU74B.bed
 
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
 	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
 # 20 experiments
 # 11934 rows of expression data
 # 30609 records in ../affyU74/affyU74.psl
 # 7773 records written to affyGnfU74C.bed
 
 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 #   (these files do not appear to have these long names in them to begin with)
     mkdir sav
     mv *.bed sav
     sed -e "s/U74Av2://" sav/affyGnfU74A.bed > affyGnfU74A.bed
     sed -e "s/U74Bv2://" sav/affyGnfU74B.bed > affyGnfU74B.bed
     sed -e "s/U74Cv2://" sav/affyGnfU74C.bed > affyGnfU74C.bed
 
     # and reload data into table
     hgLoadBed mm9 affyGnfU74A affyGnfU74A.bed
     #	Loaded 10309 elements of size 15
     hgLoadBed mm9 affyGnfU74B affyGnfU74B.bed
     #	Loaded 11324 elements of size 15
     hgLoadBed mm9 affyGnfU74C affyGnfU74C.bed
     #	Loaded 7773 elements of size 15
 
     # Add in sequence data for U74 tracks.
     #	This business is already in gbdb - 2007-09-10 - Hiram
     #	You do not need to repeat this symlink sequence
     # Copy consensus sequence to /gbdb if it isn't already
     #    mkdir -p /gbdb/hgFixed/affyProbes
     cd /gbdb/hgFixed/affyProbes
     # fix broken symlinks after directory structure changed
     # /projects/compbiodata ----> /projects/compbio/data
     rm U74*
     # make correct symlinks (hartera, 2005-05-03)
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
 
     # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
     # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
     # reload sequences with prefix removed so acc matches name used in
     # other dependent tables
                                                     
     hgLoadSeq -abbr=U74Av2: mm9 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
     #	12422 sequences
     hgLoadSeq -abbr=U74Bv2: mm9 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
     #	12411 sequences
     hgLoadSeq -abbr=U74Cv2: mm9 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
     #	11868 sequences
 
     #	building the gene sorter runs the following commands
     hgExpDistance mm9 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
 	-lookup=knownToU74
     #	real    7m6.223s
     #	Have 9636 elements in affyGnfU74A
     #	Got 15902 unique elements in affyGnfU74A
     hgExpDistance mm9 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
 	-lookup=knownToU74
     #	real    2m12.727s
     #	Have 11025 elements in affyGnfU74B
     #	Got 10442 unique elements in affyGnfU74B
     hgExpDistance mm9 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
 	-lookup=knownToU74
     #	real    0m29.270s
     #	Have 7487 elements in affyGnfU74C
     #	Got 3259 unique elements in affyGnfU74C
 
 ##########################################################################
 # BUILD NIBB IMAGE PROBES (DONE - 2007-09-10 - Hiram)
     ssh pk
     mkdir -p /cluster/data/mm9/bed/nibbPics/run
     cd /cluster/data/mm9/bed/nibbPics
     cp -p /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
     cd run
     mkdir psl
     ls -1 /scratch/data/mm9/nib/*.nib > genome.list
     echo ../nibbImageProbes.fa > probe.list
 
 # Create parasol gensub template file
 cat << '_EOF_' > template
 #LOOP
 blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
 # Create parasol batch
     gensub2 genome.list probe.list template jobList
     para create jobList
     para try ... check ... push ... etc... time
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:       9983s     166.39m     2.77h    0.12d  0.000 y
 # IO & Wait Time:                   146s       2.43m     0.04h    0.00d  0.000 y
 # Average job time:                 289s       4.82m     0.08h    0.00d
 # Longest finished job:             729s      12.15m     0.20h    0.01d
 # Submission to last job:           729s      12.15m     0.20h    0.01d
 
 # Make sort and filter
     catDir psl | sort -k 10 \
         | pslReps stdin stdout /dev/null -nohead -minAli=0.60 \
 		-nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
 	| sort -k 14,14 -k 16,16n \
 	| sed 's#/scratch/data/mm9/nib/chr#chr#' \
 	| sed 's/.nib//' > ../nibbImageProbes.psl
 
 # Make bed file and copy in stuff
     ssh hgwdev
     cd /cluster/data/mm9/bed/nibbPics
 
 # Load into database
     ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
 	/gbdb/mm9/nibbImageProbes.fa
     hgLoadSeq mm9 /gbdb/mm9/nibbImageProbes.fa
     hgLoadPsl mm9 nibbImageProbes.psl
 
 #########################################################################
 # Creating visiGene tables for gene sorter business
 #	(DONE - 2007-09-10 - Hiram)
     #	This business has cumulative effects on the visiGene database
     #	for safety purposes, backup the visiGene database
     ssh hgwdev
     mkdir -p /cluster/data/mm9/bed/vgProbes/visiGene.bak
     cd /cluster/data/mm9/bed/vgProbes/visiGene.bak
     hgsqldump --all -c --tab=. visiGene
 
     cd /cluster/data/mm9/bed/vgProbes
     mkdir working
     cd /cluster/data/mm9/bed/vgProbes
     cp -p ~/kent/src/hg/visiGene/vgProbeTrack/*.sql .
     #	this SEQ appears to find nothing new ?
     vgProbeTrack SEQ working mm9
 rc = 0 = count of primers for mrna search for taxon 10090
 rc = 0 = count of primers for genome search for taxon 10090
 bac list read done.
 found seq for 0 bacEndPairs
 rc = 0 = count of refSeq mrna for mm9
 rc = 0 = count of genRef mrna for mm9
 rc = 0 = count of genbank mrna for mm9
 rc = 0 = count of flatRef mrna for mm9
 rc = 0 = count of flatAll mrna for mm9
 rc = 0 = count of linkRef mrna for mm9
 rc = 0 = count of linkAll mrna for mm9
 rc = 0 = count of kgAlRef mrna for mm9
 rc = 0 = count of kgAlAll mrna for mm9
 
     #	and then, this creates the vgProbes table in mm9
     vgProbeTrack ALI working mm9 -sqlPath=..
     hgsql -e "select count(*) from vgProbes;" mm9
     #	24924
     hgsql -e "select count(*) from vgProbes;" mm8
     #	24615
 
     #	this appears to build working/vgPrbExt.fa and it loaded some sequences
     vgProbeTrack EXT working mm9
     #	this copies over all the items from vgProbes to start vgAllProbes
     vgProbeTrack SELFMAP working mm9 -sqlPath=..
     #	this adds frog alignments to vgAllProbes
     vgProbeTrack -sqlPath=.. REMAP working mm9 nibb nibbImageProbes \
 	/gbdb/mm9/nibbImageProbes.fa
     hgsql -e "select count(*) from vgAllProbes;" mm9
     #	26289
     hgsql -e "select count(*) from vgAllProbes;" mm8
     #	25994
 
     #	finally, gathering together all alignments used and updates seq table
     vgProbeTrack EXTALL working mm9
 
     #	Then, during the gene sorter build, it does:
     knownToVisiGene mm9
     vgGetText visiGene.text mm7 mm8 mm9 hg17 hg18
     #	probe has 26611 rows
     #	gene has 20413 rows
     #	imageProbe has 125765 rows
     wc -l visiGene.text
     #	124186 visiGene.text
     #	compare to existing:
     wc -l /usr/local/apache/cgi-bin/visiGeneData/visiGene.text
     #	124186 /usr/local/apache/cgi-bin/visiGeneData/visiGene.text
 
 #########################################################################
 # Create Allen Brain Atlas mapping. (DONE - 2007-09-24 - Hiram)
 
 # Set up directory
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/allenBrain
     cd /cluster/data/mm9/bed/allenBrain
 
     # find most recent update of allProbes.fa to use for these alignments
 
     cp -p /cluster/data/mm6/bed/allenBrain/allProbes.fa ./allenBrainProbes.fa
     cp -p /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .
 
 # Set up a blat run to align the probes.
     mkdir split
     faSplit sequence allenBrainProbes.fa 200 split/rp
     mkdir run
     ssh pk
     cd /cluster/data/mm9/bed/allenBrain/run
     ls -1 ../split/*.fa > probe.list
     ls -1 /scratch/data/mm9/nib/*.nib > genome.list
     mkdir psl
     cat << '_EOF_' > template
 #LOOP
 runBlat $(path1) $(path2) $(root1) $(root2) {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     cat << '_EOF_' > runBlat
 #!/bin/csh -ef
 set ooc = /scratch/data/mm9/11.ooc
 set tmpDir = /scratch/tmp/mm9
 set workDir = $tmpDir/$3_$4
 set pslOut = $3_$4.psl
 mkdir -p $tmpDir
 mkdir $workDir
 blat -ooc=$ooc $1 $2 $workDir/$pslOut
 mv $workDir/$pslOut psl/$pslOut
 rmdir $workDir
 rmdir --ignore-fail-on-non-empty $tmpDir
 '_EOF_'
     # << happy emacs
     chmod +x runBlat
 
     gensub2 genome.list probe.list template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 6790 of 6790 jobs
 # CPU time in finished jobs:      28129s     468.81m     7.81h    0.33d  0.001 y
 # IO & Wait Time:                 23014s     383.57m     6.39h    0.27d  0.001 y
 # Average job time:                   8s       0.13m     0.00h    0.00d
 # Longest finished job:              29s       0.48m     0.01h    0.00d
 # Submission to last job:           363s       6.05m     0.10h    0.00d
 
 # Then do sorting and near-best-in-genome step on file server
     ssh kkstore06
     cd /cluster/data/mm9/bed/allenBrain/run
     pslSort dirs raw.psl tmp psl
     pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 \
 	-nearTop=0.001 /dev/null
     #	Processed 63183 alignments
     sort -k14,14 -k16,16n ../best.psl > ../allenBrainAli.psl
 
 # Clean up big files no longer needed
    rm raw.psl batch.bak
    rm -r psl
    rm -r ../split
 
 # Load up database
    ssh hgwdev
    cd /cluster/data/mm9/bed/allenBrain
 
 # Make a new table that contains the URLs for the allen brain genes
 # Make this one first since all.joiner considers it the master table.
    hgsql mm9 < ~/kent/src/hg/lib/allenBrainUrl.sql
    hgsql mm9 -e \
 	'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'
 
 # Make probe alignment table, and load sequence.
    hgLoadPsl mm9 allenBrainAli.psl
    mkdir /gbdb/mm9/allenBrain
    ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa \
 	/gbdb/mm9/allenBrain/allenBrainProbes.fa
    hgLoadSeq -replace mm9 /gbdb/mm9/allenBrain/allenBrainProbes.fa
 
 # Make mapping between known genes and allenBrain	
    hgMapToGene mm9 allenBrainAli -type=psl knownGene knownToAllenBrain 
 
 #########################################################################
 # MOUSE AFFYMETRIX MOE430 TRACK (DONE - 2007-09-10 - Hiram)
 #    mkdir -p /projects/compbio/data/microarray/affyMouse
     # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
     # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
 #    unzip MOE430*_consensus.zip
 
     # check for duplicate probes: there are none, all have unique names
     # check for duplicate probes: 100 from 136745_at to 1367551_a_at
     # remove "consensus:" and ";" from FASTA headers to shorten probeset
     # names for database
 
 #    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
 #    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
  
 #    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
 #       /cluster/bluearc/affy/
 
     # (THE ABOVE WAS ALREADY DONE)
 
     # Set up cluster job to align MOE430 consensus sequences to mm9
 
     ssh kk
     mkdir /cluster/data/mm9/bed/affyMOE430
     cd /cluster/data/mm9/bed/affyMOE430
 
     ls -1 /iscratch/i/affy/MOE430_all.fa > probe.list
     cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
 
     cat << '_EOF_' > template
 #LOOP
 blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 genome.list probe.list template jobList
     mkdir psl
     para create jobList
     # Do the job with usual para try/check/push/time etc.
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:       9093s     151.55m     2.53h    0.11d  0.000 y
 # IO & Wait Time:                   217s       3.62m     0.06h    0.00d  0.000 y
 # Average job time:                 266s       4.43m     0.07h    0.00d
 # Longest finished job:             602s      10.03m     0.17h    0.01d
 # Submission to last job:           602s      10.03m     0.17h    0.01d
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyMOE430.psl
     pslSort dirs raw.psl tmp psl
 
     # only use alignments that cover 30% of sequence and have at least
     # 95% identity in aligned region. 
     # low minCover as a lot of n's in these sequences
     pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
 	raw.psl affyMOE430.psl /dev/null
 
     # Load alignments and sequences into database
     ssh hgwdev
     cd /cluster/data/mm9/bed/affyMOE430
     # shorten names in psl file
     sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
     mv affyMOE430.psl.bak affyMOE430.psl
 
     # load track into database
 
     hgLoadPsl mm9 affyMOE430.psl
  
     # Add consensus sequences for MOE430
     # Copy sequences to gbdb if they are not there already
 #    mkdir -p /gbdb/hgFixed/affyProbes
 #    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
 #       /gbdb/hgFixed/affyProbes
 
     hgLoadSeq -abbr=MOE430 mm9 /gbdb/hgFixed/affyProbes/MOE430_all.fa
     
     # Clean up
     rm batch.bak raw.psl 
 
     #	and then, during the gene sorter build, it does:
     hgMapToGene mm9 affyMOE430 knownGene knownToMOE430
     hgMapToGene mm9 affyMOE430 -prefix=A: knownGene knownToMOE430A
 
 #########################################################################
 #  creating UCSC genes track (DONE - 2007-08-31 - 2007-09-25 - Hiram)
     #  working on the script mm9.ucscGenes10.csh in src/hg/makeDb/doc
     #	The tracks created above were done as they were encountered
     #	in working through that script.   Worked through that script
     #	approximately one kluster run at a time, using a large if (1 == 0)
     #	statement to skip over business that had been successfully completed.
     #	After it reached the point where it had begun to load the tables
     #	into the tempDb and started to fail at the missing tables affyGnf1m
     #	the successfully loaded tables in tempDb were moved to mm9 and
     #	the track began to function.  Then, working through the affy
     #	alignments above, and completing the loading of the knownTo tables
     #	for the gene sorter as they were completed.  Now continuing below
     #	with the rest of the steps manually since it is not necessary to
     #	use the tempDb and its /gbdb/ directory.  Everything is now taking
     #	place in the mm9 database.
 
     # example script to transfer appropriate tables from one DB to another
     # while saving the first set too
 
 # For every table in mm9UCGenes that is not in the SKIP list:
 #   - report the row counts in mm9 and in mm9UCGenes and their difference
 #   - move the existing mm9 table into mm9UCGenes as <table>_try0
 #   - move the mm9UCGenes table into mm9 under its own name
 # SKIP must be ONE extended regex with no embedded newline: grep treats a
 # newline inside a pattern as a pattern separator, so a wrapped pattern
 # starting with "s|..." would make -v discard every table containing "s".
 # It is assembled in two pieces only to keep the lines short.
 SKIP='allenBrainAli|allenBrainUrl|extFile|knownToEnsembl|vgProbes|vgAllProbes'
 SKIP="${SKIP}"'|^seq$|trackDb|history|chromInfo'
 hgsql -N -e "show tables;" mm9UCGenes \
     | grep -E -v "${SKIP}" | while read -r T
 do
     echo -n "=== table ${T}: "
     C1=$(hgsql -N -e "select count(*) from ${T}" mm9)
     C2=$(hgsql -N -e "select count(*) from ${T}" mm9UCGenes 2> /dev/null)
     # D = rows in mm9UCGenes minus rows in mm9
     D=$(echo "${C1}" "${C2}" | awk '{printf "%d", $2-$1}')
     echo "${C1} - ${C2} - ${D}"
     echo "rename table mm9.${T} to mm9UCGenes.${T}_try0"
     echo "rename table mm9UCGenes.${T} to mm9.${T}"
     hgsql -e "rename table mm9.${T} to mm9UCGenes.${T}_try0" mysql
     hgsql -e "rename table mm9UCGenes.${T} to mm9.${T}" mysql
 done
     #	The egrep -v knocks out tables that are redundant, should be the same
     #	in both DBs
 
 #########################################################################
 # running the blastP operation to the other genomes for the gene sorter
 #	(DONE - 2007-09-10 - Hiram)
     mkdir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
     cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
 
     cat << '_EOF_' > config.ra
 # Latest human vs. other Gene Sorter orgs:
 # mouse, rat, zebrafish, worm, yeast, fly
 
 targetGenesetPrefix known
 targetDb mm9
 queryDbs hg18 rn4 danRer4 dm2 ce4 sacCer1
 
 mm9Fa /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa
 hg18Fa /cluster/data/hg18/bed/blastp/known.faa
 rn4Fa /cluster/data/rn4/bed/blastp/known.faa
 danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa
 dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
 ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa
 sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
 
 buildDir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
 scratchDir /san/sanvol1/scratch/mm9/jkgHgNearBlastp
 '_EOF_'
     # << happy emacs
     #	takes about an hour
     time nice -n +19 doHgNearBlastp.pl config.ra > do.log 2>&1 &
 
 #########################################################################
 # fixup the blastP tables to remove non-syntenic hits
 #	(DONE - 2007-09-11 - Hiram)
 #  This was all re-done 2007-09-25, see below:
 ######  Update blast tabs after UCSC genes rebuild (DONE - 2007-09-25 - Hiram)
 # Remove non-syntenic hits for human and rat
 # Takes a few minutes
     cd /cluster/data/mm9/bed/ucsc.10
     synBlastp.csh mm9 rn4
 # old number of unique query values: 31610
 # old number of unique target values 7072
 # new number of unique query values: 13973
 # new number of unique target values 6888
     synBlastp.csh mm9 hg18
 # old number of unique query values: 38136
 # old number of unique target values 17214
 # new number of unique query values: 0
 # new number of unique target values 0
 
     # Make reciprocal best subset for the blastp pairs that are too
     # far for synteny to help
     cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
     # Us vs. others
 foreach otherDb (danRer4 dm2 ce4 sacCer1)
     set aToB = run.mm9.$otherDb
     set bToA = run.$otherDb.mm9
     cat $aToB/out/*.tab > $aToB/all.tab
     cat $bToA/out/*.tab > $bToA/all.tab
     blastRecipBest $aToB/all.tab $bToA/all.tab \
 	$aToB/recipBest.tab $bToA/recipBest.tab
     hgLoadBlastTab mm9 drBlastTab $aToB/recipBest.tab
     hgLoadBlastTab $otherDb tfBlastTab $bToA/recipBest.tab
 end
     # Clean up
     cat run.mm9.mm9/out/*.tab | gzip -c > run.mm9.mm9/all.tab.gz
     cat run.mm9.hg18/out/*.tab | gzip -c > run.mm9.hg18/all.tab.gz
     cat run.hg18.mm9/out/*.tab | gzip -c > run.hg18.mm9/all.tab.gz
     cat run.mm9.rn4/out/*.tab | gzip -c > run.mm9.rn4/all.tab.gz
     cat run.rn4.mm9/out/*.tab | gzip -c > run.rn4.mm9/all.tab.gz
     gzip run.*/all.tab
     rm -r run.*/out
 
 #########################################################################
 #  Update BLASTTAB blast tabs after UCSC genes rebuild
 ##	(DONE - 2007-09-25 - Hiram)
     # log in to hgwdev, then create and enter the dated work directory
     ssh hgwdev
     mkdir -p /cluster/data/mm9/bed/hgNearBlastp/070924
     cd /cluster/data/mm9/bed/hgNearBlastp/070924
     # Get the proteins used by all hgNear organisms:
     pepPredToFa hg18 knownGenePep hg18.known.faa
     pepPredToFa mm9 knownGenePep mm9.known.faa
     pepPredToFa rn4 knownGenePep rn4.known.faa
     pepPredToFa danRer4 ensPep danRer4.ensPep.faa
     pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
     pepPredToFa ce4 sangerPep ce4.sangerPep.faa
     pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa
 
     cat << '_EOF_' > config.ra
 # Latest human vs. other Gene Sorter orgs:
 # mouse, rat, zebrafish, worm, yeast, fly
 
 targetGenesetPrefix known
 targetDb mm9
 queryDbs hg18 rn4 danRer4 dm3 ce4 sacCer1
 recipBest         danRer4 dm3 ce4 sacCer1
 
 mm9Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/mm9.known.faa
 hg18Fa    /cluster/data/mm9/bed/hgNearBlastp/070924/hg18.known.faa
 rn4Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/rn4.known.faa
 danRer4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/danRer4.ensPep.faa
 dm3Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/dm3.flyBasePep.faa
 ce4Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/ce4.sangerPep.faa
 sacCer1Fa /cluster/data/mm9/bed/hgNearBlastp/070924/sacCer1.sgdPep.faa
 
 buildDir /cluster/data/mm9/bed/hgNearBlastp/070924
 scratchDir /san/sanvol1/scratch/mm9HgNearBlastp
 '_EOF_'
     # << happy emacs
 
     # Run with -noLoad so we can eyeball files, manually load mm9 tables now,
     # and after release of mm9 Gene Sorter on the RR, overload other 
     # databases' mmBlastTab tables.
     time nice -n +19 doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 &
     tail -f do.log
 
 Follow instructions at end of do.log, piecewise:
   - first execute all of the run.mm9.* load scripts
   - then execute the run.hg18.mm9 and run.rn4.mm9 scripts
   - then run Galt's script (this is why we load hg18 and rn4 early):
     synBlastp.csh mm9 hg18
     synBlastp.csh mm9 rn4
   -- The following was performed 2007-10-11
   - After mm9 hgNear/Gene Sorter is enabled on the RR:
     - run the remaining run.*.mm9 load scripts
     - then modify each $queryDb's hgGeneData/$org/$queryDb/otherOrg.ra
       to specify mm9 for mouse
     - then do a push request for $queryDbs.mmBlastTab and hgGeneData
 
 #########################################################################
 # MAKE FOLDUTR TABLES  (DONE - 2007-09-11 - Hiram)
 # First set up directory structure and extract UTR sequence on hgwdev
 #	Beware running this on pk since the program RNAfold which is used
 #	during this process is only found on /cluster/bin/i386/
 #	And there is no way for this cluster setup to verify success
 #	of that program since it is hidden away in rnaFoldBig
 #	Need to fix rnaFoldBig to recognize RNAfold missing ...
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/ucsc.10/rnaStruct
     cd /cluster/data/mm9/bed/ucsc.10/rnaStruct
     mkdir -p utr3/split utr5/split utr3/fold utr5/fold
     utrFa mm9 knownGene utr3 utr3/utr.fa
     utrFa mm9 knownGene utr5 utr5/utr.fa
 
     # Split up files and make files that define job.
     faSplit sequence utr3/utr.fa 10000 utr3/split/s
     faSplit sequence utr5/utr.fa 10000 utr5/split/s
     ls -1 utr3/split > utr3/in.lst
     ls -1 utr5/split > utr5/in.lst
     cd utr3
     cat > template << '_EOF_'
 #LOOP
 rnaFoldBig split/$(path1) fold
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 in.lst single template jobList
     cp -p template ../utr5
     cd ../utr5
     gensub2 in.lst single template jobList
 
     ssh kk
     cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr3
     para make jobList
 # Completed: 9750 of 9750 jobs
 # CPU time in finished jobs:     377924s    6298.73m   104.98h    4.37d  0.012 y
 # IO & Wait Time:                 38985s     649.75m    10.83h    0.45d  0.001 y
 # Average job time:                  43s       0.71m     0.01h    0.00d
 # Longest finished job:            3432s      57.20m     0.95h    0.04d
 # Submission to last job:         11280s     188.00m     3.13h    0.13d
     cd ../utr5
     para make jobList
 # Completed: 9253 of 9253 jobs
 # CPU time in finished jobs:      44949s     749.16m    12.49h    0.52d  0.001 y
 # IO & Wait Time:                 51547s     859.11m    14.32h    0.60d  0.002 y
 # Average job time:                  10s       0.17m     0.00h    0.00d
 # Longest finished job:            1100s      18.33m     0.31h    0.01d
 # Submission to last job:          1398s      23.30m     0.39h    0.02d
 
     # Load database
     ssh hgwdev
     cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr5
     hgLoadRnaFold mm9 foldUtr5 fold
     #	Parsed 35796 files
     cd ../utr3
     hgLoadRnaFold -warnEmpty mm9 foldUtr3 fold
     #	only one is empty: uc009gyo.1
     # Seems to be a problem in
     # RNAfold, so not easy for us to fix. Consequence is not too bad, just a
     # few 3' UTRs will be missing annotation.  (in this case, only one)
 
     # Clean up
     tar cvzf ./fold.tgz ./fold
     rm -r split fold err batch.bak
     cd ../utr5
     tar cvzf ./fold.tgz ./fold
     rm -r split fold err batch.bak
 #########################################################################
 # Make pfam run.  Actual cluster run is about 6 hours.
 #	(DONE - 2007-09-12 - Hiram)
 # First get pfam global HMMs into /san/sanvol1/pfam somehow.
     ssh pk
     mkdir /san/sanvol1/scratch/mm9/ucscGenes
     cd /san/sanvol1/scratch/mm9/ucscGenes
     mkdir splitProt
     faSplit sequence /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa \
 	10000 splitProt/
     mkdir pfam
     cd pfam
     mkdir out
     ls -1 ../splitProt > gene.list
     cat << '_EOF_' > doPfam
 #!/bin/csh -ef
 /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/pfam/Pfam_fs $1 \
 	> /scratch/tmp/mm9.$2
 mv /scratch/tmp/mm9.$2 $3
 '_EOF_'
     # << happy emacs
     chmod a+x doPfam
     cat << '_EOF_' > template
 #LOOP
 doPfam ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 gene.list single template jobList
     para create jobList
     para try ... check ... push ... etc... time
     #	after some kluster difficulties
 Completed: 9666 of 9666 jobs
 CPU time in finished jobs:    3535078s   58917.96m   981.97h   40.92d  0.112 y
 IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 Average job time:                 287s       4.78m     0.08h    0.00d
 Longest running job:                0s       0.00m     0.00h    0.00d
 Longest finished job:            3430s      57.17m     0.95h    0.04d
 Submission to last job:         79051s    1317.52m    21.96h    0.91d
 
     # Make up pfamDesc.tab by converting pfam to a ra file first
     cat << '_EOF_' > makePfamRa.awk
 /^NAME/ {print}
 /^ACC/ {print}
 /^DESC/ {print; printf("\n");}
 '_EOF_'
     # << happy emacs
 
     awk -f makePfamRa.awk  /cluster/store12/pfam/Pfam_fs > pfamDesc.ra
     raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | \
    awk -F '\t' '{
 printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3);
 }' > pfamDesc.tab
 
     # Convert output to tab-separated file. 
     cd /cluster/data/mm9/bed/ucsc.10
     catDir /san/sanvol1/scratch/mm9/ucscGenes/pfam/out \
 	| hmmPfamToTab -eValCol stdin ucscPfam.tab
 
     # Convert output to knownToPfam table
     awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' \
 	/san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab > sub.foo
     cut -f 1,4 ucscPfam.tab | subColumn 2 stdin sub.foo knownToPfam.tab
     hgLoadSqlTab mm9 knownToPfam ~/kent/src/hg/lib/knownTo.sql \
 	knownToPfam.tab
     cut -f 1-4 ucscPfam.tab > load.ucscPfam.tab
     hgLoadSqlTab mm9 ucscPfam ~/kent/src/hg/lib/ucscPfam.sql load.ucscPfam.tab
     cp -p /san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab .
     hgLoadSqlTab mm9 pfamDesc ~/kent/src/hg/lib/pfamDesc.sql pfamDesc.tab
 
 #########################################################################
 # Do scop run. Takes about 3.5 hours (DONE - 2007-09-12 - Hiram)
 # First get the scop HMMs (scop.hmm) into /san/sanvol1/scop somehow.
     ssh pk
     mkdir /san/sanvol1/scratch/mm9/ucscGenes/scop
     cd /san/sanvol1/scratch/mm9/ucscGenes/scop
     mkdir out
     ls -1 ../splitProt > gene.list
     cat << '_EOF_' > doScop
 #!/bin/tcsh -ef
 /san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/scop/scop.hmm $1 \
 	> /scratch/tmp/mm9.$2
 mv /scratch/tmp/mm9.$2 $3
 '_EOF_'
     chmod a+x doScop
     cat << '_EOF_' > template
 #LOOP
 doScop ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
 #ENDLOOP
 '_EOF_'
     gensub2 gene.list single template jobList
     para create jobList
     para try ... check ... push ... etc... time
 # Completed: 9666 of 9666 jobs
 # CPU time in finished jobs:    3532425s   58873.76m   981.23h   40.88d  0.112 y
 # IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
 # Average job time:                 347s       5.78m     0.10h    0.00d
 # Longest finished job:            6512s     108.53m     1.81h    0.08d
 # Submission to last job:         12348s     205.80m     3.43h    0.14d
 
 
     # Convert scop output to tab-separated files
     ssh hgwdev
     cd /cluster/data/mm9/bed/ucsc.10
     catDir /san/sanvol1/scratch/mm9/ucscGenes/scop/out | \
 	hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab
     scopCollapse scopPlusScore.tab /cluster/store12/scop/model.tab \
 	ucscScop.tab scopDesc.tab knownToSuper.tab
     hgLoadSqlTab mm9 knownToSuper ~/kent/src/hg/lib/knownToSuper.sql \
 	knownToSuper.tab
 
     hgLoadSqlTab mm9 ucscScop ~/kent/src/hg/lib/ucscScop.sql ucscScop.tab
     hgLoadSqlTab mm9 scopDesc ~/kent/src/hg/lib/scopDesc.sql scopDesc.tab
 
     # XXX - ccds is not yet available for Mm9 according to Mark
     # Regenerate ccdsKgMap table
     # /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm9 -loadDb \
     #	mm9.ccdsGene knownGene ccdsKgMap
 
     # Map old to new mapping - maybe next time, this is first genes on mm9
     # hgsql mm9 -N -e 'select * from knownGene' > knownGene_1.gp
     # genePredToBed knownGene_1.gp >knownGene_1.bed
     # cat refSeq/*.bed mrna/*.bed | txGeneExplainUpdate1 knownGene_1.bed \
     #	ucscGenes.bed stdin abWalk.bed kg2ToKg3.bed
     # hgLoadSqlTab $tempDb kg1ToKg2 ~/kent/src/hg/lib/kg2ToKg3.sql kg2ToKg3.bed
 
     # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables.
 
     hgsql mm9 -N -e \
     'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
          
     hgsql mm9 -N -e \
     'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
     >>j.tmp
     sort -u j.tmp > kgSpAlias.tab
     rm j.tmp
 
     hgLoadSqlTab mm9 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql ./kgSpAlias.tab
 
 #########################################################################
 # Building PROTEOME BROWSER TABLES (DONE - 2007-09-12 - Hiram)
 
 # These are instructions for building tables 
 # needed for the Proteome Browser. 
  
 # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap2 table
 # ARE REBUILT.  
 # This build is based on proteins DBs dated 070202.
 
 # Create the working directory
 
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/ucsc.10/pb
     cd /cluster/data/mm9/bed/ucsc.10/pb
 
     # Build the pepMwAa table
 
     hgsql proteins070202 -N -e \
 "select info.acc, molWeight, aaSize from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab
 
     hgLoadSqlTab mm9 pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab
 
     # Build the pepPi table
 
     # List the accessions of all taxon 10090 proteins in sp070202.
     # -N suppresses the column-name header so the literal word "acc" does
     # not end up as the first entry of protAcc.list.
     hgsql proteins070202 -N -e \
     "select info.acc from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.list
 
     # Append the knownGene proteinIDs that contain a "-"
     hgsql mm9 -N \
 -e 'select proteinID from knownGene where proteinID like "%-%"' \
 	| sort -u >> protAcc.list
 
     # Run pbCalPi on the accession list and load the result as mm9.pepPi
     pbCalPi protAcc.list sp070202 pepPi.tab
     hgLoadSqlTab mm9 pepPi ~/kent/src/hg/lib/pepPi.sql ./pepPi.tab
 
     # Calculate and load pep distributions
 
     pbCalDist sp070202 proteins070202 10090 mm9 
     hgLoadSqlTab mm9 pepExonCntDist ~/kent/src/hg/lib/pepExonCntDist.sql \
 	./pepExonCntDist.tab
     hgLoadSqlTab mm9 pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql \
 	./pepCCntDist.tab
     hgLoadSqlTab mm9 pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql \
 	./pepHydroDist.tab
     hgLoadSqlTab mm9 pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql \
 	./pepMolWtDist.tab
     hgLoadSqlTab mm9 pepResDist ~/kent/src/hg/lib/pepResDist.sql \
 	./pepResDist.tab
     hgLoadSqlTab mm9 pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql \
 	./pepIPCntDist.tab
     hgLoadSqlTab mm9 pepPiDist ~/kent/src/hg/lib/pepPiDist.sql ./pepPiDist.tab
 
 
 # Calculate frequency distributions
 
     pbCalResStd sp070202 10090 mm9
 
 # Create pbAnomLimit and pbResAvgStd tables
 
     hgLoadSqlTab mm9 pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql \
 	./pbAnomLimit.tab
     hgLoadSqlTab mm9 pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql \
 	./pbResAvgStd.tab
 
     hgsql -N -e "select * from pbStamp;" mm8 > pbStamp.tab
     hgLoadSqlTab mm9 pbStamp ~/kent/src/hg/lib/pbStamp.sql \
 	./pbStamp.tab
 
     #	Turn on protein and gene sorter
     hgsql -e 'update dbDb set hgNearOk=1,hgPbOk=1 where name="mm9";' \
 	hgcentraltest
 
 # Add mm9 to gdbPdb, pointing to proteins070202
 
     mysql> insert into gdbPdb values('mm9','proteins070202');
 
 ############################################################################
 # BUILD KNOWN GENE LIST FOR GOOGLE.   (DONE - 2007-10-03 - Hiram)
 
     cd /cluster/data/mm9/bed
     rm -rf knownGeneList/mm9
 
     # Run hgKnownGeneList to generate the tree of HTML pages
     # under ./knownGeneList/mm9
 
     hgKnownGeneList mm9
 
     # copy over to /usr/local/apache/htdocs
 
     rm -rf /usr/local/apache/htdocs/knownGeneList/mm9
     rsync -a --progress ./knownGeneList/mm9/ \
 	/usr/local/apache/htdocs/knownGeneList/mm9/
     #	if this is a new listing, add it to the top level
     #	knownGeneLists.html file
 
 ############################################################################
 # SGP GENES (DONE - 2007-10-01 - Hiram)
     ssh kkstore06
     mkdir  /cluster/data/mm9/bed/sgp
     cd  /cluster/data/mm9/bed/sgp
 
     #   They don't do chrM  (we could just let that one fail ...)
     for C in `awk '{print $1}' /cluster/data/mm9/chrom.sizes | grep -v chrM`
     do
         wget --timestamping \
 "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/SGP/humangp200603/${C}.gtf" \
         -O "${C}.gtf"
     done
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/sgp
     ldHgGene -gtf -genePredExt mm9 sgpGene chr*.gtf
     #	Read 35983 transcripts in 290486 lines in 34 files
     #	35983 groups 32 seqs 1 sources 3 feature types
     #	35983 gene predictions
 
     featureBits mm9 -enrichment refGene:CDS sgpGene
     #	refGene:CDS 1.165%, sgpGene 1.439%, both 1.005%, cover 86.28%,
     #	enrich 59.96x
     featureBits mm8 -enrichment refGene:CDS sgpGene
     #	refGene:CDS 1.186%, sgpGene 1.455%, both 1.025%, cover 86.47%,
     #	enrich 59.42x
     featureBits mm9 -enrichment knownGene:CDS sgpGene
     #	knownGene:CDS 1.278%, sgpGene 1.439%, both 1.080%, cover 84.53%,
     #	enrich 58.74x
     featureBits mm8 -enrichment knownGene:CDS sgpGene
     #	knownGene:CDS 1.109%, sgpGene 1.455%, both 0.931%, cover 83.98%,
     #	enrich 57.71x
 
 #####################################################################
 # LOAD GENEID GENES (DONE - 2007-10-01 - Hiram)
     ssh kkstore06
     mkdir -p /cluster/data/mm9/bed/geneid/download
     cd /cluster/data/mm9/bed/geneid/download
 
     bash
     awk '{print $1}' ../../../chrom.sizes | while read C
     do
       echo $C
       wget --timestamping \
 "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.gtf" \
 	-O ${C}.gtf
       wget --timestamping \
 "http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.prot" \
 	-O ${C}.prot
     done
     exit
 
     # Add missing .1 to protein id's
 
     foreach f (*.prot)
       perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
     end
     ssh hgwdev
     cd /cluster/data/mm9/bed/geneid
     ldHgGene -genePredExt -gtf mm9 geneid download/*.gtf
 # Read 36708 transcripts in 287399 lines in 35 files
 # 36708 groups 34 seqs 1 sources 3 feature types
 # 36708 gene predictions
 
     #	the chr16_random file is empty, do not attempt to use it
     hgPepPred mm9 generic geneidPep \
 	`ls download/*-fixed.prot | grep -v chr16_random`
     featureBits mm9 -enrichment refGene geneid
 # refGene 1.975%, geneid 1.590%, both 0.956%, cover 48.39%, enrich 30.44x
     featureBits mm8 -enrichment refGene geneid
 # refGene 2.010%, geneid 1.592%, both 0.974%, cover 48.44%, enrich 30.43x
     featureBits mm7 -enrichment refGene geneid
 # refGene 2.002%, geneid 1.579%, both 0.952%, cover 47.57%, enrich 30.12x
 
     featureBits mm9 -enrichment knownGene geneid
 # knownGene 2.686%, geneid 1.590%, both 1.047%, cover 38.97%, enrich 24.52x
     featureBits mm8 -enrichment knownGene geneid
 # knownGene 2.130%, geneid 1.592%, both 0.900%, cover 42.23%, enrich 26.53x
     featureBits mm7 -enrichment knownGene geneid
 # knownGene 2.058%, geneid 1.579%, both 0.859%, cover 41.72%, enrich 26.42x
 
 #########################################################################
 # BLASTZ/CHAIN/NET Orangutan ponAbe2 (DONE - 2007-09-21 - Hiram)
     ssh kkstore02
     #	use a screen to control this job
     screen
     mkdir /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
     cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
 
     cat << '_EOF_' > DEF
 # mouse vs orangutan
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=0
 
 # QUERY: Orangutan ponAbe2
 SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
 SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=10000
 
 BASE=/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-stop=load -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
     #	real    62m34.156s
     #	some pk kluster difficulties, fixup and complete manually
 # Completed: 104880 of 104880 jobs
 # CPU time in finished jobs:    7142978s  119049.64m  1984.16h   82.67d  0.227 y
 # IO & Wait Time:                556393s    9273.21m   154.55h    6.44d  0.018 y
 # Average job time:                  73s       1.22m     0.02h    0.00d
 # Longest finished job:             507s       8.45m     0.14h    0.01d
 # Submission to last job:         65973s    1099.55m    18.33h    0.76d
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=cat -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
     #	real    166m20.442s
     cat fb.mm9.chainPonAbe2Link.txt
     #	914561309 bases of 2620346127 (34.902%) in intersection
 
     #	And, for the swap
     mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
     cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
 	-chainMinScore=3000 -swap -chainLinearGap=medium \
 	-bigClusterHub=pk > swap.log 2>&1 &
     #	real    102m23.209s
     cat fb.ponAbe2.chainMm9Link.txt
     #	948458190 bases of 3093572278 (30.659%) in intersection
 
     # create the syntenic maf nets:
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
     time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
 	-continue=syntenicNet -syntenicNet -chainMinScore=3000 \
 	-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
     #	real    22m16.544s
 
 ########################################################################
 # BLASTZ/CHAIN/NET Frog X. tropicalis xenTro2 (DONE - 2007-09-23 - Hiram)
     ssh kkstore04
     screen # use screen to manage this job
     # XXX note for next time, missing the TMPDIR in the DEF file
     mkdir /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
     cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
 
     cat << '_EOF_' > DEF
 # Mouse (mm9) vs frog (xenTro2)
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=0
 
 # QUERY: Frog xenTro2 - single chunk big enough to run two of the
 #               largest scaffolds in one job
 SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
 SEQ2_LEN=/cluster/data/xenTro2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=50
 SEQ2_LAP=10000
 
 BASE=/cluster/data/mm9/bed/blastzXenTro2.2007-09-19
 '_EOF_'
     # << emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    1050m55.259s
     # after kk difficulties, finishing the first kluster run manually
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
 # Completed: 126539 of 126540 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:   15750656s  262510.93m  4375.18h  182.30d  0.499 y
 # IO & Wait Time:                843281s   14054.69m   234.24h    9.76d  0.027 y
 # Average job time:                 131s       2.19m     0.04h    0.00d
 # Longest finished job:            2039s      33.98m     0.57h    0.02d
 # Submission to last job:         79275s    1321.25m    22.02h    0.92d
 
     #	A single job kept having trouble, finished it on kolossus:
     ssh kolossus
     cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19/run.blastz
 time nice -n +19 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
 /scratch/data/mm9/mm9.2bit:chr2:80000000-90000000 qParts/part008.lst ../DEF \
 ../psl/mm9.2bit:chr2:80000000-90000000/mm9.2bit:chr2:80000000-90000000_part008.lst.psl
     #	continuing after that
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-continue=cat -bigClusterHub=kk -chainMinScore=5000 \
 	-chainLinearGap=loose `pwd`/DEF > cat.out 2>&1 &
     #	real    62m17.627s
     cat fb.mm9.chainXenTro2Link.txt
     #	82054987 bases of 2620346127 (3.131%) in intersection
 
     #	Then to swap over to xenTro2
     mkdir /cluster/data/xenTro2/bed/blastz.mm9.swap
     cd /cluster/data/xenTro2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-swap -bigClusterHub=kk -chainMinScore=5000 \
 	/cluster/data/mm9/bed/blastzXenTro2.2007-09-19/DEF \
 	-chainLinearGap=loose > swap.out 2>&1 &
     #	real    47m53.428s
 
     ssh hgwdev
     #	measure chain coverage in both directions; the forward run lives in
     #	the blastzXenTro2.2007-09-19 directory created at the top of this
     #	section
     cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
     time nice -n +19 featureBits mm9 chainXenTro2Link \
 	> fb.mm9.chainXenTro2Link 2>&1 &
     #	68050843 bases of 2567283971 (2.651%) in intersection
     #	NOTE(review): this count does not use the mm9 total (2620346127)
     #	reported by fb.mm9.chainXenTro2Link.txt earlier in this section;
     #	it looks carried over from a previous build - confirm
     cd /cluster/data/xenTro2/bed/blastz.mm9.swap
     time nice -n +19 featureBits xenTro2 chainMm9Link \
 	> fb.xenTro2.chainMm9Link 2>&1
     #	72840135 bases of 1359412157 (5.358%) in intersection
     #	NOTE(review): this figure was recorded while the table name here
     #	read chainMm8Link; re-check it against chainMm9Link
 
 #########################################################################
 ## BLASTZ Lizard anoCar1 - (DONE - 2007-09-21 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
     cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
 
     cat << '_EOF_' > DEF
 # Mouse (mm9) vs lizard (anoCar1)
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=0
 
 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
 SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
 SEQ2_LEN=/cluster/data/anoCar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=10000
 
 BASE=/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-qRepeats=windowmaskerSdust \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    911m49.918s
     # after kk difficulties, finishing the first kluster run manually
 # Completed: 86355 of 86355 jobs
 # CPU time in finished jobs:   11171051s  186184.18m  3103.07h  129.29d  0.354 y
 # IO & Wait Time:                662082s   11034.70m   183.91h    7.66d  0.021 y
 # Average job time:                 137s       2.28m     0.04h    0.00d
 # Longest finished job:            1467s      24.45m     0.41h    0.02d
 # Submission to last job:         62938s    1048.97m    17.48h    0.73d
     #	continuing
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	DEF -chainMinScore=5000 \
 	-continue=cat -qRepeats=windowmaskerSdust \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
     #	real    31m44.652s
     cat  fb.mm9.chainAnoCar1Link.txt
     #	89239796 bases of 2620346127 (3.406%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
     cd /cluster/data/anoCar1/bed/blastz.mm9.swap
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
 	-swap -qRepeats=windowmaskerSdust \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     #	real    29m12.291s
     cat fb.anoCar1.chainMm9Link.txt
     #	85923556 bases of 1741478929 (4.934%) in intersection
 
 #########################################################################
 # BLASTZ Chicken galGal3 (DONE - 2007-09-25 - Hiram)
     ssh kkstore03
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzGalGal3.2007-09-21
     cd /cluster/data/mm9/bed/blastzGalGal3.2007-09-21
 
     # This partitioning is too large to run on kk, must run this on pk
     #	or change the partitioning
 
     cat << '_EOF_' > DEF
 # mouse vs chicken
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_SMSK=/scratch/data/mm9/notInOthers
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
 SEQ2_DIR=/scratch/hg/galGal3/nib
 SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzGalGal3.2007-09-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    587m53.468s
 # Completed: 16680 of 17168 jobs
 # Crashed: 488 jobs
 # CPU time in finished jobs:    7758569s  129309.48m  2155.16h   89.80d  0.246 y
 # IO & Wait Time:                190128s    3168.80m    52.81h    2.20d  0.006 y
 # Average job time:                 477s       7.94m     0.13h    0.01d
 # Longest finished job:            6501s     108.35m     1.81h    0.08d
 # Submission to last job:        271554s    4525.90m    75.43h    3.14d
     #	the kk cluster could not complete some of these jobs.  A recovery job
     #	list was created from the remaining jobs and completed on pk
 # Completed: 488 of 488 jobs
 # CPU time in finished jobs:    1226144s   20435.73m   340.60h   14.19d  0.039 y
 # IO & Wait Time:                  6875s     114.58m     1.91h    0.08d  0.000 y
 # Average job time:                2527s      42.11m     0.70h    0.03d
 # Longest finished job:            3872s      64.53m     1.08h    0.04d
 # Submission to last job:         11739s     195.65m     3.26h    0.14d
     #	continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-continue=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
 	> cat.log 2>&1 &
     #	real    18m35.814s
     cat fb.mm9.chainGalGal3Link.txt
     #	97711788 bases of 2620346127 (3.729%) in intersection
 
     #	and the swap
     mkdir /cluster/data/galGal3/bed/blastz.mm9.swap
     cd /cluster/data/galGal3/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
 	/cluster/data/mm9/bed/blastzGalGal3.2007-09-21/DEF \
 	-swap -chainLinearGap=loose -bigClusterHub=pk  > swap.log 2>&1 &
     #	real    12m54.737s
     cat fb.galGal3.chainMm9Link.txt
     #	84990797 bases of 1042591351 (8.152%) in intersection
 
 #########################################################################
 # BLASTZ Platypus ornAna1 - (DONE - 2007-09-21 - 2007-09-25 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
     cd /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
 
     cat << '_EOF_' > DEF
 # mouse vs. platypus
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=0
 
 # QUERY: ornAna1
 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
 SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
     #	real    912m18.732s
     cat fb.mm9.chainOrnAna1Link.txt
     #	141953739 bases of 2620346127 (5.417%) in intersection
 
     #	and the swap
     mkdir /cluster/data/ornAna1/bed/blastz.mm9.swap
     cd /cluster/data/ornAna1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
 	/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21/DEF \
 	-swap -chainLinearGap=loose -bigClusterHub=kk > swap.log 2>&1 &
     #	real    123m16.632s
     cat fb.ornAna1.chainMm9Link.txt
     #	135570580 bases of 1842236818 (7.359%) in intersection
 
 #########################################################################
 # Blastz Chimp panTro2 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/mm9/bed/blastzPanTro2.2007-09-24
     cd /cluster/data/mm9/bed/blastzPanTro2.2007-09-24
 
     cat << '_EOF_' > DEF
 # Mouse vs Chimp
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_SMSK=/scratch/data/mm9/notInOthers
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chimp PanTro2
 SEQ2_DIR=/scratch/hg/panTro2/nib
 SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
 SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzPanTro2.2007-09-24
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	DEF > blastz.out 2>&1 &
     #	real    701m23.446s
     cat fb.mm9.chainPanTro2Link.txt
     #	987180081 bases of 2620346127 (37.674%) in intersection
 
     #	and the swap
     mkdir /cluster/data/panTro2/bed/blastz.mm9.swap
     cd /cluster/data/panTro2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzPanTro2.2007-09-24/DEF \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap > swap.log 2>&1 &
     #	real    87m25.448s
     cat fb.panTro2.chainMm9Link.txt
     #	997050630 bases of 2909485072 (34.269%) in intersection
 
     #	create syntenic maf nets:
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-syntenicNet -continue=syntenicNet DEF > syntenicNet.out 2>&1 &
     #	real 25m13.118s
 
 #########################################################################
 # Blastz Horse equCab1 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
     ssh kkstore05
     mkdir /cluster/data/mm9/bed/blastzEquCab1.2007-09-24
     cd /cluster/data/mm9/bed/blastzEquCab1.2007-09-24
 
     cat << '_EOF_' > DEF
 # Mouse vs Horse
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Horse EquCab1
 SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
 SEQ2_LEN=/cluster/data/equCab1/chrom.sizes       
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzEquCab1.2007-09-24
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	DEF > blastz.out 2>&1 &
     #	real    1582m34.597s
     cat fb.mm9.chainEquCab1Link.txt
     #	911418189 bases of 2620346127 (34.782%) in intersection
 
     #	and the swap
     mkdir /cluster/data/equCab1/bed/blastz.mm9.swap
     cd /cluster/data/equCab1/bed/blastz.mm9.swap
 
     #	swap run: reuses the mm9 DEF with -swap and the same score/gap
     #	options as the forward run above; all output goes to swap.out
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzEquCab1.2007-09-24/DEF \
 	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	> swap.out 2>&1 &
     #	real ~110m
      cat fb.equCab1.chainMm9Link.txt
     #	901367656 bases of 2421923695 (37.217%) in intersection
 
     #	create the syntenic maf nets
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=syntenicNet -syntenicNet DEF > syntenicNet.out 2>&1 &
     #	real 29m40.546s
 
 #########################################################################
 # Blastz Cow bosTau3 (DONE - 2007-09-25 - Hiram)
     ssh kkstore05
     screen # use a screen to control this job
     mkdir /cluster/data/mm9/bed/blastzBosTau3.2007-09-25
     cd /cluster/data/mm9/bed/blastzBosTau3.2007-09-25
 
     cat << '_EOF_' > DEF
 # Mouse vs Cow
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Cow bosTau3
 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
 SEQ2_LEN=/cluster/data/bosTau3/chrom.sizes
 SEQ2_LIMIT=100
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzBosTau3.2007-09-25
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
 	DEF -bigClusterHub=pk -chainLinearGap=medium > do.log 2>&1 &
     #	real    733m40.065s
     cat fb.mm9.chainBosTau3Link.txt
     #	690515959 bases of 2620346127 (26.352%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/bosTau3/bed/blastz.mm9.swap
     cd /cluster/data/bosTau3/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
 	/cluster/data/mm9/bed/blastzBosTau3.2007-09-25/DEF \
 	-swap -bigClusterHub=pk -chainLinearGap=medium > swap.log 2>&1 &
     #	real    100m20.707s
     cat fb.bosTau3.chainMm9Link.txt
     #	707779988 bases of 2731807384 (25.909%) in intersection
 
     #	create the syntenic maf nets
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
 	-syntenicNet -continue=syntenicNet \
 	DEF -bigClusterHub=pk -chainLinearGap=medium > syntenicNet.log 2>&1 &
     #	real 16m28.741s
 
 #########################################################################
 # Blastz Opossum monDom4 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
     ssh kkstore04
     screen # use screen to manage this job
     mkdir /cluster/data/mm9/bed/blastzMonDom4.2007-09-25
     cd /cluster/data/mm9/bed/blastzMonDom4.2007-09-25
 
     #	the opossum chroms are too large to work with on the kk, must run this
     #	on the pk kluster
     #	DEF is the parameter file passed to doBlastzChainNet.pl below; the
     #	here-doc delimiter is quoted so the shell expands nothing inside it
     cat << '_EOF_' > DEF
 # Mouse vs. opossum
 
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Opossum monDom4
 SEQ2_DIR=/scratch/hg/monDom4/monDom4.2bit
 SEQ2_LEN=/cluster/data/monDom4/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzMonDom4.2007-09-25
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
     #	real    811m19.320s
     # problem on kki run, monDom4 wasn't distributed on the Iservers to
     #	/scratch/hg/monDom4/ - straighten that up, and finish that run, then
     #	continuing
     time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
 	-continue=chainMerge -chainLinearGap=loose \
 	-bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
     #	real    158m9.287s
     cat fb.mm9.chainMonDom4Link.txt
     #	255535025 bases of 2620346127 (9.752%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/monDom4/bed/blastz.mm9.swap
     cd /cluster/data/monDom4/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
 	/cluster/data/mm9/bed/blastzMonDom4.2007-09-25/DEF \
 	-swap -chainLinearGap=loose \
 	-bigClusterHub=pk > swap.log 2>&1 &
     #	real    59m19.005s
     cat  fb.monDom4.chainMm9Link.txt
     #	254018516 bases of 3501643220 (7.254%) in intersection
 
 #########################################################################
 # Blastz Tenrec echTel1 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
     ssh kkstore02
     screen # use a screen to control this job
     mkdir /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
     cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
 
     cat << '_EOF_' > DEF
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY - Tenrec echTel1
 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
 SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=800
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzEchTel1.2007-09-25
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-bigClusterHub=kk -chainLinearGap=medium DEF > do.log 2>&1 &
     #	real    2721m33.204s
     cat fb.mm9.chainEchTel1Link.txt
     #	291920039 bases of 2620346127 (11.141%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/echTel1/bed/blastz.mm9.swap
     cd /cluster/data/echTel1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	/cluster/data/mm9/bed/blastzEchTel1.2007-09-25/DEF \
 	-swap -bigClusterHub=kk -chainLinearGap=medium > swap.log 2>&1 &
     #	real    520m9.198s
     cat  fb.echTel1.chainMm9Link.txt
     #	298656963 bases of 2111581369 (14.144%) in intersection
 
     #	create syntenic maf nets
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	DEF -continue=syntenicNet -bigClusterHub=kk \
 	-syntenicNet -chainLinearGap=medium > syntenicNet.log 2>&1 &
     #	real 3m4.285s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 echTel1 \
 	> rbest.log 2>&1 &
     #	real    34m12.936s
 
 #########################################################################
 # Blastz Tree Shrew tupBel1 (DONE - 2007-09-27 - 2007-10-01 - Hiram)
     ssh kkstore05
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
     cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
 
     cat << '_EOF_' > DEF
 # Mouse vs. Tree Shrew
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY:  Tree shrew tupBel1
 SEQ2_DIR=/san/sanvol1/scratch/tupBel1/tupBel1.2bit
 SEQ2_LEN=/cluster/data/tupBel1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzTupBel1.2007-09-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> chainMerge.log 2>&1 &
     #	real    1262m32.699s
     #	the load should fail due to missing repeat masker tables in tupBel1
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> net.log 2>&1 &
     #	real    69m41.901s
     #	and indeed it did.  Loading the net track manually:
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27/axtChain
     cp -p noClass.net mm9.tupBel1.net
     time nice -n +19 netFilter -minGap=10 mm9.tupBel1.net \
 	| hgLoadNet -warn mm9 netTupBel1 stdin
     cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
     time nice -n +19 featureBits mm9 chainTupBel1Link \
 	> fb.mm9.chainTupBel1Link.txt 2>&1 &
     cat fb.mm9.chainTupBel1Link.txt
     #	552865662 bases of 2620346127 (21.099%) in intersection
 
     #	and, to finish it all off, with syntenic net
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk \
 	-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
     #	real    14m42.816s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 tupBel1 \
 	> rbest.log 2>&1 &
     #	real    41m12.278s
 
 #########################################################################
 # Blastz Bush Baby otoGar1 (DONE - 2007-09-27 - 2007-09-28 - Hiram)
     ssh kkstore05
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
     cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
 
     cat << '_EOF_' > DEF
 # Mouse vs. Bush Baby
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY:  Bush baby otoGar1
 SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit
 SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> chainMerge.log 2>&1 &
     #	real    873m23.531s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> net.log 2>&1 &
     #	real    67m7.172s
     cat fb.mm9.chainOtoGar1Link.txt
     #	601932945 bases of 2620346127 (22.972%) in intersection
 
     #	and run the syntenicNet and cleanup
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	real 13m57.573s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 otoGar1 \
 	> rbest.log 2>&1 &
     #	real    40m1.428s
 
 #########################################################################
 # Blastz Armadillo dasNov1 (DONE - 2007-09-27 - 2007-10-02 - Hiram)
     ssh kkstore04
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
     cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
 
     cat << '_EOF_' > DEF
 # Mouse vs. Armadillo
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Armadillo dasNov1
 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
 SEQ2_LEN=/cluster/data/dasNov1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzDasNov1.2007-09-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	real    3607m35.169s
     cat fb.mm9.chainDasNov1Link.txt
     #	433593082 bases of 2620346127 (16.547%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	real    15m7.642s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 dasNov1 \
 	> rbest.log 2>&1 &
     #	real    39m18.156s
 
 #########################################################################
 # Blastz Rabbit oryCun1 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
     ssh kkstore04
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
     cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
 
     cat << '_EOF_' > DEF
 # Mouse vs. Rabbit
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rabbit oryCun1
 SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
 SEQ2_LEN=/cluster/data/oryCun1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzOryCun1.2007-09-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> chainMerge.log 2>&1 &
     #	real    2126m59.162s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	real    53m28.279s
     cat fb.mm9.chainOryCun1Link.txt
     #	496428446 bases of 2620346127 (18.945%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	real 9m27.321s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 oryCun1 \
 	> rbest.log 2>&1 &
     #	real    37m32.151s
 
 #########################################################################
 # Blastz Cat felCat3 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
     ssh kkstore05
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
     cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
 
     cat << '_EOF_' > DEF
 # Mouse vs. Cat
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Cat felCat3
 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
 SEQ2_LEN=/cluster/data/felCat3/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzFelCat3.2007-09-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> chainMerge.log 2>&1 &
     #	real    1597m21.032s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	real    39m30.078s
     cat fb.mm9.chainFelCat3Link.txt
     #	499894253 bases of 2620346127 (19.077%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	real 9m42.624s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 felCat3 \
 	> rbest.log 2>&1 &
     #	real    36m40.000s
 
 #########################################################################
 # Blastz Elephant loxAfr1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
     ssh kkstore04
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
     cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
 
     cat << '_EOF_' > DEF
 # Mouse vs. Elephant
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Elephant loxAfr1
 SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
 SEQ2_LEN=/cluster/data/loxAfr1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	real    2981m3.302s
     #	had two failed jobs in that state where their results existed,
     #	but parasol thought they were not done.  Continuing, and now
     #	all the way to syntenicNet.  Will probably fail during the load
     #	since not everything is there for db loxAfr1
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=cat -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	real    166m4.710s
     #	it did get through everything to a successful completion
     cat fb.mm9.chainLoxAfr1Link.txt
     #	473014688 bases of 2620346127 (18.052%) in intersection
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 loxAfr1 \
 	> rbest.log 2>&1 &
     #	real    41m56.201s
 
 #########################################################################
 # Blastz Hedgehog eriEur1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
     ssh kkstore05
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
     cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
 
     cat << '_EOF_' > DEF
 # Mouse vs. Hedgehog
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Hedgehog eriEur1
 SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit
 SEQ2_LEN=/cluster/data/eriEur1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzEriEur1.2007-09-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	failed during the load since the db eriEur1 does not exist
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28/axtChain
     cp -p noClass.net mm9.eriEur1.net
     time nice -n +19 netFilter -minGap=10 mm9.eriEur1.net \
 	| hgLoadNet -warn mm9 netEriEur1 stdin
     cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
     #	run in the foreground (as in the sorAra1 section) so that the
     #	result file is complete before it is displayed by the next command
     time nice -n +19 featureBits mm9 chainEriEur1Link \
 	> fb.mm9.chainEriEur1Link.txt 2>&1
     cat fb.mm9.chainEriEur1Link.txt
     #	262604655 bases of 2620346127 (10.022%) in intersection
 
     # continuing through syntenic nets (actually unneeded)
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
 	-syntenicNet > syntenicNet.log 2>&1 &
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 eriEur1 \
 	> rbest.log 2>&1 &
     #	real    33m27.296s
 
 #########################################################################
 # Blastz Shrew sorAra1 (DONE - 2007-09-28 - 2007-10-01 - Hiram)
     ssh kkstore05
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
     cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
 
     cat << '_EOF_' > DEF
 # Mouse vs. Shrew
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Shrew sorAra1
 SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
 SEQ2_LEN=/cluster/data/sorAra1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzSorAra1.2007-09-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	run the pipeline up through the chainMerge step; stdout and stderr
     #	both go to chainMerge.log
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> chainMerge.log 2>&1 &
     #	real    2478m57.242s
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
 	> load.log 2>&1 &
     #	real    15m55.272s
     #	as expected, fails during load since there is no sorAra1 database
     #	load nets without class
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28/axtChain
     cp -p noClass.net mm9.sorAra1.net
     time nice -n +19 netFilter -minGap=10 mm9.sorAra1.net \
 	| hgLoadNet -warn mm9 netSorAra1 stdin
     cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
     time nice -n +19 featureBits mm9 chainSorAra1Link \
 	> fb.mm9.chainSorAra1Link.txt 2>&1
     cat fb.mm9.chainSorAra1Link.txt
     #	250412778 bases of 2620346127 (9.556%) in intersection
 
     #	and, to finish it all off, with syntenic net
     time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
 	-continue=download -bigClusterHub=pk \
 	-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
     #	real    3m49.961s
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
     time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 sorAra1 \
 	> rbest.log 2>&1 &
     #	real    27m3.076s
 
 #########################################################################
 ## 30-Way Multiz (WORKING - 2007-10-01 - Hiram)
 ##	The blastz alignments for this 30-way are documented at:
 ##	http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
 ##
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/multiz30way
     cd /cluster/data/mm9/bed/multiz30way
     #	take the 28-way tree from hg18 and insert the two new genomes.
     #	rearrange to get mm9 on the top of the graph
     #	paste this tree into the on-line phyloGif tool:
     #	http://genome.ucsc.edu/cgi-bin/phyloGif
     #	to create the image for the tree diagram
 
     cat << '_EOF_' > mm9OnTop.fullNames.nh
 ((((((((
 
  (((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,
     GuineaPig_cavPor2:0.202990):0.034350,
         Rabbit_oryCun1:0.208548):0.014587,
 
 ((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
    Orangutan_ponAbe2:0.02):0.013037,Rhesus_rheMac2:0.031973):0.0365,
         Marmoset_calJac1:0.07):0.0365,Bushbaby_otoGar1:0.151185):0.015682,
            TreeShrew_tupBel1:0.162844):0.006272):0.019763,
 
  ((Shrew_sorAra1:0.248532,Hedgehog_eriEur1:0.222255):0.045693,
 
  (((Dog_canFam2:0.101137,Cat_felCat3:0.098203):0.048213,
     Horse_equCab1:0.099323):0.007287,
         Cow_bosTau3:0.163945):0.012398):0.018928):0.030081,
 
  (Armadillo_dasNov1:0.133274,(Elephant_loxAfr1:0.103030,
         Tenrec_echTel1:0.232706):0.049511):0.008424):0.213469,
 
  Opossum_monDom4:0.320721):0.088647,
     Platypus_ornAna1:0.488110):0.118797,
         (Chicken_galGal3:0.395136,Lizard_anoCar1:0.513962):0.093688):0.151358,
             Frog_xenTro2:0.778272):0.174596,
 
  (((Tetraodon_tetNig1:0.203933,Fugu_fr2:0.239587):0.203949,
     (Stickleback_gasAcu1:0.314162,Medaka_oryLat1:0.501915):0.055354):0.346008,
 Zebrafish_danRer5:0.730028):0.174596);
 '_EOF_'
     # << happy emacs
     
     #	create a species list from that file:
     sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' mm9OnTop.fullNames.nh \
         | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
         | sed -e "s/.*_//; s/:.*//" | sort > species.list
     #	verify that has 30 db names in it
     # create a stripped down nh file for use in autoMZ run
     echo \
 `sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' mm9OnTop.fullNames.nh \
 	| sed -e "s/  / /g"` > tree.30.nh
     #	that looks like, as a single line:
 (((((((( (((mm9 rn4) cavPor2) oryCun1) ((((((hg18 panTro2) ponAbe2) rheMac2)
 calJac1) otoGar1) tupBel1)) ((sorAra1 eriEur1) (((canFam2 felCat3) equCab1)
 bosTau3))) (dasNov1 (loxAfr1 echTel1))) monDom4) ornAna1) (galGal3 anoCar1))
 xenTro2) (((tetNig1 fr2) (gasAcu1 oryLat1)) danRer5))
 
     # verify all blastz's exists
     cat << '_EOF_' > listMafs.csh
 #!/bin/csh -fe
 cd /cluster/data/mm9/bed/multiz30way
 foreach db (`cat species.list`)
     set bdir = /cluster/data/mm9/bed/blastz.$db
     if (-e $bdir/mafRBestNet/chr1.maf.gz) then
 	echo "$db mafRBestNet"
     else if (-e $bdir/mafSynNet/chr1.maf.gz) then
 	echo "$db mafSynNet"
     else if (-e $bdir/mafNet/chr1.maf.gz) then
 	echo "$db mafNet"
     else
 	echo "$db mafs not found"
     endif
 end
 '_EOF_'
     # << happy emacs
     chmod +x ./listMafs.csh
     #	see what it says, shouldn't be anything with "mafs not found"
     ./listMafs.csh
 
     # copy net mafs to cluster-friendly storage, splitting chroms
     # into 50MB chunks  to improve run-time
     # NOTE: splitting will be different for scaffold-based reference assemblies
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/multiz30way/run.split
     cd /cluster/data/mm9/bed/multiz30way/run.split
     #	this works by examining the rmsk table for likely repeat areas
     #	that won't be used in blastz
     mafSplitPos mm9 50 mafSplit.bed
 
     ssh kki
     cd /cluster/data/mm9/bed/multiz30way/run.split
  
     cat << '_EOF_' > doSplit.csh
 #!/bin/csh -ef
 # usage: doSplit.csh <db>
 #	Split the mm9 vs. <db> net mafs at the positions in mafSplit.bed,
 #	writing the pieces into $sdir/<db>/, gzip them, and record the
 #	source maf directory in <db>.done (the parasol check file).
 set db = $1
 set sdir = /san/sanvol1/scratch/mm9/splitStrictMafNet
 mkdir -p $sdir
 #	refuse to mix new output with the results of a previous run
 if (-e $sdir/$db) then
     echo "directory $sdir/$db already exists -- remove and retry"
     exit 1
 endif
 set bdir = /cluster/data/mm9/bed/blastz.$db
 if (! -e $bdir) then
     echo "directory $bdir not found"
     exit 1
 endif
 #	preference order: reciprocal best, then syntenic, then plain net
 if (-e $bdir/mafRBestNet) then
     set mdir = $bdir/mafRBestNet
 else if (-e $bdir/mafSynNet) then
     set mdir = $bdir/mafSynNet
 else if (-e $bdir/mafNet) then
     set mdir = $bdir/mafNet
 else
     echo "$bdir maf dir not found"
     exit 1
 endif
 #	create the output directory only after all inputs are validated, so
 #	a failed run does not leave an empty directory that blocks the retry
 mkdir -p $sdir/$db
 echo $mdir
 foreach f ($mdir/*)
     #	:t:r:r == file name without directory and without two extensions
     set c = $f:t:r:r
     echo "  $c"
     nice mafSplit mafSplit.bed $sdir/$db/ $f
 end
 echo "gzipping $sdir/$db mafs"
 nice gzip $sdir/$db/*
 echo $mdir > $db.done
 '_EOF_'
     # << happy emacs
     chmod +x doSplit.csh
 
     grep -v mm9  ../species.list > split.list
     cat << '_EOF_' > template
 #LOOP
 doSplit.csh $(path1) {check out line+ $(path1).done}
 #ENDLOOP
 '_EOF_'
     gensub2 split.list single template jobList
     para create jobList
     # 29 jobs
     # start these gently, this is a good load on the san filesystem
     para try
     # let that run to a couple completions, a few minutes, then again:
     para try
     # etc ...
 # Completed: 29 of 29 jobs
 # CPU time in finished jobs:       9476s     157.94m     2.63h    0.11d  0.000 y
 # IO & Wait Time:                  1531s      25.51m     0.43h    0.02d  0.000 y
 # Average job time:                 380s       6.33m     0.11h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1081s      18.02m     0.30h    0.01d
 # Submission to last job:          1391s      23.18m     0.39h    0.02d
 
     # ready for the multiz run
     ssh pk
     cd /cluster/data/mm9/bed/multiz30way
     #	actually, the result directory here should be maf.split instead of maf
     mkdir -p maf run
     cd run
     mkdir penn
     # use latest penn utilities
     P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
     cp -p $P/{autoMZ,multiz,maf_project} penn
 
     # list chrom chunks, any db dir will do; better would be for the
     # splitter to generate this file
     # We temporarily use __ instead of . to delimit chunk in filename
     # so we can use $(root) to get basename
     #	strip the .maf.gz suffix, then turn the first remaining "." into "__";
     #	the result must be named chromChunks.list, which gensub2 reads below
     find /san/sanvol1/scratch/mm9/splitStrictMafNet -type f \
 	| while read F; do basename $F; done \
 	| sed -e 's/\.maf\.gz$//' -e 's/\./__/' \
 	| sort -u > chromChunks.list
     wc -l chromChunks.list
         # 75
 
 # autoMultiz.csh <chunk> <result.maf>
 #	<chunk> is a chromChunks.list entry (chrom__N); <result.maf> is where
 #	the multiple alignment for that chunk is copied when autoMZ finishes.
 #	All work is done in a private directory under /scratch/tmp which is
 #	removed before and after the run.
 cat > autoMultiz.csh << '_EOF_'
 #!/bin/csh -ef
 
     set db = mm9
     set c = $1
     set maf = $2
     set run = `pwd`
     set tmp = /scratch/tmp/$db/multiz.$c
     set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
     rm -fr $tmp
     mkdir -p $tmp
     cp ../tree.30.nh ../species.list $tmp
     pushd $tmp
     # stage one pairwise maf per species as $db.<species>.sing.maf
     foreach s (`cat species.list`)
         # the split files on disk use "." where the chunk name has "__"
         set c2 = `echo $c | sed 's/__/./'`
         set in = $pairs/$s/$c2.maf
         set out = $db.$s.sing.maf
         # no pairwise maf for the reference against itself
         if ($s == mm9) then
             continue
         endif
         if (-e $in.gz) then
             zcat $in.gz > $out
         else if (-e $in) then
             cp $in $out
         else
             # no alignment for this species in this chunk: header-only maf
             echo "##maf version=1 scoring=autoMZ" > $out
         endif
     end
     # put the penn copies of autoMZ, multiz, maf_project first in the path
     set path = ($run/penn $path); rehash
     $run/penn/autoMZ + T=$tmp E=$db "`cat tree.30.nh`" $db.*.sing.maf $c.maf
     popd
     cp $tmp/$c.maf $maf
     rm -fr $tmp
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz.csh
 
 cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << emacs
     gensub2 chromChunks.list single template jobList
     para create jobList
     # 75 jobs
     #	three of these jobs failed with memory allocation error:
 # maf_project.v12: Ran out of memory trying to allocate 64.
 # autoMZ.v1: command 'maf_project /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_lef
 # t.maf19 mm9 > /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_U1' failed
     # run time for the 72 jobs that completed:
 # Completed: 72 of 75 jobs
 # CPU time in finished jobs:     501143s    8352.38m   139.21h    5.80d  0.016 y
 # IO & Wait Time:                 22628s     377.14m     6.29h    0.26d  0.001 y
 # Average job time:                7275s     121.24m     2.02h    0.08d
 # Longest finished job:           15957s     265.95m     4.43h    0.18d
 # Submission to last job:         16473s     274.55m     4.58h    0.19d
     #	performed a para recover on the jobList and used the kki kluster
     #	to run the last three jobs:
 # Completed: 3 of 3 jobs
 # CPU time in finished jobs:      50762s     846.03m    14.10h    0.59d  0.002 y
 # IO & Wait Time:                  1795s      29.92m     0.50h    0.02d  0.000 y
 # Average job time:               17519s     291.98m     4.87h    0.20d
 # Longest finished job:           17887s     298.12m     4.97h    0.21d
 # Submission to last job:         17887s     298.12m     4.97h    0.21d
 
     # put the split maf results back together into single chroms
     ssh kkstore06
     cd /cluster/data/mm9/bed/multiz30way
     # here is where the result directory maf should have already been maf.split
     mv maf maf.split
     mkdir maf
     # going to sort out the redundant header garbage to leave a cleaner maf
     #	chrom names are the maf.split file names with "__<chunk>.maf" removed
     for C in `ls maf.split | sed -e "s#__.*##" | sort -u`
 do
     echo ${C}
     #	1. first line of each chunk, de-duplicated
     head -q -n 1 maf.split/${C}__*.maf | sort -u > maf/${C}.maf
     #	2. remaining "#" comment lines, minus the version/eof markers, with
     #	   the _MZ_ temp-file names and the __<digit> chunk suffix removed
     #	   NOTE(review): "__[0-9]" removes a single digit only, and the
     #	   ${C}__*.maf glob expands in lexical order -- this appears to
     #	   assume fewer than 10 chunks per chrom; confirm before reuse
     grep -h "^#" maf.split/${C}__*.maf | egrep -v "maf version=1|eof maf" | \
 	sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u >> maf/${C}.maf
     #	3. all non-comment lines from every chunk, in glob order
     grep -h -v "^#" maf.split/${C}__*.maf >> maf/${C}.maf
     #	4. last line of each chunk, de-duplicated
     tail -q -n 1 maf.split/${C}__*.maf | sort -u >> maf/${C}.maf
 done
 
     # load tables for a look
     ssh hgwdev
     mkdir -p /gbdb/mm9/multiz30way/maf
     ln -s /cluster/data/mm9/bed/multiz30way/maf/*.maf \
                 /gbdb/mm9/multiz30way/maf
     cd /cluster/data/mm9/bed/multiz30way
     # this generates a large 1 Gb multiz30way.tab file in the directory
     #	where it is running.  Best to run this over in scratch.
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/mm9/multiz30way/maf mm9 multiz30way
     #	real    11m38.695s
     #	Loaded 15881850 mafs in 34 files from /gbdb/mm9/multiz30way/maf
 
     # load summary table
     time nice -n +19 cat /gbdb/mm9/multiz30way/maf/*.maf \
 	| hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
 	 -maxSize=200000  multiz30waySummary stdin
     #	Created 5648546 summary blocks from 154642836 components and 15872991
     #	mafs from stdin
     #	real    19m44.355s
 
     # Gap Annotation
     # prepare bed files with gap info
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/multiz30way/anno
     cd /cluster/data/mm9/bed/multiz30way/anno
     mkdir maf run
 
     #	survey which genomes already have a <db>.N.bed file.
     #	NOTE: for a missing file this loop only *prints* the twoBitInfo
     #	command that would create it (note the leading "echo"); nothing is
     #	created here, so any printed commands still have to be run
     for DB in `cat ../species.list`
 do
     CDIR="/cluster/data/${DB}"
     if [ ! -f ${CDIR}/${DB}.N.bed ]; then
 	echo "creating ${DB}.N.bed"
 	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
     else
 	ls -og ${CDIR}/${DB}.N.bed
     fi
 done
 
     cd run
     rm -f nBeds sizes
     for DB in `grep -v mm9 ../../species.list`
 do
     echo "${DB} "
     ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
     echo ${DB}.bed  >> nBeds
     ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
     echo ${DB}.len  >> sizes
 done
 
     ssh kki
     cd /cluster/data/mm9/bed/multiz30way/anno/run
 
     cat << '_EOF_' > doAnno.csh
 #!/bin/csh -ef
     set dir = /cluster/data/mm9/bed/multiz30way
     set c = $1
     cat $dir/maf/${c}.maf | \
         nice mafAddIRows -nBeds=nBeds stdin /cluster/data/mm9/mm9.2bit $2
 '_EOF_'
     # << happy emacs
     chmod +x doAnno.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./doAnno.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/anno/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     #	there is no 16_random maf file
     cut -f1 /cluster/data/mm9/chrom.sizes | grep -v 16_random > chrom.list
     gensub2 chrom.list single template jobList
     para create jobList
     para try
 #	Crashed: 1 jobs
 # CPU time in finished jobs:      18129s     302.15m     5.04h    0.21d  0.001 y
 # IO & Wait Time:                 10273s     171.22m     2.85h    0.12d  0.000 y
 # Average job time:                 861s      14.34m     0.24h    0.01d
 # Longest finished job:            4376s      72.93m     1.22h    0.05d
     #	one job was too large for this memory:
     # job: ./doAnno.csh chr1 /cluster/data/mm9/bed/multiz30way/anno/maf/chr1.maf
     # needLargeMem: Out of memory - request size 1129396 bytes, errno: 12
     #	going to hgwdev for this one:
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/anno/run
     time ./doAnno.csh chr1 ../maf/chr1.maf
     #	real    17m34.550s
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/anno
     mkdir -p /gbdb/mm9/multiz30way/anno/maf
     ln -s /cluster/data/mm9/bed/multiz30way/anno/maf/*.maf \
                 /gbdb/mm9/multiz30way/anno/maf
     #	by loading this into the table multiz30way, it will replace the
     #	previously loaded table with the unannotated mafs
     #	huge temp files are made, do them on local disk
     cd /scratch/tmp
     time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm9/multiz30way/anno/maf \
                 mm9 multiz30way
     #	Loaded 16799995 mafs in 34 files from /gbdb/mm9/multiz30way/anno/maf
     #	real    18m12.171s
 
     #	This step may be useless.  The original mafs should have the same
     #	summary.
     #	list the annotated maf of every sequence longer than 1,000,000
     #	bases and stream them all into the summary loader
     awk '$2 > 1000000 {print "/gbdb/mm9/multiz30way/anno/maf/" $1 ".maf"}' \
 	/cluster/data/mm9/chrom.sizes \
 	| xargs cat \
 	| hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
 	    -maxSize=200000  multiz30waySummary stdin
     #	Created 5648546 summary blocks from 154642836 components and 16790208
     #	mafs from stdin
     #	by loading this into the table multiz30waySummary, it will replace
     #	the previously loaded table with the unannotated mafs
     #	real    30m26.542s
 
 #############################################################################
 ## Annotate 30-way multiple alignment with gene annotations
 ##		(WORKING - 2007-10-18 - Hiram)
     # Gene frames
     ## survey all genomes to see what type of gene track to use
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/multiz30way/frames
     cd /cluster/data/mm9/bed/multiz30way/frames
     #	dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
     cat << '_EOF_' > showGenes.csh
 #!/bin/csh -fe
 foreach db (`egrep -v "sorAra1|eriEur1|cavPor2"  ../species.list`)
     echo -n "${db}: "
     echo -n "Tables: "
     set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
     foreach table ($tables)
 	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
 	    $table == "knownGene") then
 		set count = `hgsql $db -N -e "select count(*) from $table"`
 		echo -n "${table}: ${count}, "
 	endif
     end
     set orgName = `hgsql hgcentraltest -N -e \
 	    "select scientificName from dbDb where name='$db'"`
     set orgId = `hgsql mm9 -N -e \
 	    "select id from organism where name='$orgName'"`
     if ($orgId == "") then
 	echo "Mrnas: 0"
     else
 	set count = `hgsql mm9 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
 	echo "Mrnas: ${count}"
     endif
 end
 '_EOF_'
     # << happy emacs
     chmod +x ./showGenes.csh
     #	given this output, manually sorted for this display:
 # hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29028, refGene: 25902, Mrnas: 208990
 # mm9: Tables: knownGene: 49409, mgcGenes: 22947, refGene: 21004, Mrnas: 5092390
 # rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5400, refGene: 14333, Mrnas: 34471
 # canFam2: Tables: ensGene: 25568, refGene: 833, Mrnas: 1708
 # danRer5: Tables: ensGene: 31740, mgcGenes: 13037, refGene: 12879, Mrnas: 33184
 # fr2: Tables: ensGene: 22102, Mrnas: 1098
 # gasAcu1: Tables: ensGene: 28840, Mrnas: 2326
 # monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
 # ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
 # oryLat1: Tables: ensGene: 23087, Mrnas: 980
 # panTro2: Tables: ensGene: 32852, refGene: 26160, Mrnas: 1277
 # rheMac2: Tables: ensGene: 38561, refGene: 412, Mrnas: 3169
 # bosTau3: Tables: mgcGenes: 9617, refGene: 10287, Mrnas: 26808
 # equCab1: Tables: refGene: 304, Mrnas: 1396
 # felCat3: Tables: refGene: 401, Mrnas: 882
 # galGal3: Tables: refGene: 4210, Mrnas: 31217
 # xenTro2: Tables: mgcGenes: 6255, refGene: 7086, Mrnas: 19155
 # anoCar1: Tables: Mrnas: 12
 # calJac1: Tables: Mrnas: 949
 # dasNov1: Tables: Mrnas: 18
 # echTel1: Tables: Mrnas: 0
 # loxAfr1: Tables: Mrnas: 12
 # oryCun1: Tables: Mrnas: 3786
 # otoGar1: Tables: Mrnas: 0
 # ponAbe2: Tables: Mrnas: 2
 # tetNig1: Tables: Mrnas: 99495
 # tupBel1: Tables: Mrnas: 47
 
     #	use knownGene for hg18, mm9
     #	use ensGene for rn4, canFam2, danRer5, fr2, gasAcu1, monDom4, ornAna1,
     #		oryLat1, panTro2, rheMac2
     #	use refGene for bosTau3, xenTro2
     #	use Mrnas for galGal3, tetNig1
     #	barely can use Mrnas for equCab1, felCat3, anoCar1, dasNov1,
     #	loxAfr1, oryCun1, ponAbe2
     #	no annotations for calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
     #		tupBel1
 
     mkdir genes
     # one TABLE:DB pair per iteration, in the order chosen above:
     #	knownGene for hg18, mm9
     #	ensGene for rn4 ... rheMac2
     #	refGene for bosTau3, xenTro2
     for PAIR in knownGene:hg18 knownGene:mm9 \
 	ensGene:rn4 ensGene:canFam2 ensGene:danRer5 ensGene:fr2 \
 	ensGene:gasAcu1 ensGene:monDom4 ensGene:ornAna1 ensGene:oryLat1 \
 	ensGene:panTro2 ensGene:rheMac2 \
 	refGene:bosTau3 refGene:xenTro2
 do
     TABLE=${PAIR%%:*}
     DB=${PAIR#*:}
     #	build on local scratch, then move the finished file into place
     TMPGZ=/scratch/tmp/${DB}.tmp.gz
     hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ${TABLE}" ${DB} \
       | genePredSingleCover stdin stdout | gzip -2c > ${TMPGZ}
     mv ${TMPGZ} genes/$DB.gp.gz
     echo "${DB} done"
 done
     # and finally, using the mrna tables
 
     #	use Mrnas for galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1
     #	loxAfr1 oryCun1 ponAbe2
     for DB in galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1 \
 	loxAfr1 oryCun1 ponAbe2
 do
 tmpExt=`mktemp temp.XXXXXX`
 tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
 tmpMrna=${DB}.mrna.${tmpExt}
 tmpCds=${DB}.cds.${tmpExt}
 hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
 	   from all_mrna,gbCdnaInfo,cds \
 	   where (all_mrna.qName = gbCdnaInfo.acc) and \
 	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
 $DB > ${tmpMrnaCds}
 cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
 cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
 mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
 genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
 rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
 mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
 rm -f $tmpExt
 echo "${DB} done"
 done
 
     ssh kkstore06
     cd /cluster/data/mm9/bed/multiz30way/frames
     # leaving out calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
     #		tupBel1 since no gene preds there
     time (cat  ../maf/*.maf | nice -n +19 genePredToMafFrames mm9 stdin stdout rn4 genes/rn4.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz equCab1 genes/equCab1.gp.gz dasNov1 genes/dasNov1.gp.gz oryCun1 genes/oryCun1.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz loxAfr1 genes/loxAfr1.gp.gz bosTau3 genes/bosTau3.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz fr2 genes/fr2.gp.gz oryLat1 genes/oryLat1.gp.gz | gzip > multiz30way.mafFrames.gz) > frames.log 2>&1
     # see what it looks like in terms of number of annotations per DB:
     zcat multiz30way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
      67 loxAfr1
      79 dasNov1
     116 ponAbe2
     491 anoCar1
    1807 tetNig1
    2429 felCat3
    4892 equCab1
    9156 oryCun1
   85568 bosTau3
  118192 galGal3
  129442 xenTro2
  208239 rn4
  224420 rheMac2
  226866 panTro2
  228563 hg18
  243074 canFam2
  329523 danRer5
  334418 ornAna1
  347708 oryLat1
  369267 monDom4
  374016 gasAcu1
  380839 fr2
 
     #	load the resulting file
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/frames
     time nice -n +19 hgLoadMafFrames mm9 multiz30wayFrames \
 	multiz30way.mafFrames.gz
     #	real    1m1.893s
 
     #	enable the trackDb entries:
 # frames multiz30wayFrames
 # irows on
 
 #############################################################################
 # phastCons 30-way (WORKING - 2007-10-16 - Hiram)
 
     # split 30way mafs into 10M chunks and generate sufficient statistics 
     # files for # phastCons
     ssh kki
     mkdir /cluster/data/mm9/bed/multiz30way/msa.split
     cd /cluster/data/mm9/bed/multiz30way/msa.split
     mkdir -p /san/sanvol1/scratch/mm9/multiz30way/cons/ss
 
     cat << '_EOF_' > doSplit.csh
 #!/bin/csh -ef
     set MAFS = /cluster/data/mm9/bed/multiz30way/maf
     set WINDOWS = /san/sanvol1/scratch/mm9/multiz30way/cons/ss
     pushd $WINDOWS
     set c = $1
     rm -fr $c
     mkdir $c
     twoBitToFa -seq=$c /scratch/data/mm9/mm9.2bit /scratch/tmp/mm9.$c.fa
     # need to truncate odd-ball scaffold/chrom names that include dots
     # as phastCons utils can't handle them
     set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
     perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
     /cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
         -M /scratch/tmp/mm9.$c.fa \
         -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
     rm -f $CLEAN_MAF /scratch/tmp/mm9.$c.fa
     popd
     date >> $c.done
 '_EOF_'
     # << happy emacs
     chmod +x doSplit.csh
 
     cat << '_EOF_' > template
 #LOOP
 doSplit.csh $(root1) {check out line+ $(root1).done}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     #	do the easy ones first to see some immediate results
     ls -1S -r ../maf | sed -e "s/.maf//" > maf.list
 
     gensub2 maf.list single template jobList
     para create jobList
     para try ... check ... etc
 -
     # completed shorter jobs in a few hours, there is a problem of swapping
     # going on here, two of these jobs on a single node can consume all of its
     # memory and then some.  Three jobs failed to complete, finish them up
     # manually on hgwdev, the processes grow to over 8 Gb in memory for chr1,
     # chr11 and chr2
 
     # Estimate phastCons parameters
 
     time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
 /san/sanvol1/scratch/mm9/multiz30way/cons/ss/chrY/chrY.1-10000000.ss \
 --tree "(((((((((((mm9,rn4),cavPor2),oryCun1),((((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1),tupBel1)),((sorAra1,eriEur1),(((canFam2,felCat3),equCab1),bosTau3))),(dasNov1,(loxAfr1,echTel1))),monDom4),ornAna1),(galGal3,anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer5))" \
     --out-root starting-tree
     #	real    107m46.703s
     #	Tried this on chr13 too:
     #	real    4619m42.984s
     #	that is almost 77 hours on hgwdev == 3.2 days
 
     # add up the C and G:
     grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
     #	0.400
     #	This 0.400 is used in the --gc argument below
     #	got 0.404 with chrM.starting-tree.mod
 
     # Run phastCons
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     ssh pk
     mkdir -p /cluster/data/mm9/bed/multiz30way/cons/run.cons
     cd /cluster/data/mm9/bed/multiz30way/cons/run.cons
 
     #	there are going to be several different phastCons runs using
     #	this same script.  They trigger off of the current working directory
     #	$cwd:t which is the "grp" in this script.  It is one of:
     #	all gliers placentals
 
     # doPhast.csh <chrom> <ssFile> <expected-length> <target-coverage> <rho>
     #	<ssFile> is the name of a $san/ss/<chrom>/ file without its .ss suffix.
     #	Runs phastCons on local /scratch/tmp and moves the results to
     #	$san/<grp>/pp/<chrom>/ and $san/<grp>/bed/<chrom>/
     cat << '_EOF_' > doPhast.csh
 #!/bin/csh -fe
 set PHASTBIN = /cluster/bin/phast.2007-05-04
 set c = $1
 set f = $2
 set len = $3
 set cov = $4
 set rho = $5
 # grp is the last component of the directory the job is started in
 set grp = $cwd:t
 set tmp = /scratch/tmp/$f
 set cons = /cluster/data/mm9/bed/multiz30way/cons
 mkdir -p $tmp
 set san = /san/sanvol1/scratch/mm9/multiz30way/cons
 # stage the model (and the optional non-empty $grp.non-inf list) both in
 # the current directory and, together with the .ss input, in $tmp
 if (-s $cons/$grp/$grp.non-inf) then
   cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
   cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
 else
   cp -p $cons/$grp/$grp.mod .
   cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
 endif
 pushd $tmp > /dev/null
 # the two invocations differ only in the --not-informative argument;
 # stdout goes to $f.pp and the --most-conserved output goes to $f.bed
 if (-s $grp.non-inf) then
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --not-informative `cat $grp.non-inf` \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 else
   $PHASTBIN/phastCons $f.ss $grp.mod \
     --rho $rho --expected-length $len --target-coverage $cov --quiet \
     --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
 endif
 popd > /dev/null
 mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
 # NOTE(review): the sleep/touch pair is presumably there to let the newly
 # made san directories become visible before use -- confirm
 sleep 4
 touch $san/$grp/pp/$c $san/$grp/bed/$c
 # remove results of any earlier run before moving the new ones in
 rm -f $san/$grp/pp/$c/$f.pp
 rm -f $san/$grp/bed/$c/$f.bed
 mv $tmp/$f.pp $san/$grp/pp/$c
 mv $tmp/$f.bed $san/$grp/bed/$c
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast.csh
 
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     # Create parasol batch and run it
     pushd /san/sanvol1/scratch/mm9/multiz30way/cons
     ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
         /cluster/data/mm9/bed/multiz30way/cons/run.cons/ss.list
     popd
 
     # run for all species
     cd ..
     mkdir -p all run.cons/all
     cd all
     cp ../../chrY.starting-tree.mod all.mod
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create template file for "all" run
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 ../ss.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
 # Completed: 294 of 294 jobs
 # CPU time in finished jobs:      25724s     428.73m     7.15h    0.30d  0.001 y
 # IO & Wait Time:                  8951s     149.19m     2.49h    0.10d  0.000 y
 # Average job time:                 118s       1.97m     0.03h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             226s       3.77m     0.06h    0.00d
 # Submission to last job:           582s       9.70m     0.16h    0.01d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
     cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
         awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
             /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/all
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/all
     time nice -n +19 hgLoadBed mm9 phastConsElements30way mostConserved.bed
     #	Loaded 2782368 elements of size 5
     #	real    1m15.673s
     # compare with previous tracks
     hgsql mm9 -s -N -e "select count(*) from phastConsElements30way"
     #	2782368
     hgsql mm8 -s -N -e "select count(*) from phastConsElements17way"
     #	1883370
 
     # Try for 5% overall cov, and 70% CDS cov 
     #	--rho .31 --expected-length 45 --target-coverage .3
     #	chrY mod tree
     featureBits mm9 -enrichment refGene:cds phastConsElements30way
     #	refGene:cds 1.167%, phastConsElements30way 4.789%,
     #	both 0.582%, cover 49.90%, enrich 10.42x
     featureBits mm9 -enrichment knownGene:cds phastConsElements30way
     #	knownGene:cds 1.278%, phastConsElements30way 4.789%,
     #	both 0.627%, cover 49.03%, enrich 10.24x
     #	--rho .31 --expected-length 45 --target-coverage .3 elim non-autho
     #	chr13 mod tree
     featureBits mm9 -enrichment refGene:cds mostConserved.bed
     #	refGene:cds 1.167%, mostConserved.bed 4.128%,
     #	both 0.614%, cover 52.59%, enrich 12.74x
     #	--rho .31 --expected-length 45 --target-coverage .3 elim non-autho
     #	28-way mod tree adjusted to 30-way
     featureBits mm9 -enrichment refGene:cds mostConserved.bed
     #	refGene:cds 1.167%, mostConserved.bed 5.841%, both 0.862%, cover
     #	73.90%, enrich 12.65x
 
     featureBits mm8 -enrichment refGene:cds phastConsElements17way
     #	refGene:cds 1.188%, phastConsElements17way 5.398%,
     #	both 0.832%, cover 70.05%, enrich 12.98x
     featureBits mm8 -enrichment knownGene:cds phastConsElements17way
     #	knownGene:cds 1.109%, phastConsElements17way 5.398%,
     #	both 0.776%, cover 69.99%, enrich 12.97x
 
     # Create per-chromosome gzipped phastCons score files (the download
     # data) by concatenating each chromosome's .pp files in position order.
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
     cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
 
 TOP=`pwd`
 export TOP
 
 mkdir -p phastCons30wayScores
 
 for D in pp/chr*
 do
     # strip the leading "pp/" to get the chrom name; the POSIX prefix
     # removal is used because this script runs under /bin/sh
     C=${D#pp/}
     out=phastCons30wayScores/${C}.data.gz
     echo "${D} > ${C}.data.gz"
     # order the pieces numerically on the second dot-separated field of
     # the file name before concatenating
     ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
 	gzip > ${out}
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x gzipAscii.sh
     time nice -n +19 ./gzipAscii.sh
 
     # Create merged posterior probability file and wiggle track data files
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     for D in pp/chr*
 do
     ls $D/*.pp | sort -n -t\. -k2
 done | xargs cat \
 	| wigEncode -noOverlap stdin phastCons30way.wig phastCons30way.wib
 # Converted stdin, upper limit 1.00, lower limit 0.00
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/all
     #	wigEncode wrote the .wig/.wib files in the san scratch directory;
     #	copy them here first so the /gbdb symlink points at a real file
     #	(same step as in the euarchontoglires and placental sections)
     cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/all/*.wi? .
     ln -s `pwd`/phastCons30way.wib /gbdb/mm9/multiz30way/phastCons30way.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
 	phastCons30way phastCons30way.wig
     #	real    0m42.728s
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=mm9 phastCons30way > histogram.data 2>&1
     #	real    28m24.388s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Mouse Mm9 Histogram phastCons30way track"
 set xlabel " phastCons30way score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
     ### Create a phastCons data set for Euarchontoglires
 
     # setup euarchontoglires-only run
     ssh pk
     cd /cluster/data/mm9/bed/multiz30way/cons
     mkdir euarchontoglires run.cons/euarchontoglires
     cd euarchontoglires
     # euarchontoglires-only: exclude all but these for phastCons tree:
     /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
 	--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1 \
 	> euarchontoglires.mod
     #	and place the removed ones in the non-inf file so phastCons will
     #	truly ignore them:
     echo "sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
         > euarchontoglires.non-inf
 
     cd ../run.cons/euarchontoglires
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create template file for "euarchontoglires" run
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/bed/$(root1)/$(file1).bed}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 ../ss.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
     #	Three of these jobs fail to produce any output:
     #	chr5_random/chr5_random.1-357350.bed
     #	chr7_random/chr7_random.1-362490.bed
     #	chrY_random/chrY_random.50000001-58682461.bed
 # Completed: 291 of 294 jobs
 # Crashed: 3 jobs
 # CPU time in finished jobs:      17184s     286.40m     4.77h    0.20d  0.001 y
 # IO & Wait Time:                 30139s     502.31m     8.37h    0.35d  0.001 y
 # Average job time:                 163s       2.71m     0.05h    0.00d
 # Longest finished job:             296s       4.93m     0.08h    0.00d
 # Submission to last job:          2775s      46.25m     0.77h    0.03d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
     cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
         awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
             /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
     time nice -n +19 hgLoadBed mm9 phastConsElements30wayEuarch \
 	mostConserved.bed
     #	Loaded 1021674 elements of size 5
     #	real    0m23.402s
     # verify coverage
     featureBits mm9 phastConsElements30wayEuarch
     #	103492546 bases of 2620346127 (3.950%) in intersection
 
     #	Create the downloads .pp files, from which the phastCons wiggle data
     #	is calculated
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
     mkdir downloads
     for D in pp/chr*
 do
     C=${D/pp\//}
     ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
 	> downloads/${C}.euarchontoglires.pp.data.gz
     echo $D $C done
 done
 
     # Create merged posterior probability file and wiggle track data files
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
     ls downloads/chr*.data.gz | xargs zcat \
  | wigEncode -noOverlap stdin phastCons30wayEuarch.wig phastCons30wayEuarch.wib
 # Converted stdin, upper limit 1.00, lower limit 0.00
 
     ## load table with wiggle data
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
     cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/*.wi? .
     ln -s `pwd`/phastCons30wayEuarch.wib \
 	/gbdb/mm9/multiz30way/phastCons30wayEuarch.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
 	phastCons30wayEuarch phastCons30wayEuarch.wig
     #	real    0m44.161s
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=mm9 phastCons30wayEuarch > histogram.data 2>&1
     #	real    3m22.364s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color \
         x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Mouse Mm9 Histogram phastCons30wayEuarch track"
 set xlabel " phastCons30wayEuarch score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
     ### Create a phastCons data set for Placentals
     # setup placental-only run
     ssh pk
     cd /cluster/data/mm9/bed/multiz30way/cons
     mkdir placental run.cons/placental
     cd placental
     # placental-only: exclude all but these for phastCons tree:
     /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
 	--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1,sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1 \
 	> placental.mod
     #	and place the removed ones in the non-inf file so phastCons will
     #	truly ignore them:
     echo "monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
         > placental.non-inf
 
     cd ../run.cons/placental
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create template file for "placental" run
     cat << '_EOF_' > template
 #LOOP
 ../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/placental/bed/$(root1)/$(file1).bed}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 ../ss.list single template jobList
     para create jobList
     para try ... check ... push ... etc.
     #	One of these jobs fails to produce any output:
     #	chr5_random/chr5_random.1-357350.bed
 # Completed: 293 of 294 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:      21121s     352.01m     5.87h    0.24d  0.001 y
 # IO & Wait Time:                 33985s     566.42m     9.44h    0.39d  0.001 y
 # Average job time:                 188s       3.13m     0.05h    0.00d
 # Longest finished job:             324s       5.40m     0.09h    0.00d
 # Submission to last job:          3511s      58.52m     0.98h    0.04d
 
     # create Most Conserved track
     ssh kolossus
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
     cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
         awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
             /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/placental
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/placental
     time nice -n +19 hgLoadBed mm9 phastConsElements30wayPlacental \
 	mostConserved.bed
     #	Loaded 1990870 elements of size 5
     #	real    0m48.084s
     # verify coverage
     featureBits mm9 phastConsElements30wayPlacental
     #	111626429 bases of 2620346127 (4.260%) in intersection
 
     #	Create the downloads .pp files, from which the phastCons wiggle data
     #	is calculated
     # currently doesn't matter where this is performed, the san is the same
     # network distance from all machines.
     # sort by chromName, chromStart so that items are in numerical order 
     #  for wigEncode
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
     mkdir downloads
     for D in pp/chr*
 do
     C=${D/pp\//}
     ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
 	> downloads/${C}.placental.pp.data.gz
     echo $D $C done
 done
 
     # Create merged posterior probability file and wiggle track data files
     cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
     ls downloads/chr*.data.gz | xargs zcat \
  | wigEncode -noOverlap stdin phastCons30wayPlacental.wig \
 	phastCons30wayPlacental.wib
 # Converted stdin, upper limit 1.00, lower limit 0.00
 
     ## load table with wiggle data
     ssh hgwdev
     cd /cluster/data/mm9/bed/multiz30way/cons/placental
     cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/placental/*.wi? .
     ln -s `pwd`/phastCons30wayPlacental.wib \
 	/gbdb/mm9/multiz30way/phastCons30wayPlacental.wib
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
 	phastCons30wayPlacental phastCons30wayPlacental.wig
     #	real    0m44.585s
 
     #  Create histogram to get an overview of all the data
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=mm9 phastCons30wayPlacental > histogram.data 2>&1
     #	real    28m24.388s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Mouse Mm9 Histogram phastCons30wayPlacental track"
 set xlabel " phastCons30wayPlacental score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 #############################################################################
 ## Downloads for 30way Conservation (DONE - 2007-11-01 - Hiram)
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores
     cd /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores
     mkdir placental euarchontoglires all
     cd all
     cp -p \
 /san/sanvol1/scratch/mm9/multiz30way/cons/all/phastCons30wayScores/*.data.gz .
     cd ../placental
     cp -p \
 /san/sanvol1/scratch/mm9/multiz30way/cons/placental/downloads/*.data.gz .
     cd ../euarchontoglires
     cp -p \
 /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/downloads/*.data.gz .
 
     #	rebuilt 2007-12-27 to fix difficulty in mafFrags when species.lst
     #	did not have mm9 as the first one
     # upstream mafs (mafFrags takes a while)
     ssh kkstore06
     cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
     # bash script
 #!/bin/sh
 for S in 1000 2000 5000
 do
     echo "making upstream${S}.maf"
     featureBits mm9 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm9 multiz30way \
                 stdin stdout \
                 -orgs=/cluster/data/mm9/bed/multiz30way/species.list \
         | gzip -c > upstream${S}.maf.gz
     echo "done upstream${S}.maf.gz"
 done
 
     md5sum up*.gz >> md5sum.txt
 
     #	quality-annotated mafs for download
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
     #	the relative paths below are resolved from inside the new directory
     cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
     cp -p ../../../qual/maf/*.maf .
     time nice -n +19 gzip *.maf
     #	real    77m3.592s
     time nice -n +19 md5sum *.gz > md5sum.txt
     #	real    4m52.044s
 
     #	annotated mafs for download
     mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
     cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
     cp -p ../../../anno/maf/*.maf .
     time nice -n +19 gzip *.maf
     #	real    86m2.341s
     time nice -n +19 md5sum *.gz > md5sum.txt
     #	real    4m30.087s
 
     #	create syn.net files for downloads for those organisms which
     #	used the mafSynNet in the multiz30way
     ssh kkstore06
     cd /cluster/data/mm9/bed
     for DB in rn4 hg18 rheMac2 ponAbe2 panTro2 equCab1 canFam2 bosTau3
     do
 	cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
 	time nice -n +19 netFilter -syn mm9.${DB}.net.gz \
 	    | gzip -c > mm9.${DB}.syn.net.gz
 	ls -og mm9.${DB}.syn.net.gz
 	md5sum mm9.${DB}.syn.net.gz >> md5sum.txt
     done
     for DB in calJac1 cavPor2 tupBel1 otoGar1 dasNov1 oryCun1 felCat3 \
 	loxAfr1 eriEur1 sorAra1 echTel1
     do
 	cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
 	ls -l mm9.${DB}.rbest.net.gz
 	md5sum mm9.${DB}.rbest.net.gz >> md5sum.txt
 	md5sum mm9.${DB}.rbest.chain.gz >> md5sum.txt
 	grep rbest md5sum.txt
     done
 
     #	create symlinks to make everything show up
     #	NOTE: each name in the list begins with the shell wildcard '?'
     #	instead of a literal letter, and ${DB} is left unquoted below so
     #	the pattern is expanded inside each path.  Presumably this lets one
     #	word match both the blastz.<db> directory and the vs<Db> download
     #	directory, which would differ only in the case of the first letter
     #	(TODO confirm).  Do not quote ${DB} in this loop.
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/mm9
     for DB in ?n4 ?g18 ?heMac2 ?onAbe2 ?anTro2 ?quCab1 ?anFam2 ?osTau3
 do
     ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz
     ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz \
 	vs${DB}/
     ls -Lld vs${DB}/mm9.*.syn.net.gz
 done
     for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
 	?oxAfr1 ?riEur1 ?orAra1 ?chTel1
 do
     ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
     ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
 	vs${DB}/
     ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
     grep rbest vs${DB}/md5sum.txt
 done
     for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
 	?oxAfr1 ?riEur1 ?orAra1 ?chTel1
 do
     ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
     ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
 	vs${DB}/
     ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
     ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz
     ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz \
 	vs${DB}/
     ls -Lld vs${DB}/mm9.${DB}.rbest.chain.gz
     grep rbest vs${DB}/md5sum.txt
 done
 
 ###########################################################################
 #
 #  BUILD miRNA TRACK (DONE - 2007-10-05 - Fan)
     #   updated data from: Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     ssh hgwdev
     cd /cluster/data/mm9/bed
     mkdir miRNA-2007-10-05
     cd miRNA-2007-10-05
     # save the miRNAtrack-mm9.txt file from email
 
     cat miRNAtrack-mm9.txt|sed -e 's/ /\t/g' > miRNA.tab
 
     hgLoadBed mm9 miRNA miRNA.tab
 
 # Add the miRNA section to makeDb/trackDb/mouse/mm9/trackDb.ra
     vi ~/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra
 
 # check previous release track before update
     featureBits mm8 miRNA
     #33398 bases of 2567283971 (0.001%) in intersection
 
     featureBits mm9 miRNA
     #39718 bases of 2620346127 (0.002%) in intersection
 
 ###########################################################################
 #  RE-BUILD miRNA TRACK (DONE - 2008-05-29 - Fan)
     #   updated data from: Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     ssh hgwdev
     cd /cluster/data/mm9/bed
     mkdir miRNA-2008-05-28
     cd miRNA-2008-05-28
     # save the mouse_miRNA_may2008.doc as mouse_miRNA_may2008.txt
     # and replace all blanks with tabs.
 
     cp mouse_miRNA_may2008.txt miRNA.tab
     hgLoadBed mm9 miRNA miRNA.tab
 
 # check previous release track before update
     featureBits mm8 miRNA
     #33398 bases of 2567283971 (0.001%) in intersection
 
     featureBits mm9 miRNA
     #43236 bases of 2620346127 (0.002%) in intersection
 
 #############################################################################
 # N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     cd /cluster/data/mm9/bed/nscan/
     wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.gtf
     wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.prot.fa
     wget http://mblab.wustl.edu/predictions/mouse/mm9/readme.txt
     bzip2 mm9.*
     chmod a-w *
 
     mv ardor.wustl.edu/jeltje/mm9/chr_ptx .
     rm -rf ardor.wustl.edu
     rm chr_*/index.html*
     gzip chr_*/*
     chmod a-w chr_*/*.gz
 
     # load track
     ldHgGene -bin -gtf -genePredExt mm9 nscanGene mm9.gtf.bz2
     hgPepPred mm9 generic nscanPep  mm9.prot.fa.bz2
     rm *.tab
 
     # update trackDb; need a mm9-specific page to describe informants
     mouse/mm9/nscanGene.html   (copy from hg18 and edit)
     mouse/mm9/trackDb.ra
     # changed search regex to
         termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
 
 #########################################################################
 # CPGISLANDS (DONE - 2007-10-25 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/cpgIsland
     cd /cluster/data/mm9/bed/cpgIsland
 
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     #	gcc readseq.c cpg_lh.c -o cpglh.exe
     cd ../..
     ln -s hg3rdParty/cpgIslands/cpglh.exe .
     
     # cpglh.exe requires hard-masked (N) .fa's.  
     #	make the hard masked sequences from these soft masked sequences
     ssh kkstore06
     #	the per-chromosome globs below are relative to the assembly
     #	directory (the next step reads them back as ../../*/chr*.fa.masked
     #	from /cluster/data/mm9/bed/cpgIsland)
     cd /cluster/data/mm9
     time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
     do
 	echo "maskOutFa ${CHR} hard ${CHR}.masked"
 	nice -n +19 maskOutFa ${CHR} hard ${CHR}.masked
     done
     #	about 2 minutes
     #	about 2 minutes
 
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     cd /cluster/data/mm9/bed/cpgIsland
     time for F in ../../*/chr*.fa.masked
     do
 	FA=${F/*\/}
 	C=${FA/.fa.masked/}
 	echo "./cpglh.exe ${FA} > ${C}.cpg"
 	nice -n +19 ./cpglh.exe ${F} > ${C}.cpg
     done > cpglh.out 2>&1 &
     #	about 3 minutes
 
     #	Several chroms have 0 results:
     #	-rw-rw-r--  1     0 Oct 25 11:11 chr16_random.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:12 chr3_random.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:12 chr5_random.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:13 chr7_random.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:13 chrM.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:13 chrX_random.cpg
     #	-rw-rw-r--  1     0 Oct 25 11:13 chrY.cpg
 
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 {
 $2 = $2 - 1;
 width = $3 - $2;
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     #	<< happy emacs
     awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/cpgIsland
     hgLoadBed mm9 cpgIslandExt -tab \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
     #	Reading cpgIsland.bed
     #	Loaded 15963 elements of size 10
     featureBits mm9 cpgIslandExt
     #	10496250 bases of 2620346127 (0.401%) in intersection
     featureBits mm8 cpgIslandExt
     #	10456823 bases of 2567283971 (0.407%) in intersection
     featureBits mm7 cpgIslandExt
     #	10439328 bases of 2583394090 (0.404%) in intersection
     featureBits mm6 cpgIslandExt
     #	10432360 bases of 2597150411 (0.402%) in intersection
     featureBits mm5 cpgIslandExt
     #	10422989 bases of 2615483787 (0.399%) in intersection
     featureBits mm4 cpgIsland
     #	11109692 bases of 2627444668 (0.423%) in intersection
     featureBits mm3 cpgIsland
     #	10102968 bases of 2505900260 (0.403%) in intersection
 
 #############################################################################
 # LIFTOVER (DROPUNDER) TO MM8 (DONE - 2007-11-05 - Hiram)
     ssh kkstore06
     screen	# use a screen to control this job
     # -debug run to create run dir, preview scripts...
     doSameSpeciesLiftOver.pl -debug mm9 mm8 \
       -ooc /san/sanvol1/scratch/mm9/11.ooc
     # Real run:
     cd /cluster/data/mm9/bed/blat.mm8.2007-11-05
     time nice -n +19 doSameSpeciesLiftOver.pl mm9 mm8 \
       -ooc /san/sanvol1/scratch/mm9/11.ooc > do.log 2>&1 &
 
 ########################################################################
 # ANNOTATE 30-WAY ALIGNMENT WITH QUALITY DATA (2007-11-07 rico at bx.psu.edu)
 #
 # The basic idea here is to create a qac file which has quality data for each
 # (chromosome/scaffold/etc) and then index the qac file.  Once this is done,
 # mafAddQRows can be used to add the quality data to a given maf.  The agp
 # files are used so that gaps can be represented in the qac files as a special
 # value.
 
     ## create .qac and .qdx files for each species in the 30-way alignment
     ## results are stored in /cluster/store12/rico/quality
     o human (hg18)
         Unable to find quality data.
 
     o chimp (panTro2)
         in.agp = cat /cluster/data/panTro2/wustl/*.agp > all.agp
         in.qac = /cluster/data/panTro2/bed/quality/qac/panTro2.qac
         qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx
 
     o rhesus (rheMac2)
         in.agp: /cluster/data/rheMac2/downloads
             (cat v1.edit4.chrome.ctgs.final.fix.agp; sed -e 's/^ChrUr/chrUr/' v1.edit4.ChrUr.ctgs.agp ) > all.agp
         in.qa = /cluster/data/rheMac2/qual/rheMac2.qual.qv.gz
         qaAgpToQacIdx in.agp in.qa rheMac2.qac rheMac2.qdx
 
     o bushbaby (otoGar1)
         http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1
         in.agp = assembly.agp
         in.qa = Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa otoGar1.qac otoGar1.qdx
 
     o treeshrew (tupBel1)
         http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1
         in.agp = assembly.agp
         in.qa = Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa tupBel1.qac tupBel1.qdx
 
     o rat (rn4)
         in.agp: /cluster/data/rn4
             #!/bin/sh
 
             rm -f rn4.agp
 
             for chrom in `awk '{print $1}' chrom.sizes`
             do
                 num=`echo $chrom | cut -dr -f2- | cut -d_ -f1`
                 if [ -f "$num/${chrom}.agp" ]; then
                     cat $num/${chrom}.agp >> rn4.agp
                 else
                     echo "Missing agp file for $chrom"
                     exit 1
                 fi
             done
         in.qa: /cluster/data/rn4/downloads
             #!/bin/sh
             # Build rn4.qa from the *.qual.gz files in the current
             # directory: for each file, write a ">chrom" header derived
             # from the file name, followed by the file's contents minus
             # its own first line.
 
             rm -f rn4.qa
 
             for file in *.qual.gz
             do
                 # printf instead of "echo -n", which is not portable
                 # under /bin/sh
                 printf 'Processing %s ... ' "$file"
                 # chrom name = file name without the Rnor3.4 prefix and
                 # the .fa.qual.gz suffix, with '-' mapped to '_'
                 chrom=$(echo "$file" | sed -e 's/^Rnor3.4//;s/\.fa\.qual\.gz$//' | tr '-' '_')
                 # "tail -n +2" replaces the obsolete "tail +2" form
                 (echo ">$chrom" ; gzip -dc "$file" | tail -n +2) >> rn4.qa
                 echo "done."
             done
         qaAgpToQacIdx in.agp in.qa rn4.qac rn4.qdx
 
     o mouse (mm9)
         Unable to find quality data.
 
     o guinea pig (cavPor2)
         in.agp = /cluster/data/cavPor2/downloads/assembly.agp
         in.qa = /cluster/data/cavPor2/downloads/Draft_v2.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa cavPor2.qac cavPor2.qdx
 
     o rabbit (oryCun1)
         http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1
         in.agp = assembly.agp
         in.qa = Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa oryCun1.qac oryCun1.qdx
 
     o shrew (sorAra1)
         in.agp = /cluster/data/sorAra1/downloads/assembly.agp
         in.qa = /cluster/data/sorAra1/downloads/Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa sorAra1.qac sorAra1.qdx
 
     o hedgehog (eriEur1)
         in.agp = /cluster/data/eriEur1/downloads/assembly.agp
         in.qa = /cluster/data/eriEur1/downloads/Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa eriEur1.qac eriEur1.qdx
 
     o dog (canFam2)
         in.agp = /cluster/store9/canFam2/broad/UCSC_Dog2.0.agp
         in.qac = /cluster/store9/canFam2/bed/quality/chrom.qac
         qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx
 
     o cat (felCat3)
         in.agp = /cluster/data/felCat3/downloads/assembly.agp
         in.qa = /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa felCat3.qac felCat3.qdx
 
     o horse (equCab1)
         in.agp = /cluster/data/equCab1/downloads/assembly.agp
         in.qa = /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz
         qaAgpToQacIdx in.agp in.qa equCab1.qac equCab1.qdx
 
     o cow (bosTau3)
         in.agp = /cluster/data/bosTau3/fixup/UCSC.agp
         in.qac = /cluster/data/bosTau3/fixup/chrom.qac
         qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx
 
     o armadillo (dasNov1)
         /cluster/data/dasNov1/broad
             combineQuals assembly.agp.gz assembly.quals.gz combined.quals
             qaAgpToQacIdx assembly.agp.gz combined.quals dasNov1.qac dasNov1.qdx
 
     o elephant (loxAfr1)
         /cluster/data/loxAfr1/broad
             combineQuals assembly.agp assembly.quals.gz combined.quals
             qaAgpToQacIdx assembly.agp combined.quals loxAfr1.qac loxAfr1.qdx
 
     o tenrec (echTel1)
         /cluster/data/echTel1/broad
             combineQuals assembly.agp assembly.quals.gz combined.quals
             qaAgpToQacIdx assembly.agp combined.quals echTel1.qac echTel1.qdx
 
     o opossum (monDom4)
         /cluster/data/monDom4/broad.mit.edu
         in.qa = gzip -dc Monodelphis4.0.agp.chromosome.qual.gz \
             | sed -e 's/^>\([^.]*\)\.1-.*/>chr\1/;/^>.*Monodelphis4.0)/d' > monDom4.qa
         in.agp = Monodelphis4.0.agp
         qaAgpToQacIdx in.agp in.qa monDom4.qac monDom4.qdx
 
     o platypus (ornAna1)
         Unable to find quality data.
 
     o chicken (galGal3)
         Unable to find quality data.
 
     o lizard (anoCar1)
         in.agp = /cluster/data/anoCar1/downloads/assembly.agp
         in.qac = /cluster/data/anoCar1/downloads/scaffold.lifted.qac
         qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx
 
     o frog (xenTro2)
         Unable to find quality data.
 
     o tetraodon (tetNig1)
         Unable to find quality data.
 
     o fugu (fr2)
         Unable to find quality data.
 
     o stickleback (gasAcu1)
         in.agp = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp
         in.qa = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual
         qaAgpToQacIdx in.agp in.qa gasAcu1.qac gasAcu1.qdx
 
     o medaka (oryLat1)
         in.agp = /cluster/data/oryLat1/downloads/chr.agp.txt-fixed
         in.qac = /cluster/data/oryLat1/bed/qual/fixed.chroms.qac
         qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx
 
     o zebrafish (danRer5)
         Unable to find quality data.
 
     o orangutan (ponAbe2)
         Unable to find quality data.
 
     o marmoset (calJac1)
         Unable to find quality data.
 
     ## copy all .qac and .qdx files to the san
     cp *.{qac,qdx} /san/sanvol1/rico/quality
 
     ## create species list (species.lst) containing the following
     anoCar1 /san/sanvol1/rico/quality
     bosTau3 /san/sanvol1/rico/quality
     canFam2 /san/sanvol1/rico/quality
     cavPor2 /san/sanvol1/rico/quality
     dasNov1 /san/sanvol1/rico/quality
     echTel1 /san/sanvol1/rico/quality
     equCab1 /san/sanvol1/rico/quality
     eriEur1 /san/sanvol1/rico/quality
     felCat3 /san/sanvol1/rico/quality
     gasAcu1 /san/sanvol1/rico/quality
     loxAfr1 /san/sanvol1/rico/quality
     monDom4 /san/sanvol1/rico/quality
     oryCun1 /san/sanvol1/rico/quality
     oryLat1 /san/sanvol1/rico/quality
     otoGar1 /san/sanvol1/rico/quality
     panTro2 /san/sanvol1/rico/quality
     rheMac2 /san/sanvol1/rico/quality
     rn4     /san/sanvol1/rico/quality
     sorAra1 /san/sanvol1/rico/quality
     tupBel1 /san/sanvol1/rico/quality
 
     ## the following script will add quality data to each of the mafs
 cat > addQData << 'EOF'
 #!/bin/sh
 
 INPUT_DIR=/cluster/data/mm9/bed/multiz30way/anno/maf
 OUTPUT_DIR=/cluster/data/mm9/bed/multiz30way/qual/maf
 
 for maf in `ls -1Sr ${INPUT_DIR}/*.maf`
 do
     file=`basename $maf`
 
     mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file
 done
 'EOF'
 
 # << emacs
 #########################################################################
 ### IGTC (Int'l GeneTrap Consortium) (DONE - 2007-10-01 - angie)
 ### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab
 
 ### NOTE -- the igtc track is automatically updated on hgwdev by the
 ### scripts monthlyUpdateIgtc.csh and updateIgtc.pl in
 ### kent/src/hg/utils/automation/ .
 
 #########################################################################
 # Load CCDS (2007-12-12 markd)
     # import ccds database as described in ccds.txt
     set db=mm9
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     # build initial version of ccdsMgcMap table, updated by nightly genbank update
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
 
     checkTableCoords ${db} -verbose=2 ccdsGene
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
 
 ############################################################################
 # Reload CCDS (2008-02-01 markd)
     # import ccds database as described in ccds.txt
     set db=mm9
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     # sanity checks on the freshly loaded table
     checkTableCoords ${db} -verbose=2 ccdsGene
     # update all.joiner to include ${db} in ccdsDb
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
     # request push of these tables (names only -- they are not commands):
     #   ccdsGene
     #   ccdsInfo
     #   ccdsKgMap
     # << emacs
 
 
 ############################################################################
 # dbSNP BUILD 128 (DONE 2/8/08 angie)
 # updated snp128ExceptionDesc (tweaked wording) 3/11/08
     # Set up build directory
     ssh kkstore06
     mkdir -p /cluster/store3/dbSNP128/{mouse,shared}
 
     # dbSNP 128 field encodings (*.bcp.gz) were already downloaded -- 
     # see hg18.txt.  
 
     ########################## DOWNLOAD #############################
     cd /cluster/data/dbSNP/128/mouse
     mkdir data schema rs_fasta
     # Get data from NCBI (anonymous FTP)
     wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt
     cd /cluster/data/dbSNP/128/mouse/data
     alias wg wget --timestamping
     set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/database
     # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
     wg $ftpSnpDb/organism_data/b128_SNPContigLoc_37_1.bcp.gz
     wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_37_1.bcp.gz
     wg $ftpSnpDb/organism_data/b128_ContigInfo_37_1.bcp.gz
     # MapInfo has alignment weights
     wg $ftpSnpDb/organism_data/b128_SNPMapInfo_37_1.bcp.gz
     # SNP has univar_id, validation status and heterozygosity
     wg $ftpSnpDb/organism_data/SNP.bcp.gz
 
     # Get schema
     cd /cluster/data/dbSNP/128/mouse/schema
     wg $ftpSnpDb/organism_schema/mouse_10090_table.sql.gz
 
     # Get fasta files
     # using headers of fasta files for molType, class, observed
     cd /cluster/data/dbSNP/128/mouse/rs_fasta
     wg ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/rs_fasta/\*.gz
 
 
     ########################## LOAD NCBI TABLES #############################
     # Simplify names of data files -- strip the build prefix (b128_SNP or
     # b128_), the assembly suffix (_37_1) and the .bcp extension to get
     # local canonical table names, e.g.
     #   b128_SNPContigLoc_37_1.bcp.gz --> ContigLoc.gz
     #   SNP.bcp.gz                    --> SNP.gz
     cd /cluster/data/dbSNP/128/mouse/data
     foreach f (*.bcp.gz)
       # The dot in .bcp is escaped so that sed matches a literal "."
       # rather than any single character.
       set new = `echo $f \
                  | sed -e 's/^b128_SNP//; s/^b128_//; s/_37_1//; s/\.bcp//;'`
       mv $f $new
       echo $new
     end
 
     # Extract just the tables that we need from the NCBI MS SQL table
     # creation file, and get CREATE statements from
     # mouse_10090_table.sql for our 5 tables.
     # The perl filter reads the file in records terminated by
     # "\nGO\n\n\n" and keeps only the CREATE TABLE records for ContigInfo,
     # ContigLoc, ContigLocusId, MapInfo and SNP (with or without the
     # b128_ / b128_SNP prefix and the _37_1 suffix).  For each kept record
     # it strips that prefix and suffix and the square brackets, turns the
     # trailing GO into ";", and rewrites or removes several type names and
     # keywords (see the substitutions below) before printing it.
     # NOTE(review): the parentheses in the "IDENTITY (1, 1) NOT NULL "
     # pattern are not escaped, so perl treats them as a capture group and
     # the pattern cannot match a literal "(1, 1)" -- confirm whether that
     # substitution is needed for these tables before reusing this.
     cd /cluster/data/dbSNP/128/mouse/schema
 
     zcat mouse_10090_table.sql.gz \
     | perl -we '$/ = "\nGO\n\n\n"; \
         while (<>) { \
           next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
           s/b128_(SNP)?//; s/_37_1//; \
           s/[\[\]]//g;  s/GO\n\n\n/;/;  s/smalldatetime/datetime/g; \
           s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
           s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
           s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
           s/(image|varchar\s+\(\d+\))/BLOB/g; \
           print; \
         }' \
       > table.sql
 
     # load on kolossus or a small cluster machine (mysql5 is OK for this).
     ssh kkr3u00
     hgsql '' -e 'create database mm9snp128' 
     cd /cluster/data/dbSNP/128/mouse/schema
     hgsql mm9snp128 < table.sql
     cd ../data
 
     foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
       zcat $t.gz \
       | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
       | hgLoadSqlTab -oldTable mm9snp128 $t placeholder stdin
     end
     # There were some warnings (many cleared up by the perl substitution)
     # but no rows were dropped.  I eyeballed a few examples, seemed OK,
     # e.g. no value given for a field where NULL is OK.
     foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) 
      echo -n "${t}:\t"
       hgsql -N -B mm9snp128 -e 'select count(*) from '$t
     end
 #ContigInfo:     13636
 #ContigLoc:      31733892
 #ContigLocusId:  12883378
 #MapInfo:        28464204
 #SNP:            14380527
 
     # compare contig list in mm9.ctgPos vs ContigInfo (for the reference
     # strain, not the alts included in ContigInfo)
     # The remote command is passed to ssh as one quoted string: ssh joins
     # its arguments and hands them to the remote shell, so an unquoted
     # "*" and ";" would be interpreted there instead of reaching hgsql.
     # The sed strips the ".N" version suffix from the contig accession.
     ssh hgwdev "hgsql mm9 -NBe 'select * from ctgPos;'" \
       | sed -re 's/^(N[A-Z]_[0-9]+)\.[0-9]+/\1/;' \
       > ctgPos.tab
     # first column only, sorted, for the diff against ContigInfo
     awk '{print $1;}' ctgPos.tab | sort > /tmp/1
 
     # Take a look at the group_label values and choose a set that matches
     # the reference assembly:
     hgsql mm9snp128 -NBe 'select distinct(group_label) from ContigInfo'
     # Looks like just ref_strain will do.
 
     hgsql mm9snp128 -NBe 'select contig_acc from ContigInfo \
                           where group_label = "C57BL/6J"' \
     | sort > /tmp/2
     diff /tmp/1 /tmp/2 
     # No diff, good.
     # Make sure there are no orient != 0 contigs among those selected.
     hgsql mm9snp128 -NBe \
       'select count(*) from ContigInfo where orient != 0 and \
          group_label = "C57BL/6J";'
 #0
 
     #################### EXTRACT INFO FROM NCBI TABLES ####################
     mkdir -p /scratch/snp/128/mouse
     cd /scratch/snp/128/mouse
 
     time hgsql mm9snp128 -e \
       'alter table ContigLoc  add index (ctg_id); \
        alter table ContigInfo add index (ctg_id);'
 #0.002u 0.001s 6:18.71 0.0%      0+0k 0+0io 1pf+0w
 
     time hgsql mm9snp128 -e \
       'alter table ContigInfo add index (group_label(9));'
 #0.002u 0.002s 0:00.35 0.0%      0+0k 0+0io 1pf+0w
 
     # Since there is only one group_label for mouse, just use snp_id
     # as key.  If there is more than one group_label to pick up, then
     # don't use this as a template -- use hg18.txt.
     hgsql mm9snp128 -NBe \
       'select snp_id, ContigInfo.contig_acc, asn_from, asn_to, \
               loc_type, orientation, allele, phys_pos_from \
        from ContigLoc, ContigInfo \
        where ContigLoc.ctg_id = ContigInfo.ctg_id and  \
              ContigInfo.group_label = "C57BL/6J";' \
       | sort \
       > ucscContigLoc.txt
     # took ~7 minutes
     # The IDs are non-unique (can be multiply mapped).  This is OK if 
     # everything else that we relate to these uniquely maps to snp_id.
     wc -l ucscContigLoc.txt
 #16232825 ucscContigLoc.txt
     awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
 #14304640
 
     # SNP -> valid, avHet, avHetSE
     # SNP has only snp_id as identifier, nothing relating to assembly.
     hgsql mm9snp128 -NBe \
       'select snp_id, validation_status, avg_heterozygosity, het_se \
        from SNP;' \
     | sort \
       > ucscSNP.txt
     # Check ID uniqueness:
     wc -l ucscSNP.txt
 #14380527 ucscSNP.txt
     awk '{print $1;}' ucscSNP.txt | uniq | wc -l
 #14380527
 
     # ContigLocusId -> func
     # ContigLocusId has only snp_id as an identifier (it gives one 
     # example contig if the SNP is on multiple contigs).  
     # The sort options and awk are to convert multiple entries with different
     # function classes for the same SNP into one entry per SNP with a list
     # of function classes.
     hgsql mm9snp128 -NBe \
       'select snp_id, fxn_class from ContigLocusId;' \
     | sort -u -k1,1 -k2,2n  \
     | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
             else { if (prevId) {print prevId "\t" prevFunc;} \
                                 prevFunc = $2 ","; }} \
            {prevId = $1;} \
            END {print prevId "\t" prevFunc;}' \
       > ucscFunc.txt
     # Check ID uniqueness:
     wc -l ucscFunc.txt
 #5878591 ucscFunc.txt
     awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
 #5878591
 
     # MapInfo -> weight
     # MapInfo needs assembly+snp_ids in order to have unique IDs.
     time hgsql mm9snp128 -e \
       'alter table MapInfo add index (assembly(9));'
 #0.000u 0.004s 2:22.64 0.0%      0+0k 0+0io 0pf+0w
     hgsql mm9snp128 -NBe \
       'select snp_id, weight from MapInfo where assembly = "C57BL/6J";' \
       | sort \
       > weight.txt
     # ~1 minute
     # Check ID uniqueness:
     wc -l weight.txt
 #14304640 weight.txt
     awk '{print $1;}' weight.txt | uniq | wc -l
 #14304640
     awk '{print $2;}' weight.txt | sort -n | uniq -c
 #13954580 1
 # 113119 2
 # 169755 3
 #  67186 10
     # SNPs w/weight 0 and 10 will be discarded later.
 
     # fasta headers -> observed, molType, class
     zcat /cluster/data/dbSNP/128/mouse/rs_fasta/rs_ch*.fas.gz \
     | grep '^>gnl' \
     | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
     | sort \
       > ucscGnl.txt
     # ~4 minutes
     wc -l ucscGnl.txt
 #14380527 ucscGnl.txt
     awk '{print $1;}' ucscGnl.txt | uniq | wc -l
 #14380527
 
     ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
     # Join files by ID.  
     time join -a 1 -e MISSING -t '	' ucscContigLoc.txt weight.txt \
       > ucscCL+w.txt
 #26.811u 4.091s 1:02.59 49.3%    0+0k 0+0io 0pf+0w
     wc -l ucscCL+w.txt 
 #16232825 ucscCL+w.txt
     # Same as ucscContigLoc.txt above, good.
     # Any missing weights?
     grep MISSING ucscCL+w.txt | head
     # No output, good.
 
     # Join the files with SNP-only IDs.
     time join -e MISSING -t '	' ucscGnl.txt ucscSNP.txt \
       > ucscG+S.txt
 #16.591u 1.935s 0:28.44 65.1%    0+0k 0+0io 0pf+0w
     wc -l ucscG+S.txt
 #14380527 ucscG+S.txt
     # Same as ucscSNP.txt and ucscGnl.txt above.
     grep MISSING ucscG+S.txt | wc -l
 #0
     time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
       -t '	' ucscG+S.txt ucscFunc.txt \
       > ucscG+S+F.txt
 #17.438u 2.115s 0:24.83 78.6%    0+0k 0+0io 0pf+0w
     wc -l ucscG+S+F.txt
 #14380527 ucscG+S+F.txt
     grep MISSING ucscG+S+F.txt | wc -l 
 #8501936
     # Not surprising -- ucscFunc.txt has only 5878591 lines.
     expr 14380527 - 5878591
 #8501936
 
     # Final join -- treat ContigLoc as authoritative (since it has coords).
     # Arrange columns in same order as in the SNP table, with extras for
     # checking at the end (phys_pos_from).
     # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
     time join -a 1 -e MISSING -t '	' \
   -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
       ucscCL+w.txt ucscG+S+F.txt \
       > ucscNcbiSnp.ctg.txt
 #41.401u 6.045s 1:02.04 76.4%    0+0k 0+0io 0pf+0w
     wc -l ucscNcbiSnp.ctg.txt
 #16232825 ucscNcbiSnp.ctg.txt
     grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
 #8432812
     # a bit less than the 8501936 missing FUNC's above... perhaps some
     # of those did not have any mappings in ucscContigLoc.txt.
 
     # Lift the map contig coordinates to chrom coordinates (~2m);
     sed -re 's/\t(N[A-Z]_[0-9]+)\.[0-9]+\t/\t\1\t/;' \
       /cluster/data/mm9/jkStuff/mm9.contigs.lift > liftContigs.lft
     time liftUp ucscNcbiSnp.bed liftContigs.lft warn ucscNcbiSnp.ctg.txt
 #131.007u 7.438s 2:26.48 94.5%   0+0k 0+0io 0pf+0w
     wc -l ucscNcbiSnp.bed
 #16232825 ucscNcbiSnp.bed
 
     # At this point, move back from /scratch to /cluster/data.
     nice gzip ucscNcbiSnp.bed
     cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/mouse/
 
     # Translate NCBI's encoding into UCSC's, and perform a bunch of
     # checks.  This is where developer involvement is most likely as
     # NCBI extends the encodings used in dbSNP.
     cd /cluster/data/dbSNP/128/mouse/
     gunzip ucscNcbiSnp.bed.gz
     time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/mm9/mm9.2bit \
       snp128
 #count of snps with weight  0 = 0
 #count of snps with weight  1 = 13954580
 #count of snps with weight  2 = 226238
 #count of snps with weight  3 = 712684
 #count of snps with weight 10 = 1339323
 #Found no errors.
 #162.963u 9.783s 3:02.77 94.5%   0+0k 0+0io 1pf+0w
     wc -l snp*
 #  14893502 snp128.bed
 #        22 snp128.sql
 #         0 snp128Errors.bed
 #        18 snp128ExceptionDesc.tab
 #   1898314 snp128Exceptions.bed
 
     # Make one big fasta file.  (note: snp126 skipped chrUn... but it's small
     # compared to chr1, chr2 etc.)
     # Some of the fasta files have SNPs that were not mapped to the reference
     # assembly.  Make sure there is no overlap with snp128.bed, and then
     # move them out of the way.
     zcat rs_fasta/rs_chNotOn.fas.gz \
     | perl -we 'while (<>) { \
                   next unless /^>gnl/; s/^>gnl.dbSNP.(rs\d+).*/$1/; print; }' \
     | sort | grep -Fwf - snp128.bed | head
     ^chNotOn^chAltOnly
     # No output from either command -- good.
     mkdir rs_fasta/omitted
     mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/omitted/
 
     zcat rs_fasta/rs_ch*.fas.gz \
     | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
       > snp128.fa
     # Check for duplicates.
     grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders
     wc -l /scratch/tmp/seqHeaders
 #14304640 /scratch/tmp/seqHeaders
     uniq /scratch/tmp/seqHeaders | wc -l
 #14304640
     # Use hgLoadSeq to generate .tab output for sequence file offsets,
     # and keep only the columns that we need: acc and file_offset.
     # Index it and translate to snpSeq table format.
     time hgLoadSeq -test placeholder snp128.fa
 #42.866u 4.977s 0:48.09 99.4%    0+0k 0+0io 4pf+0w
     cut -f 2,6 seq.tab > snp128Seq.tab
     rm seq.tab
 
     ssh hgwdev
     # Load up main track tables.
     cd /cluster/data/dbSNP/128/mouse
     time nice hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \
       mm9 snp128 -sqlTable=snp128.sql snp128.bed
 #Loaded 14893502 elements of size 17
 #67.395u 12.818s 8:43.01 15.3%   0+0k 0+0io 0pf+0w
     sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \
       > snp128Exceptions.sql
     time nice hgLoadBed -tab -onServer -tmpDir=/scratch/tmp \
       mm9 snp128Exceptions -sqlTable=snp128Exceptions.sql \
       snp128Exceptions.bed
 #Loaded 1898314 elements of size 5
 #8.925u 1.354s 0:52.66 19.5%     0+0k 0+0io 0pf+0w
     sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
       > snp128ExceptionDesc.sql
     # 3/11/08: reloaded snp128ExceptionDesc (tweaked wording)
     hgLoadSqlTab mm9 snp128ExceptionDesc snp128ExceptionDesc.sql \
       snp128ExceptionDesc.tab
     # Load up sequences.
     sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \
       > snp128Seq.sql
     mkdir -p /gbdb/mm9/snp
     ln -s /cluster/data/dbSNP/128/mouse/snp128.fa /gbdb/mm9/snp/snp128.fa
     time nice hgLoadSqlTab mm9 snp128Seq snp128Seq.sql snp128Seq.tab
 #0.000u 0.003s 3:02.66 0.0%      0+0k 0+0io 0pf+0w
     # Put in a link where one would expect to find the track build dir...
     ln -s /cluster/data/dbSNP/128/mouse /cluster/data/mm9/bed/snp128
 
 #########################################################################
 # BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11,12 - Hiram)
     ssh kkstore06
     screen # use a screen to manage this multi-day job
     mkdir /cluster/data/mm9/bed/blastzBosTau4.2008-03-11
     cd /cluster/data/mm9/bed/blastzBosTau4.2008-03-11
 
     cat << '_EOF_' > DEF
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Cow bosTau4
 SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
 SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
 # Maximum number of scaffolds that can be lumped together
 SEQ2_LIMIT=200
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzBosTau4.2008-03-11
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	    -syntenicNet > do.log 2>&1 &
     #	real    460m51.297s
     cat fb.mm9.chainBosTau4Link.txt
     #	690095394 bases of 2620346127 (26.336%) in intersection
 
     mkdir /cluster/data/bosTau4/bed/blastz.mm9.swap
     cd /cluster/data/bosTau4/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzBosTau4.2008-03-11/DEF \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	    -swap -syntenicNet > swap.log 2>&1 &
     #	real    117m39.571s
     cat fb.bosTau4.chainMm9Link.txt
     #	707444627 bases of 2731830700 (25.896%) in intersection
 
 #######################################################################
 # BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-04-14 - Hiram)
     ssh kkstore06
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
     cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
 
     cat << '_EOF_' > DEF
 # Mouse vs. Lamprey
 
 # using the "distant" genome alignment parameters
 #	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Mouse
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=1
 
 # QUERY: Lamprey petMar1
 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
 SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzPetMar1.2008-04-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=pk > do.log 2>&1 &
 
     cat fb.mm9.chainPetMar1Link.txt
     #	29113438 bases of 2620346127 (1.111%) in intersection
 
     #	That is OK, now for the swap:
     mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
     cd /cluster/data/petMar1/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
     #	real    33m29.076s
     cat  fb.petMar1.chainMm9Link.txt
     #	26052507 bases of 831696438 (3.132%) in intersection
 
 #######################################################################
 # BLASTZ/CHAIN/NET Lancelet braFlo1 (DONE - 2008-04-14 - Hiram)
     ssh kkstore06
     screen # use screen to control this job
     mkdir /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
     cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
 
     cat << '_EOF_' > DEF
 # Mouse vs. Lancelet
 
 # using the "distant" genome alignment parameters
 #	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Mouse
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=1
 
 # QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
 #       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
 SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
 SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
 SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
 SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=kk > do.log 2>&1 &
     #	real    408m36.691s
     cat fb.mm9.chainBraFlo1Link.txt
     #	26725980 bases of 2620346127 (1.020%) in intersection
 
     #	That is OK, now for the swap:
     mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
     cd /cluster/data/braFlo1/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
     #	real    12m23.402s
     cat  fb.braFlo1.chainMm9Link.txt
     #	31517169 bases of 923355587 (3.413%) in intersection
 
 ###########################################################################
 #  LOAD Transcriptome data (DONE - 2008-05-06 - Hiram)
     # data from Christian Iseli 'Christian.Iseli at licr.org'
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/transcriptome
     cd /cluster/data/mm9/bed/transcriptome
     # Fetch the transcript models (GTF) and the txg archive.
     wget --timestamping ftp://ftp.licr.org/pub/MTr.gtf.gz
     wget --timestamping ftp://ftp.licr.org/pub/txg.tar.gz
     # Convert the GTF to extended genePred and load it as the
     # transcriptome table.  The input name must match the downloaded
     # file exactly (MTr.gtf.gz, mixed case).
     gtfToGenePred -genePredExt MTr.gtf.gz MTr.gp
     hgLoadGenePred mm9 transcriptome -genePredExt MTr.gp
 
     tar xvzf txg.tar.gz
     # Do a little data cleanup and transformation and
     #	load splice graphs into database.
     sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \
 	> sibTxGraph.sql
     cat txg/*.txg | txgToAgx stdin stdout \
 	| hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm9 sibTxGraph stdin
     #	Loaded 52065 elements of size 18
 
    # Create sibAltEvents track for analysed alt-splices.
    cat txg/*.txg \
 	| txgAnalyze stdin /cluster/data/mm9/mm9.2bit stdout \
 	| awk '$2 >= 0' | sort | uniq > sibAltEvents.bed
    hgLoadBed mm9 sibAltEvents sibAltEvents.bed
 
 #############################################################################
 # BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-17 - larrym)
     ssh kkstore04
     screen #	use screen to control this multi-day job
     mkdir /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
     cd /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
     cat << '_EOF_' > DEF
 # Mouse vs. Horse
 
 BLASTZ_M=50
 
 # TARGET: Mouse MM9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes 
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Horse
 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
 SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
 SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
 SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
 SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastz.equCab2.2008-04-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl `pwd`/DEF \
 	-verbose=2 -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &
 
     ln -s blastz.equCab2.2008-04-15 /cluster/data/mm9/bed/blastz.equCab2
 
 ############################################################################
 # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
     # import ccds database as described in ccds.txt
     set db=mm9
     set ncbiBld=37.1
     # create and load ccdsGene and ccdsInfo tables from imported database
     # (this run also passes the NCBI build number to ccdsMkTables)
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     # sanity checks on the freshly loaded table
     checkTableCoords ${db} -verbose=2 ccdsGene
     # update all.joiner to include ${db} in ccdsDb
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
     # request push of these tables (names only -- they are not commands):
     #   ccdsGene
     #   ccdsInfo
     #   ccdsKgMap
     # << emacs
 ############################################################################
 #  update vega genes to version 31 (v49 of Ensembl genes)
 #	(DONE - 2008-05-15 - Hiram)
     # Work in a fresh directory and fetch the Vega mouse GTF, changelog,
     # catalog and peptide files.
     mkdir  /cluster/data/mm9/bed/vega31_49
     cd  /cluster/data/mm9/bed/vega31_49
     wget --timestamping \
 	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
     wget --timestamping \
 	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/CHANGELOG.gz"
     wget --timestamping \
 	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/catalog.txt"
     wget --timestamping \
 "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/Mus_musculus.VEGA.apr.pep.tot.fa.gz"
 
     #	processing similar to the same processing for Ensembl genes,
     #	from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
     # The sed prefixes "chr" to sequence names that start with a digit,
     # X or Y, and renames MT to chrM; liftUp then applies
     # randoms.mm9.lift (copied from the Ensembl build) to the GTF.
     cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
     zcat gtf_file.gz \
         | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
         | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
         | gzip > allGenes.gtf.gz
 
     # Convert everything to genePred once, as a check of the whole set.
     gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
 	| gzip > mm9.allGenes.gp.gz
     /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \
 	infoOut.txt > ensGtp.tab
     genePredCheck -db=mm9 mm9.allGenes.gp.gz
     #	checked: 54208 failed: 0
     # Split on the case-insensitive string "pseudo" anywhere in the GTF
     # line: matching lines become the pseudogene set, the rest the gene
     # set.  The two counts below (3989 + 50219) add up to the 54208
     # checked above.
     zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
     zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
     gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
     gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
     genePredCheck -db=mm9 pseudo.gp
     #	checked: 3989 failed: 0
     genePredCheck -db=mm9 not.pseudo.gp
     #	checked: 50219 failed: 0
     # Load the two sets as separate tables.
     hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
     hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
 
 ############################################################################
 # BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate)
 
     ssh kkstore06
     cd /cluster/data/mm9/bed
     mkdir blastzSpeTri0.2008-05-16
     cd blastzSpeTri0.2008-05-16
 
     cat << '_EOF_' > DEF
 # Mouse vs. Ground squirrel
 
 BLASTZ_M=50
 
 # TARGET: Mouse MM9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes 
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Ground squirrel speTri0
 SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit
 SEQ2_LEN=/cluster/data/speTri0/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LIMIT=500
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzSpeTri0.2008-05-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
       -chainMinScore=3000 -chainLinearGap=medium >& do.log &
 
     ln -s blastzSpeTri0.2008-05-16 /cluster/data/mm9/bed/blastz.speTri0
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastz.speTri0
     /cluster/bin/scripts/doRecipBest.pl mm9 speTri0 >&! rbest.log &
 
     # failed coverage check, shouldn't be fatal ?
     # resume creating axt's and maf's
     # use axtChain/doRecipBest.csh to create resume.csh
 
     ssh kkstore06
     cd /cluster/data/mm9/bed/blastz.speTri0/axtChain
     csh resume.csh >&! resume.log &
 
     ssh hgwdev
     cd /cluster/data/mm9/bed/blastz.speTri0
     featureBits mm9 chainSpeTri0Link > fb.mm9.chainSpeTri0Link.txt
     cat fb.mm9.chainSpeTri0Link.txt
     # 673393210 bases of 2620346127 (25.699%) in intersection
 
 #################
 # Rodent multiz (mouse, guinea pig, ground squirrel) 
 # for Jurgen Schmitz (2008-06-07 kate)
 # Redo with unfiltered net mafs, to maximize squirrel sequence
 
     ssh kkstore06
     mkdir /cluster/data/mm9/bed/multiz3way
     cd /cluster/data/mm9/bed/multiz3way
     # one subdirectory of links to pairwise mafs per non-reference species
     mkdir mafLinks
     mkdir mafLinks/cavPor3
     cd mafLinks/cavPor3
     # high quality mammalian genome, so use syntenic net
     ln -s ../../../blastz.cavPor3/mafSynNet/*.maf.gz .
     mkdir ../speTri0
     cd ../speTri0
     # low coverage genome, so use reciprocal best
     #ln -s ../../../blastz.speTri0/mafRBestNet/*.maf.gz .
     # redo with unfiltered net mafs, to get more squirrel sequence
     # NOTE(review): this note originally said "maftNet", which does not
     # follow the mafSynNet / mafRBestNet naming used above; presumably
     # the unfiltered net mafs are in mafNet -- verify the directory.
     ln -s ../../../blastz.speTri0/mafNet/*.maf.gz .
 
     #	Copy MAFs to kluster-friendly disk 
     mkdir -p /san/sanvol1/scratch/mm9/multiz3way
     cd /san/sanvol1/scratch/mm9/multiz3way
     rsync -a --copy-links --progress \
 	/cluster/data/mm9/bed/multiz3way/mafLinks/ .
 
     # get latest PSU utilities
     mkdir penn
     set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
     cp -p $p/{autoMZ,multiz,maf_project} penn
 
     # the autoMultiz cluster run
     ssh pk
     cd /cluster/data/mm9/bed/multiz3way
 
     # create species list and stripped down tree for autoMZ
     cat > tree.nh << 'EOF'
 ((mm9 cavPor3) speTri0)
 'EOF'
     cat > species.lst << 'EOF'
 mm9 cavPor3 speTri0
 'EOF'
     mkdir run maf
     cd run
 
     # autoMultiz is the per-chromosome job script (csh):
     #   $1 = chromosome name, used to pick $c.maf(.gz) for each species
     #   $2 = path where the finished multiple alignment is copied
     # It works in a private /scratch/tmp/mm9/multiz.$c directory: it
     # copies ../tree.nh and ../species.lst there, then for every species
     # other than mm9 stages the pairwise maf as mm9.$s.sing.maf
     # (unzipping $c.maf.gz, copying $c.maf, or writing a bare maf header
     # line when neither exists), runs autoMZ with the tree, copies the
     # resulting $c.maf to $2 and removes the temporary directory.
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = mm9
 set c = $1
 set maf = $2
 set binDir = /san/sanvol1/scratch/$db/multiz3way/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz3way
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
     # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 ./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz3way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 35 jobs
     para try
 
     para check
 
 #Completed: 35 of 35 jobs
 #CPU time in finished jobs:       6086s     101.43m     1.69h    0.07d  0.000 y
 #IO & Wait Time:                   240s       4.00m     0.07h    0.00d  0.000 y
 #Average job time:                 181s       3.01m     0.05h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:             502s       8.37m     0.14h    0.01d
 #Submission to last job:           506s       8.43m     0.14h    0.01d
 
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/mm9
     mkdir multizRodent3way
     cd multizRodent3way
     ln -s /cluster/data/mm9/bed/multiz3way/maf .
     cat > README.txt << 'EOF'
 This directory contains multiple alignments of 2 rodent genome
 assemblies to the mouse genome (mm9, July 2007):
 
     _ guinea pig         Cavia porcellus                Feb. 2008, cavPor3
     _ ground squirrel    Spermophilus tridecemlineatus  Jun. 2006, speTri0
 
 'EOF'
 # << emacs
 
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 ############################################################################
 
 #########################################################################
 # ORegAnno - Open Regulatory Annotations
 # loaded July 7, 2008
 # updated Sept 29, 2008
 # loaded by Belinda Giardine, in same manner as hg18 ORegAnno track
 
 
 ############################################################################
 # JAX/MGI TRACKS (DONE 8/20/09 angie)
 # Previously done 6/11/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (pushed)
 # Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_04 (not pushed)
 # Previously done 9/24/08 in /cluster/data/mm9/bed/jax/2008_09
     mkdir -p /hive/data/genomes/mm9/bed/jax/2009_08
     cd /hive/data/genomes/mm9/bed/jax/2009_08
     wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
     wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
 
     # Jax Rep Transcript track
     # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
     # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
     # -- aliases ~ MGI:\d+
     # Use simple perl script to uniquify transcript names and make alias.tab.
     # (Copied /hive/data/genomes/mm8/bed/jax/2007_07/parseRepTranscript.pl and
     # modified to tweak a regex for tweaked name NR_027008_Gt(ROSA)26Sor_1)
     ../2009_06/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
     | sed -e 's/^/chr/; s/chrMT/chrM/;' \
       > jaxRepTranscript.gff
 
     # Jax Allele track
     # AL_*.gff --> jaxAllele{,Info}
     # -- bed12Source -- add type from filename
     # -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
     # -- Info: name, mgiID, source {"Gene trapped", ...}
     rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
     foreach f (AL*.gff)
       set type = `echo $f:t:r \
         | sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \
             s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \
             s/TRANS/Transgenic/;'`
       /hive/data/genomes/mm8/bed/jax//2007_09/parseAllele.pl $f \
       | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
       | /cluster/bin/scripts/genePredToBed \
       | sed -e 's/^/chr/; s/$/'"\t$type"'/;' \
       >> jaxAllele.bed
     end
     # This round's formatting inconsistencies:
 #source not given for NM_010230_Fmn1<ld-Is(17_In2)1Gso>
 #source not given for NM_015770_a<jIs(17_In2)1Gso>
 #source not given for NM_009521_Wnt3<In(11Trp53_11Wnt3)8Brd>
 #source not given for NM_001127233_Trp53<In(11Trp53_11Wnt3)8Brd>
 #source not given for NM_029931_Mllt3<T(4Mllt3_9Mll)1Thr>
 
     # Jax Phenotype track
     # MP_*.gff --> jaxPhenotype{,Alias}
     # -- bed12Source -- add type from filename
     # -- names like NM_001001488_Atp8b1
     rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
     # For each MP_<id>_<name>.gff file: derive a display "type" from the
     # file name, then convert the GFF to bed12 and append the type as a
     # 13th column of jaxPhenotype.bed.
     foreach f (MP_*.gff)
       # Type derivation: strip the MP_<digits>_ prefix, CamelCase the rest,
       # then map the long MGI category names to short display names.
       # Names that need no mapping must match the final whitelist exactly;
       # anything else dies with "Unrec ..." on stderr.
       # (The whitelist was previously written as m/^A|B|...|$/ which, lacking
       # grouping and ending in a bare "|$" alternative, matched every string
       # so the die could never trigger.)
       set type = `echo $f:t:r \
         | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
                     s@AdiposeTissue@Adipose@ || \
                     s@BehaviorNeurological@Behavior@ || \
                     s@CardiovascularSystem@Cardiovascular@ || \
                     s@DigestiveAlimentary@Digestive@ || \
                     s@EndocrineExocrineGland@Gland@ || \
                     s@GrowthSize@Growth Size@ || \
                     s@HearingEar@Hearing/Ear@ || \
                     s@HematopoieticSystem@Hematopoietic@ || \
                     s@HomeostasisMetabolism@Homeostasis@ || \
                     s@ImmuneSystem@Immune@ || \
                     s@LethalityEmbryonicPerinatal@Embryonic Lethal@ || \
                     s@LethalityPostnatal@Postnatal Lethal@ || \
                     s@LifeSpanPostWeaningAging@Life Span@ || \
                     s@LimbsDigitsTail@Limbs and Tail@ || \
                     s@LiverBiliarySystem@Liver and Bile@ || \
                     s@NervousSystem@Nervous System@ || \
                     s@RenalUrinarySystem@Renal/Urinary@ || \
                     s@ReproductiveSystem@Reproductive@ || \
                     s@RespiratorySystem@Respiratory@ || \
                     s@SkinCoatNails@Skin/Coat/Nails@ || \
                     s@TasteOlfaction@Taste/Smell@ || \
                     s@TouchVibrissae@Touch@ || \
                     s@Tumorigenesis@Tumorigenesis@ || \
                     s@VisionEye@Vision/Eye@ || \
                     m/^(Craniofacial|Cellular|Embryogenesis|Muscle|Normal|Other|Pigmentation|Skeleton)$/ || \
                     die "Unrec $_";'`
       echo $type
       # GFF -> genePred -> bed12; prefix "chr" to the chrom name and append
       # the tab-separated type to every line.
       /hive/data/genomes/mm8/bed/jax/2006_10/parsePhenotype.pl $f \
       | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
       | /cluster/bin/scripts/genePredToBed \
       | sed -e 's/^/chr/; s@$@'"\t$type"'@;' \
       >> jaxPhenotype.bed
     end
     sort -u jaxPhenotypeAlias.tab > tmp
     mv tmp jaxPhenotypeAlias.tab
 
     # Jax QTL track
     # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
     # and CM distance for 2, or those plus flanking markers for 3...
     cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff 
     # No output, so skip this part:
     if (0)
       perl -wpe 'chomp; s/\s*$//; \
         ($c, undef, undef, $start, $end, undef, $strand, undef, $info) = \
           split("\t"); \
         if ($info =~ /QTL (\S+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \
           ($name, $mgiID, $desc) = ($1, $2, $3); \
         } else { die "parse\n$info"; } \
         if ($start > $end) { $tmp = $end; $end = $start; $start = $tmp; } \
         $start-- unless $start == 0; \
         s/^.*$/chr$c\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
         MGI_QTL.gff > jaxQtl.bed
     endif
 
     # Extract phenotype-allele relationships:
     # Make a file for the one code not already in a filename:
     cp /dev/null MP_0003012_no_phenotypic_analysis
     # Wrote a script to extract the phenotype-allele relationships --
     # it uses the filenames to map MP:* codes to our phenotype names.
     /hive/data/genomes/mm8/bed/jax/2007_07/parsePhenotypicAllele.pl \
       MGI_PhenotypicAllele.rpt > jaxAllelePheno.tab
     # The file "err" has messages about missing data (no gene name in 
     # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).
 
     # Load tables
     # jaxRepTranscript
     ldHgGene mm9 jaxRepTranscript jaxRepTranscript.gff
 #35505 gene predictions
     hgsql mm9 < fixJaxRepTranscript.sql
     hgLoadSqlTab mm9 jaxRepTranscriptAlias \
       ~/kent/src/hg/lib/genericAlias.sql jaxRepTranscriptAlias.tab
     checkTableCoords mm9 jaxRepTranscript
     # jaxAllele
     hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
       mm9 jaxAllele jaxAllele.bed
 #Loaded 15904 elements of size 13
     # fixJaxAllele.sql is empty so don't need to do this:
     # hgsql mm9 < fixJaxAllele.sql
     hgLoadSqlTab mm9 jaxAlleleInfo \
       ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
     # jaxPhenotype
     hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
       -tab mm9 jaxPhenotype jaxPhenotype.bed
 #Loaded 32922 elements of size 13
     # fixJaxPhenotype.sql is empty so don't need to do this:
     # hgsql mm9 < fixJaxPhenotype.sql
     hgLoadSqlTab mm9 jaxPhenotypeAlias \
       ~/kent/src/hg/lib/genericAlias.sql jaxPhenotypeAlias.tab
     # jaxQtl
     cmp MGI_QTL.gff ../2009_06/MGI_QTL.gff 
     # No output ==> no data change, skip the following lines:
 #    hgLoadBed -tab -notItemRgb -noBin \
 #      -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \
 #      mm9 jaxQtl jaxQtl.bed
     checkTableCoords -verbose=2 mm9 jaxQtl
     # No output, good.
     # phenotype-allele relationships
     hgLoadSqlTab mm9 jaxAllelePheno \
       ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
 
     # Check joiner:
     runJoiner.csh mm9 jaxRepTranscript
     runJoiner.csh mm9 jaxAllele
     runJoiner.csh mm9 jaxPhenotype
 
 
 ############################################################################
 # WOLD RNA-seq
 #
 # wig files:  bed format, 25mers
 ave mm9Brain.wig
 #min=1, max=12989, median=6
 #7.4M reads
 
 woldRnaSeqBrain
 
 ##########################################################################
 # Fix equCab2 nets and chains to remove duplicate scaffold_34 (DONE - 2008-08-19 - larrym)
 
 # NOTE(review): the database argument was recorded here as hg18, but the
 # tables listed below (chr1-chr19, chr*_random, chrUn_random; no chr20-22)
 # are the mm9 chainEquCab2/netEquCab2 tables -- confirm.
 fixChainNetEquCab2 mm9
 
 deleted:	3100 from chr1_chainEquCab2
 deleted:	7362 from chr10_chainEquCab2
 deleted:	8472 from chr11_chainEquCab2
 deleted:	1078 from chr12_chainEquCab2
 deleted:	2227 from chr13_chainEquCab2
 deleted:	2 from chr13_random_chainEquCab2
 deleted:	3605 from chr14_chainEquCab2
 deleted:	6773 from chr15_chainEquCab2
 deleted:	3400 from chr16_chainEquCab2
 deleted:	0 from chr16_random_chainEquCab2
 deleted:	3741 from chr17_chainEquCab2
 deleted:	3 from chr17_random_chainEquCab2
 deleted:	334 from chr18_chainEquCab2
 deleted:	5620 from chr19_chainEquCab2
 deleted:	5 from chr1_random_chainEquCab2
 deleted:	23003 from chr2_chainEquCab2
 deleted:	1265 from chr3_chainEquCab2
 deleted:	0 from chr3_random_chainEquCab2
 deleted:	2567 from chr4_chainEquCab2
 deleted:	0 from chr4_random_chainEquCab2
 deleted:	967 from chr5_chainEquCab2
 deleted:	0 from chr5_random_chainEquCab2
 deleted:	3419 from chr6_chainEquCab2
 deleted:	10493 from chr7_chainEquCab2
 deleted:	0 from chr7_random_chainEquCab2
 deleted:	1284 from chr8_chainEquCab2
 deleted:	1 from chr8_random_chainEquCab2
 deleted:	10185 from chr9_chainEquCab2
 deleted:	1 from chr9_random_chainEquCab2
 deleted:	4 from chrM_chainEquCab2
 deleted:	8 from chrUn_random_chainEquCab2
 deleted:	1585 from chrX_chainEquCab2
 deleted:	3 from chrX_random_chainEquCab2
 deleted:	19 from chrY_chainEquCab2
 deleted:	70 from chrY_random_chainEquCab2
 deleted:	18173 from netEquCab2
 
 #########################################################################
 # BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-25,27 - Hiram)
     ssh kkstore06
     screen	# use a screen to manage this longish running job
     mkdir /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
     cd /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
     cat << '_EOF_' > DEF
 # Mouse vs. Medaka
 BLASTZ=/cluster/bin/penn/x86_64/lastz
 
 # typical parameters for a genome that is distant from human
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Mouse mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=1
 
 # QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
 SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
 SEQ2_CHUNK=40000000
 SEQ2_LIMIT=200
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/blastzOryLat2.2008-08-25
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
     #	real    124m28.816s
     #	problems with memk today, continuing:
     time doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-continue=cat -qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > cat.log 2>&1 &
     #	the kluster is acting up, took several attempts to get one of the
     #	simple cat jobs done, not sure why it was having trouble, continuing:
     time doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-continue=chainRun -qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainRun.log 2>&1 &
     time doBlastzChainNet.pl `pwd`/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -continue=chainMerge -qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
     #	real    14m58.355s
     cat fb.mm9.chainOryLat2Link.txt
     #	50975949 bases of 2620346127 (1.945%) in intersection
 
     cd /cluster/data/mm9/bed
     ln -s blastzOryLat2.2008-08-25 blastz.oryLat2
     
     #	That is OK, now for the swap:
     mkdir /cluster/data/oryLat2/bed/blastz.mm9.swap
     cd /cluster/data/oryLat2/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 -swap \
 	/cluster/data/mm9/bed/blastzOryLat2.2008-08-25/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-qRepeats=windowmaskerSdust \
 	-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
     #	real    15m26.642s
     cat fb.oryLat2.chainMm9Link.txt
     #	45837267 bases of 700386597 (6.545%) in intersection
 
 #######################################
 # Wold RNA-seq data (Done Jul 30 mikep)
 #
 
 df .
 #Filesystem           1K-blocks      Used Available Use% Mounted on
 #kkstore06-10:/export/cluster/store4
 #                     2402304448 2183573728  96700640  96% /cluster/store4
 ssh kkstore06
 cd /cluster/store4/mm9/bed/woldRnaSeq/
 
 # naming convention: woldRnaSeq (Signal) Tissue Replicate
 
 # rename input wigs to convention
 mv mm9Brain.wig   woldRnaSeqSignalBrain1.wigbed
 mv mm9Brain2.wig  woldRnaSeqSignalBrain2.wigbed
 mv mm9Liver.wig   woldRnaSeqSignalLiver1.wigbed
 mv mm9Liver2.wig  woldRnaSeqSignalLiver2.wigbed
 mv mm9Muscle.wig  woldRnaSeqSignalMuscle1.wigbed
 mv mm9Muscle2.wig woldRnaSeqSignalMuscle2.wigbed
 
 # wigEncode it all
 for T in Brain Liver Muscle
 do 
   for R in 1 2
   do
   wigEncode woldRnaSeqSignal${T}${R}.wigbed woldRnaSeqSignal${T}${R}.wig woldRnaSeqSignal${T}${R}.wib
   done
 done
 
 #Converted woldRnaSeqSignalBrain1.wigbed, upper limit 12989.00, lower limit 1.00
 #Converted woldRnaSeqSignalBrain2.wigbed, upper limit 1482.24, lower limit 0.04
 #Converted woldRnaSeqSignalLiver1.wigbed, upper limit 44652.00, lower limit 1.00
 #Converted woldRnaSeqSignalLiver2.wigbed, upper limit 2567.53, lower limit 0.06
 #Converted woldRnaSeqSignalMuscle1.wigbed, upper limit 60949.00, lower limit 1.00
 #Converted woldRnaSeqSignalMuscle2.wigbed, upper limit 2726.96, lower limit 0.06
 
 # Load on hgwdev
 ssh hgwdev
 
 for T in Brain Liver Muscle
 do 
   for R in 1 2
   do
   ln -s /cluster/data/mm9/bed/woldRnaSeq/woldRnaSeqSignal${T}${R}.wib /gbdb/mm9/wib/
   hgLoadWiggle mm9 woldRnaSeqSignal${T}${R} woldRnaSeqSignal${T}${R}.wig 
   done
 done
 rm wiggle.tab
 
 # do the beds
 for F in data/*beds*tgz
 do
   echo "untaring $F"
   tar zxvf $F
 done
 
 # How many records in the beds?
  wc -l *bed
 #   8868804 mm9Brain1.multi.bed
 #    856281 mm9Brain1.splices.bed
 #  14488584 mm9Brain1.uniqs.bed
 #  16180919 mm9Brain2.multi.bed
 #     54100 mm9Brain2.spike.bed
 #   1570776 mm9Brain2.splices.bed
 #  26519333 mm9Brain2.uniqs.bed
 #  12794917 mm9Liver1.multi.bed
 #   1030969 mm9Liver1.splices.bed
 #  13133048 mm9Liver1.uniqs.bed
 #  17783124 mm9Liver2.multi.bed
 #    414618 mm9Liver2.spike.bed
 #   1372984 mm9Liver2.splices.bed
 #  17673014 mm9Liver2.uniqs.bed
 #  12048985 mm9Muscle1.multi.bed
 #   1150895 mm9Muscle1.splices.bed
 #  13936012 mm9Muscle1.uniqs.bed
 #  16033642 mm9Muscle2.multi.bed
 #    589787 mm9Muscle2.spike.bed
 #   1347749 mm9Muscle2.splices.bed
 #  16632816 mm9Muscle2.uniqs.bed
 # 194481357 total
 
 # Just do the splices ones
 # For every tissue (T) and replicate (R):
 #   - drop lines beginning with "track" from mm9<T><R>.splices.bed
 #   - re-emit 12 tab-separated columns: input columns 1-6 unchanged,
 #     columns 7 and 8 replaced by copies of columns 2 and 3, column 9
 #     set to the constant 0, columns 10-12 unchanged
 #   - load the result into mm9 table woldRnaSeqSplices<T><R>
 for T in Brain Liver Muscle
 do
   for R in 1 2
   do
   egrep -v "^track" mm9${T}${R}.splices.bed | gawk -v OFS="\t" '{print $1,$2,$3,$4,$5,$6,$2,$3,0,$10,$11,$12}' > woldRnaSeqSplices${T}${R}.bed
   hgLoadBed mm9 woldRnaSeqSplices${T}${R} woldRnaSeqSplices${T}${R}.bed
   done
 done
 # presumably the intermediate tab file left behind by hgLoadBed -- confirm
 rm bed.tab
 
 
 #########################################################################
 # KOMP/IKMC (KNOCKOUT MOUSE PROJECT became Int'l Knockout Mouse Cons) (DONE 12/8/09 angie)
 # done 7/24/09 w/files emailed from Carol 7/24
 # done 5/7/09 w/files emailed from Carol Bult 5/7
 # done 2/12/09 w/files emailed from Carol Bult 2/12
 # done 10/21/08 w/files emailed from Carol Bult 10/18
     ssh hgwdev
     mkdir -p /hive/data/genomes/mm9/bed/komp/2009_12
     cd /hive/data/genomes/mm9/bed/komp/2009_12
     # Save files emailed from Carol Bult 12/7 as 
     # 20091204_ikmc.gff.gz
     # Make bed12 with itemRgb:
     zcat 20091204_ikmc.gff.gz \
     | perl -we \
       'while (<>) { \
          s/\r?\n$//; \
          ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
          if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
          $col = ($col eq "Yellow") ? "255,215,0" : \
                 ($col eq "Green")  ? "0,240,0" : \
                 ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
          $s--; \
          $id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
          my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
          push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
       } \
       warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
       foreach my $geneId (keys %geneBlks) { \
          my @blks = @{$geneBlks{$geneId}}; \
          my ($chrom, $center, $name) = split(/\|/, $geneId); \
          my $blkCount = @blks; \
          @blks = sort {$a->[0] <=> $b->[0]} @blks; \
          my $chromStart = $blks[0]->[0]; \
          my $chromEnd = $blks[$blkCount-1]->[1]; \
          my $color = $blks[0]->[2]; \
          my $blkStarts = ""; \
          my $blkSizes = ""; \
          foreach my $blk (@blks) { \
            my ($start, $end, $col) = @{$blk}; \
            $blkStarts .= ($start - $chromStart) . ","; \
            $blkSizes  .= ($end - $start) . ","; \
            if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
          } \
         print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                    $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
       }' \
     | sort -k 1,1 -k 2n,2n > komp.bed
 #Got 36359 genes.
     # No stderr empty-coord warnings this time (no unmapped items).
     # Make an alias-style table with associated info (MGI ID and status):
     zcat 20091204_ikmc.gff.gz \
     | perl -wpe 's/\r?\n$//; @w = split("\t"); \
       if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
       if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
       $w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
       ($mgi, $designId, $status) = ($1, $2, $3); \
       $_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' \
     | sort -u > kompExtra.tab
     wc -l kompExtra.tab
 #36359 kompExtra.tab
     # Load 'em up:
     hgLoadBed mm9 komp komp.bed
 #Loaded 32185 elements of size 12
     hgLoadSqlTab mm9 kompExtra $HOME/kent/src/hg/lib/genericAlias.sql kompExtra.tab
     checkTableCoords -verbose=2 mm9 komp
 #mm9.komp item Tekt3_41479 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41478 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41477 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Tekt3_41476 chr11:62887195-62896116: blocks 3 and 4 overlap.
 #mm9.komp item Cntn5_44827 chr9:10008998-10019351: blocks 1 and 2 overlap.
     # Carol talked to the Sanger folks about those... pls waive.
 
     # Note from July '09: Carol noticed some very long items and is asking
     # Sanger about them.  Here's how to check it ourselves next time:
     hgsql mm9 -e 'select name, (chromEnd-chromStart) as length from komp \
                   where chromEnd - chromStart > 1000000 order by length desc;'
 #+----------------------+----------+
 #| name                 | length   |
 #+----------------------+----------+
 #| Ankrd22_67616        | 51920750 | 
 #| Ptprd_VG12763        |  2270723 | 
 #| Macrod2_VG12650      |  1997658 | 
 #| A430089I19Rik_71812  |  1814706 | 
 #| 1700049E17Rik2_68957 |  1596021 | 
 #| Pcdh15_VG15967       |  1550393 | 
 #| Gpc5_VG15750         |  1431812 | 
 #| Lrrc4c_VG10110       |  1313498 | 
 #| Agbl4_VG16439        |  1266664 | 
 #| Prkg1_VG15918        |  1197272 | 
 #| Ptprt_VG10147        |  1139158 | 
 #| Ccl21b_67667         |  1019106 | 
 #+----------------------+----------+
 
     runJoiner.csh mm9 komp
 # mm9.kompExtra.name - hits 36359 of 36359 ok
 
 
 #########################################################################
 ### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)
     # Align probes from MOE430v2 chip.
     #	Data was picked up manually from the Affymetrix WEB site
     #	while logged in to the Affymetrix system, from the page:
 # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430-20
     # found links to the following files:
 -rw-r--r--  1  51429336 Dec  1  2003 Mouse430_2.probe_fasta
 -rw-r--r--  1    163849 Dec  2  2003 Mouse430_2_control
 -rw-r--r--  1  89662619 Dec  2  2003 Mouse430_2.consensus
 -rw-r--r--  1  30999528 Dec  2  2003 Mouse430_2.target
 -rw-r--r--  1  24828845 Jun 12  2006 Mouse430_2.link.psl
 -rw-r--r--  1 119301329 Aug 18  2006 Mouse430_2_ortholog.csv
 -rw-rw-rw-  1  95467111 Jul  7 22:05 Mouse430_2.na26.annot.csv
 -rw-r--r--  1      3188 Jul  8 13:23 3prime-IVT.AFFX_README.NetAffx-CSV-Files.txt
     #	placed into: /hive/data/genomes/mm9/bed/affyMOE430v2/affyData
 
     #	The GNF folks pointed to data available at:
     #	http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE10246
 
     ssh memk
 #     cat ../affyData/Mouse430_2.probe_fasta \
 # 	| sed -e "s/probe:Mouse430_2:/MOE320v2_/; s/:.*//" > MOE430v2_probes.fa
 #     cat ../affyData/Mouse430_2.target \
 # 	| sed -e "s/target:Mouse430_2:/MOE320v2_/; s/;.*//" > MOE430v2_target.fa
     mkdir /hive/data/genomes/mm9/bed/affyMOE430v2/run
     cd /hive/data/genomes/mm9/bed/affyMOE430v2/run
     mkdir psl
 
     cut -f1 ../../../chrom.sizes > genome.list
     cat ../affyData/Mouse430_2.consensus \
 	| sed -e "s/consensus:Mouse430_2://; s/;.*//" > affyMOE430v2.fa
 
     ls -1 /hive/data/genomes/mm9/bed/affyMOE430v2/run/affyMOE430v2.fa \
 	> probe.list
 
     cat << '_EOF_' > template
 #LOOP
 blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 genome.list probe.list template jobList
     para create jobList
     para try ... check ... push ... etc.
     para time
 # Completed: 35 of 35 jobs
 # CPU time in finished jobs:      22222s     370.36m     6.17h    0.26d  0.001 y
 # IO & Wait Time:                   104s       1.74m     0.03h    0.00d  0.000 y
 # Average job time:                 638s      10.63m     0.18h    0.01d
 # Longest finished job:            1580s      26.33m     0.44h    0.02d
 # Submission to last job:          1589s      26.48m     0.44h    0.02d
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyMOE430v2.psl.
     pslSort dirs raw.psl tmp psl
     pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
 	../affyMOE430v2.psl /dev/null
 
     # Load probes and alignments from MOE430v2 into database.
     ssh hgwdev
     cd /hive/data/genomes/mm9/bed/affyMOE430v2
     mkdir /projects/compbio/data/microarray/affyMOE430v2
     cp -p run/affyMOE430v2.fa /projects/compbio/data/microarray/affyMOE430v2
 
     ln -s /projects/compbio/data/microarray/affyMOE430v2/affyMOE430v2.fa \
 	/gbdb/hgFixed/affyProbes
 
     hgLoadPsl mm9 affyMOE430v2.psl
     hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/affyMOE430v2.fa
     #	45037 sequences
     pslToBed affyMOE430v2.psl affyMOE430v2Probes.bed
     hgLoadBed -tmpDir=/scratch/tmp mm9 affyMOE430v2Probes affyMOE430v2Probes.bed
     #	Loaded 46193 elements of size 12
     # this is temporary, for use with bedMergeExpData below
 
     #	Create a similar formatted file to the one used in MOE430
     zcat geoData/GSE10246_series_matrix.txt.gz \
 	| egrep "^\"1|source_name|Sample_title" \
 	| sed -e "s/\!Sample_title/#Probe Set/; s#\!Sample_source_name_ch1##;" \
 	| sed -e "s/\"//g" > gnfMOE430v2.AD.txt
 
     #	create gnfMouseAtlas3AllExps and gnfMouseAtlas3All tables in hgFixed
     hgGnfMicroarray gnfMouseAtlas3AllExps gnfMouseAtlas3All \
 	gnfMOE430v2.AD.txt -chip=affyMOE430v2
     #	182 experiments
     #	from that table, create median ratio table
     # create table gnfMouseAtlas3AllRatio in hgFixed from hgFixed.gnfMouseAtlas3All
     #	and classification file ../hgMedianMicroarray/gnfMOE430v2.ra
     hgRatioMicroarray gnfMouseAtlas3All gnfMouseAtlas3AllRatio \
 	-clump=$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra
 
     # add those ratios to the probe locations to make a bed 15 microarray type
     bedMergeExpData hgFixed.gnfMouseAtlas3AllRatio mm9.affyMOE430v2Probes \
 	gnfMouseAtlas3AllRatio.bed
     #	no longer need this table
     #	do not need this table for the genome browser display
     hgsql -e "drop table affyMOE430v2Probes;" mm9
 
     hgLoadBed mm9 gnfMouseAtlas3 gnfMouseAtlas3AllRatio.bed
 
     hgMapToGene mm9 gnfMouseAtlas3 knownGene \
 	knownToGnfMouseAtlas3 '-type=bed 12'
 
     time hgExpDistance mm9 hgFixed.gnfMouseAtlas3AllRatio \
 	hgFixed.gnfMouseAtlas3AllExps gnfMouseAtlas3Distance \
 	-lookup=knownToGnfMouseAtlas3
     #	Have 45036 elements in hgFixed.gnfMouseAtlas3AllRatio
     #	Got 39872 unique elements in hgFixed.gnfMouseAtlas3AllRatio
 
     #	Loaded gnfMouseAtlas3Distance
     #	real    34m56.844s
     #	user    58m1.892s
     #	sys     1m44.821s
 
     # Take the median value over multiple replicates creating
     # hgFixed.gnfMouseAtlas3MedianRatio and gnfMouseAtlas3MedianExps
     cd ../hgMedianMicroarray
     hgMedianMicroarray hgFixed gnfMouseAtlas3AllRatio gnfMouseAtlas3AllExps \
 	$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
 	gnfMouseAtlas3MedianRatio gnfMouseAtlas3MedianExps -minExps=1
 
     # Also make a median version of the absolute measurements
     hgMedianMicroarray hgFixed gnfMouseAtlas3All gnfMouseAtlas3AllExps \
 	$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
 	gnfMouseAtlas3AllMedian gnfMouseAtlas3AllMedianExps -minExps=1
 
     time hgExpDistance mm9 hgFixed.gnfMouseAtlas3MedianRatio \
 	hgFixed.gnfMouseAtlas3MedianExps gnfMouseAtlas3MedianDistance \
 	-lookup=knownToGnfMouseAtlas3
 # Have 45037 elements in hgFixed.gnfMouseAtlas3MedianRatio
 # Got 39872 unique elements in hgFixed.gnfMouseAtlas3MedianRatio
 # XXX - working Mon Nov 24 10:01:43 PST 2008
 
     #	real    16m5.102s
     #	user    41m54.581s
     #	sys     1m28.595s
 
 
     #	182 experiments
     # Convert these to ratios using the median of medians of non-cancerous
     # cell types as the denominator as so:
     cd ~/src/hg/makeDb/hgRatioMicroarray
     cd ../hgMedianMicroarray
     # create tables gnfMOE430v2MedianRatio gnfMOE430v2MedianExps in hgFixed
     hgMedianMicroarray hgFixed gnfMOE430v2AllRatio gnfMOE430v2AllExps \
 	gnfMOE430v2.ra gnfMOE430v2MedianRatio gnfMOE430v2MedianExps -minExps=1
 
     # Also make a median version of the absolute measurements
     #	create gnfMOE430v2Median
     hgMedianMicroarray hgFixed gnfMOE430v2All gnfMOE430v2AllExps \
 	gnfMOE430v2.ra gnfMOE430v2Median gnfMOE430v2MedianExps -minExps=1
 
     cd /hive/data/genomes/mm9/bed/affyMOE430v2
     # Load up microarray track
     hgMapMicroarray gnfMOE430v2.bed hgFixed.gnfMOE430v2MedianRatio \
     	affyMOE430v2.psl
     #	Loaded 45037 rows of expression data from hgFixed.gnfMOE430v2MedianRatio
     #	Mapped 44106,  multiply-mapped 2087, missed 0, unmapped 931
 
     hgLoadBed mm9 gnfMOE430v2 gnfMOE430v2.bed
     #	Loaded 46193 elements of size 15
 
 #######################################
     hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
 	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
 # Convert these to ratios using the median of medians of non-cancerous
 # cell types as the denominator as so:
 cd ~/src/hg/makeDb/hgRatioMicroarray
 hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
 # Take the median value over multiple replicates and put in this table:
 cd ../hgMedianMicroarray
 hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
 # Also make a median version of the absolute measurements
 hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1
 
 ############################################################################
 # hgPal downloads
     ssh hgwdev
     screen
     bash
     rm -rf /cluster/data/mm9/bed/multiz30way/pal
     mkdir /cluster/data/mm9/bed/multiz30way/pal
     cd /cluster/data/mm9/bed/multiz30way/pal
     cat > order.lst <<EOF
     mm9
     rn4
     cavPor2
     oryCun1
     hg18
     panTro2
     rheMac2
     ponAbe2
     calJac1
     otoGar1
     tupBel1
     sorAra1
     eriEur1
     canFam2
     felCat3
     equCab1
     bosTau3
     dasNov1
     loxAfr1
     echTel1
     monDom4
     ornAna1
     galGal3
     anoCar1
     xenTro2
     gasAcu1
     danRer5
     tetNig1
     fr2
     oryLat1
 EOF
 
     mz=multiz30way
     gp=refGene
     db=mm9
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
 	    gzip -c > exonAA/$j.exonAA.fa.gz"
     done > $gp.jobs
 
     time sh -x $gp.jobs > $gp.jobs.log 2>&1 & 
     sleep 1
     tail -f $gp.jobs.log
 
 # real    196m7.752s
 # user    11m26.917s
 # sys     3m41.587s
 
     zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     # we're only distributing exons at the moment
     pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
     mz=multiz30way
     gp=knownGene
     db=mm9
 
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
 	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
 	    gzip -c > exonAA/$j.exonAA.fa.gz"
     done > $gp.$mz.jobs
 
     time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & 
     sleep 1
     tail -f $gp.$mz.job.log
 
 # real    216m43.721s
 # user    18m33.552s
 # sys     5m42.639s
 
     zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
     # now do the canonical set
     cd /cluster/data/mm9/bed/multiz30way/pal
     mz=multiz30way
     gp=knownCanonical
     db=mm9
     for j in `awk '{print $1}' /cluster/data/mm9/chrom.sizes`
     do
 	echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
     done
 
     mkdir exonAA exonNuc ppredAA ppredNuc
     for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
     do
 	echo "date"
 	echo "mafGene -geneBeds=$j.known.bed  $db $mz knownGene order.lst stdout | gzip -c > ppredAA/$j.ppredAA.fa.gz"
 	echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
 	echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | gzip -c > exonNuc/$j.exonNuc.fa.gz"
 	echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | gzip -c > exonAA/$j.exonAA.fa.gz"
     done > $gp.$mz.jobs
 
     time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & 
     sleep 1
     tail -f $gp.$mz.job.log
 
 # real    192m17.168s
 # user    10m28.659s
 # sys     3m53.467s
 
     rm *.known.bed
     zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
     zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
     zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
     zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz
 
     rm -rf exonAA exonNuc ppredAA ppredNuc
 
     db=mm9
     mz=multiz30way
     gp=knownCanonical
     pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
 
 
 #############################################################################
 # MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)  
 # (to build the affyExonTissues track, see the steps outlined in hg18.txt)
 #############################################################################
 
 ########################################################################
 ## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
 ## (instructions are in hg18.txt)
 ########################################################################
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 mm9.upstreamGeneTbl = refGene
 mm9.upstreamMaf = multiz30way /hive/data/genomes/mm9/bed/multiz30way/species.list
 
 
 #############################################################################
 # MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08)
     ssh hgwdev
     mkdir /cluster/data/mm9/bed/mrnaPcr
     cd /cluster/data/mm9/bed/mrnaPcr
     genePredToBed /cluster/data/mm9/bed/ucsc.10/ucscGenes.gp > ucscGenes.bed
     hgsql mm9 -NBe 'select kgId,geneSymbol from kgXref' \
     | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
       > idSub.txt
     subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
     sequenceForBed -keepName -db=mm9 -bedIn=ucscGenesIdSubbed.bed \
       -fastaOut=stdout \
     | faToTwoBit stdin kgTargetSeq.2bit
     cut -f 1-10 /cluster/data/mm9/bed/ucsc.10/ucscGenes.gp \
     | genePredToFakePsl mm9 stdin kgTargetAli.psl /dev/null
 
     # Load up the UCSC Genes target PSL table and put 2bit in /gbdb::
     cd /cluster/data/mm9/bed/mrnaPcr
     hgLoadPsl mm9 kgTargetAli.psl
     mkdir /gbdb/mm9/targetDb
     ln -s /cluster/data/mm9/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm9/targetDb/
 
     # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
     # /gbdb/mm9/targetDb/kgTargetSeq.2bit .
 
     ssh hgwdev
     # Add records to hgcentraltest blatServers and targetDb:
     hgsql hgcentraltest -e \
       'INSERT into blatServers values ("mm9Kg", "blat13", 17805, 0, 1);'
     hgsql hgcentraltest -e \
       'INSERT into targetDb values("mm9Kg", "UCSC Genes", \
          "mm9", "kgTargetAli", "", "", \
          "/gbdb/mm9/targetDb/kgTargetSeq.2bit", 1, now(), "");'
 
 
 #############################################################################
 # TEST BLASTZ with Rn5 (DONE - 2008-11-26,30 - Hiram)
     mkdir /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
     cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
 
     cat << '_EOF_' > DEF
 # mouse vs rat
 # Specially tuned blastz parameters from Webb Miller
 
 BLASTZ=blastz
 BLASTZ_ABRIDGE_REPEATS=0
 BLASTZ_O=600
 BLASTZ_E=55
 BLASTZ_Y=15000
 BLASTZ_T=2
 BLASTZ_K=4500
 BLASTZ_Q=/scratch/data/blastz/mouse_rat.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn5
 SEQ2_DIR=/scratch/data/rn5/rn5.2bit
 SEQ2_LEN=/scratch/data/rn5/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=medium \
 	-stop=net `pwd`/DEF > do.log 2>&1 &
     #	real    403m22.371s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
 	-debug -chainMinScore=5000 -chainLinearGap=medium \
 	-continue=load -stop=load `pwd`/DEF > load.log 2>&1 &
     #	real    44m59.528s
     cat fb.mm9.chainRn5BlastzLink.txt
     #	1751593467 bases of 2620346127 (66.846%) in intersection
     cat /cluster/data/mm9/bed/blastzRn4.2007-08-31/fb.mm9.chainRn4Link.txt
     #	1713186474 bases of 2620346127 (65.380%) in intersection
 
     mkdir /hive/data/genomes/rn5/bed/blastz.mm9.swap
     cd /hive/data/genomes/rn5/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=medium \
 	-swap -stop=net > swap.log 2>&1 &
     #	real    63m51.690s
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
 	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=medium \
 	-debug -swap -continue=load -stop=load > load.log 2>&1 &
     cat fb.rn5.chainMm9BlastzLink.txt
     #	1901280009 bases of 3372561689 (56.375%) in intersection
 
 #############################################################################
 # AFFY EXON PROBE LIFT MM8->MM9 (DONE, 2008-12-17 Andy)
     ssh hgwdev
     cd /hive/data/genomes/mm9/bed
     mkdir affyMoEx1
     cd affyMoEx1/
     # Dump the mm8 probe table as BED.  -N suppresses the column-header row
     # and cut drops the first column (presumably the bin column - confirm).
     hgsql mm8 -NBe "select * from affyMoEx1Probe" \
        | cut -f2- > mm8.affyMoEx1Probe.bed
     # Probes that fail to lift are reported in unmapped.txt.
     liftOver mm8.affyMoEx1Probe.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
        affyMoEx1Probe.bed unmapped.txt
     grep -c Partially unmapped.txt
 #199
     grep -c Split unmapped.txt
 #190
     grep -c Deleted unmapped.txt
 #354
     wc -l mm8.affyMoEx1Probe.bed
 #4549897
     ## Out of 4.5 million probes in mm8, we've lost 743 in different ways
     ## attempting to lift.  That's an acceptable number.
     hgLoadBed mm9 affyMoEx1Probe{,.bed}
     # Same procedure for the transcript table.  The failures go to their own
     # file (unmappedTranscripts.txt) so the probe report in unmapped.txt is
     # not overwritten; both files are read when building the downloads below.
     hgsql mm8 -NBe "select * from affyMoEx1Transcript" \
        | cut -f2- > mm8.affyMoEx1Transcript.bed
     liftOver mm8.affyMoEx1Transcript.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
        affyMoEx1Transcript.bed unmappedTranscripts.txt
     hgLoadBed mm9 affyMoEx1Transcript{,.bed}
     ## Put unlifted IDs into a downloadable file.
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm9/unlifted
     # In the liftOver unmapped report each "#reason" line is followed by the
     # BED line it refers to, hence grep -A1 and then keeping only chr lines.
     grep -A1 Deleted unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Deleted.bed
     grep -A1 Partially unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8PartiallyDeleted.bed
     grep -A1 Split unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Split.bed
     grep -A1 Deleted unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8Deleted.bed
     grep -A1 Partially unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8PartiallyDeleted.bed
     cp affyMoEx1*.mm8*.bed /usr/local/apache/htdocs/goldenPath/mm9/unlifted
     ## mm8 and mm9 track descriptions differ: 
     ## 1. Copy mouse/trackDb.ra setting to mouse/mm9/trackDb.ra and add
     ##    origAssembly mm8 line.
     ## 2. Make a new paragraph in a new affyMouseExon.html in mm9 to include
     ##    details about the lift and how many didn't lift.
 
 #############################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2009-04-07)
     # bash  if not using bash shell already
     ssh kolossus
     mkdir /cluster/data/mm9/blastDb
     cd /cluster/data/mm9
     awk '{if ($2 > 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > 1meg.lst
     twoBitToFa -seqList=1meg.lst  mm9Chroms_RandomContigs.hard.2bit temp.fa
     faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
     rm temp.fa 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > less1meg.lst
     twoBitToFa -seqList=less1meg.lst  mm9Chroms_RandomContigs.hard.2bit temp.fa
     faSplit about temp.fa 1000000 blastDb/y 
 
     cd blastDb
     for i in *.fa
     do
 	/hive/data/outside/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 2712
 
     mkdir -p /cluster/data/mm9/bed/tblastn.hg18KG
     cd /cluster/data/mm9/bed/tblastn.hg18KG
     echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 
 # 2712 query.lst
 
    # we want around 250000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)
 
 # 36727/(250000/2712) = 398.414496
 
    mkdir -p kgfa
    split -l 398 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/mm9/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 # blastSome: run tblastn for one database/query pair and convert the hits
 # to a lifted, checked psl file.
 #   $1 - blast database name, passed to blastall -d
 #   $2 - query sequence file, passed to blastall -i
 #   $3 - final psl output path (only created when pslCheck succeeds)
 # Exit status: 0 once blastToPsl has succeeded, 1 otherwise.
 BLASTMAT=/hive/data/outside/blast229/data
 export BLASTMAT
 # Scratch file prefix in /tmp, built from the basenames of $3 and $2.
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 # Try each e-value threshold in turn and keep the output of the first
 # blastall run that exits successfully.
 # NOTE(review): presumably the stricter thresholds are a fallback for
 # blastall failures at looser ones - confirm.
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 # $f.1 exists only if one of the blastall runs above succeeded.
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	# First lift uses blastDb.lft; the second uses -pslQ with protein.lft
 	# and writes to $3.tmp.
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm9/blastDb.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         # Only publish the result (and clean up scratch files) when the
         # lifted psl passes pslCheck.
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         # NOTE(review): exits 0 even when pslCheck failed; in that case $3
         # is not created and the scratch files and $3.tmp are left behind.
         exit 0               
     fi                      
 fi                         
 # Failure path: remove anything that may have been written, report error.
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit 
     
     ssh swarm
     cd /cluster/data/mm9/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 
 # Completed: 252216 of 252216 jobs
 # CPU time in finished jobs:   14882096s  248034.93m  4133.92h  172.25d  0.472 y
 # IO & Wait Time:               1019014s   16983.57m   283.06h   11.79d  0.032 y
 # Average job time:                  63s       1.05m     0.02h    0.00d
 # Longest finished job:             184s       3.07m     0.05h    0.00d
 # Submission to last job:         15667s     261.12m     4.35h    0.18d
 
     ssh swarm
     cd /cluster/data/mm9/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS ../blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     para create chainSpec
     # para try, check, push, check etc.
 
 # Completed: 93 of 93 jobs
 # CPU time in finished jobs:       5736s      95.59m     1.59h    0.07d  0.000 y
 # IO & Wait Time:                 21289s     354.82m     5.91h    0.25d  0.001 y
 # Average job time:                 291s       4.84m     0.08h    0.00d
 # Longest finished job:             472s       7.87m     0.13h    0.01d
 # Submission to last job:           496s       8.27m     0.14h    0.01d
 
 
     cd /cluster/data/mm9/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl
     cd ..
     pslCheck unliftBlastHg18KG.psl
     liftUp -nohead temp.psl ../../jkStuff/mm9.contigs.lift carry unliftBlastHg18KG.psl 
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl  > blastHg18KG.psl
     rm temp.psl
     pslCheck blastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/mm9/bed/tblastn.hg18KG
     hgLoadPsl mm9 blastHg18KG.psl
 
     # check coverage
     featureBits mm9 blastHg18KG 
 # 30285278 bases of 2620346127 (1.156%) in intersection
 
     featureBits mm9 knownGene:cds blastHg18KG  -enrichment
 # knownGene:cds 1.278%, blastHg18KG 1.156%, both 0.969%, cover 75.86%, enrich  65.64x
 
     featureBits mm9 refGene:cds blastHg18KG  -enrichment
 # refGene:cds 1.205%, blastHg18KG 1.156%, both 0.940%, cover 78.04%, enrich 67.52x
 
     rm -rf blastOut
 #end tblastn
 
 #############################################################################
 # LASTZ Swap Human Hg19 (DONE - 2009-05-14 - Hiram)
     #	the original
     cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
     cat fb.hg19.chainMm9Link.txt 
     #	1022734273 bases of 2897316137 (35.299%) in intersection
 
     #	and the swap
     mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
     cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
 	-swap -noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    131m58.763s
     cat fb.mm9.chainHg19Link.txt 
     #	1013880568 bases of 2620346127 (38.693%) in intersection
 
 #############################################################################
 #  RE-BUILD miRNA TRACK (DONE, 2009-06-09-2009-06-11, hartera)
     # The miRNA track from miRBase is out of date so update the track. 
     mkdir -p /hive/data/genomes/mm9/bed/miRNA-2009-06-09
     cd /hive/data/genomes/mm9/bed/miRNA-2009-06-09
     # Download GFF file of latest miRNA annotations from miRBase at the
     # Wellcome Trust Sanger Institute (WTSI). This is Release 13.0.
     # (March 2009)
     wget --timestamping \
 ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/mmu.gff
     # Re-format, need to add "chr" to the beginning of each line.
     sed -e 's/^/chr/' mmu.gff > mmMirBaseFormat.gff
     # Remove extra "chr" in comment lines
     perl -pi.bak -e 's/chr#/#/' mmMirBaseFormat.gff
     # Change chrMT to chrM
     perl -pi.bak -e 's/chrMT/chrM/' mmMirBaseFormat.gff
     # Remove all but ID name in last field
     sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"/transcript_id=/g' \
        | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff
 
     # Load into database. 
     ldHgGene -exon=miRNA mm9 miRNARel13 mmMirBaseFormatIdOnly.gff
     # Does not load as mmu-mir-692-2 is on two chroms, chr4 and chr13.
     # These are alignments not genePreds so convert to BED for loading into
     # the database.
     sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"//g' \
        | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff
     # chr1    .       miRNA   20669091        20669163        .       +
     # .       mmu-mir-206
     # use score 960 for + strand and 480 for - strand. This will show 
     # up black on the track for + strand and grey for - strand.
     # Re-do below and re-load track as appears off by 1 compared to 
     # Ensembl track and other miRNA resources (2009-06-11)
     # Confirmed with Sam Griffith-Jones that the coordinates in the 
     # GFF file are 1-based. (2009-06-12).
     awk 'BEGIN {FS="\t"} {OFS="\t"} \
         {if ($0 !~ /#/ && $7 == "+") print $1, $4-1, $5, $9, 960, $7; \
        else if ($0 !~ /#/ && $7 == "-") print $1, $4-1, $5, $9, 480, $7;}' \
         mmMirBaseFormatIdOnly.gff > mmMirBaseFormatIdOnly.bed
     # Remove previous table
     hgsql -e 'drop table miRNA' mm9
     hgLoadBed mm9 miRNA mmMirBaseFormatIdOnly.bed
 # Reading mmMirBaseFormatIdOnly.bed
 # Loaded 568 elements of size 6
 # Sorted
 # Creating table definition for miRNARel13
 # Saving bed.tab
 # Loading mm9
     hgsql -e 'select count(*) from miRNA;' mm9 
 # 568
 # The previous version had 493 miRNAs.
 hgsql -e 'select count(distinct name) from miRNA;' mm9
 # 541
 # The previous version had 466 unique miRNAs. 
 
 ############################################################################
 # Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
     mkdir /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
     cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
 
     cat << '_EOF_' > DEF
 # Mouse vs. Horse
 
 BLASTZ_M=50
 
 # TARGET: Mouse MM9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Horse
 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
 SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
 SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
 SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
 SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl `pwd`/DEF \
 	-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    360m10.094s
     time doBlastzChainNet.pl `pwd`/DEF \
 	-continue=chainMerge -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
     #	real    225m4.178s
     cat fb.mm9.chainEquCab2Link.txt 
     #	912421053 bases of 2620346127 (34.821%) in intersection
 
     mkdir /hive/data/genomes/equCab2/bed/blastz.mm9.swap
     cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap
     time doBlastzChainNet.pl \
 	/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \
 	-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real  122m25.314s
     cat fb.equCab2.chainMm9Link.txt 
     #	902295813 bases of 2428790173 (37.150%) in intersection
 
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
 
 vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
 
 see doc/builds.txt for specific details.
 ############################################################################
 # VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-30 - 2009-09-09, hartera)
 # Needs updating as the current version is build 31 from May 2008.
 # 2009-08-03 (hartera) - Added code to register track handler for
 # vegaGeneComposite.
 # 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons
 # on the configuration page for the track item labels. Modified code so it 
 # can be shared with Ensembl to create the links to Vega transcript, gene
 # and protein reports on the details pages. 
 # 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
 # Loaded the vegaGtp table.
 # 2009-09-01 and 2009-09-03 (hartera). Loaded a vegaPep table for the protein
 # sequence link on the details pages.
 # 2009-09-04 Re-load all tables as some reverted to the older version during
 # mySQL 5 upgrade.
 # 2009-09-08 - 2009-09-09 Code change to change message on details page when 
 # no protein is available and change to trackDb to make vegaGene items a 
 # darker blue colour. Reloaded vegaPep after removing proteins whose
 # transcripts are not in vegaGtp to make all.joiner happy.
 
    mkdir -p /hive/data/genomes/mm9/bed/vega35
    cd /hive/data/genomes/mm9/bed/vega35
    # Download the VEGA genes for mouse from the ftp site
    # This file is from 03/17/09.
    wget --timestamping \
         "ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
    # add chr in front of chromosome names and lift up the randoms
    #    processing similar to the same processing for Ensembl genes,
    #    from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
    cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
    zcat gtf_file.gz \
         | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
         | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
         | gzip > allGenes.gtf.gz
    # Got 189 lifts in randoms.mm9.lift
 
    gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
 	| gzip > mm9.allGenes.gp.gz
    /cluster/home/hartera/kent/src/hg/utils/automation/extractGtf.pl \
 	infoOut.txt > ensGtp.tab
    genePredCheck -db=mm9 mm9.allGenes.gp.gz
    # checked: 59381 failed: 0
    zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
    zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
    
    # Modify the GTF files so that the gene name goes into the 
    # name2 field of the genePred. 
    perl -pi.bak -e 's/gene_id/other_gene_id/' *pseudo.gtf
    perl -pi.bak -e 's/gene_name/gene_id/' *pseudo.gtf
    gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
    gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
 
    genePredCheck -db=mm9 pseudo.gp
     # checked: 4305 failed: 0
    genePredCheck -db=mm9 not.pseudo.gp
     # checked: 55076 failed: 0
 
    hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
    hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
 
    # clean up
    rm *.bak
 
    # 2009-08-03 (hartera)
    # Added code to src/hg/hgTracks/simpleTracks.c to register a track
    # handler for vegaGeneComposite that is now used for this data. This used
    # vegaGeneMethods to display the name2 field (gene) as the item label in
    # the track.
 
    # 2009-08-15 - 2009-08-16 (hartera)
    # The information extracted from the attributes in the GTF file was saved
    # as ensGtp.tab, so rename it to vegaGtp.tab.
    mv ensGtp.tab vegaGtp.tab
    # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
    # There is an index on the protein field so it can not be NULL. 
    # If there is no protein, the gene name is given.
    # Added code to hgTracks.c and hgTrackUi.c to allow the use of 
    # radio buttons on the track configuration page to select the
    # gene name, accession or both to be displayed in the track.
    # The gene name is displayed by default.
    # Added code to hgc.c so that Ensembl and Vega can share code to 
    # create links on the details pages to the Vega reports for transcript, 
    # gene and protein through these IDs. Created new function
    # printEnsemblOrVegaCustomUrl(). 
 
    # 2009-08-22 (hartera)
    # Loaded the vegaGtp table. Use ensGtp.sql to create the table.
    # vegaGtp associates geneId/transcriptId/proteinId 
    # for the links to Vega reports from the details page.
    cd /hive/data/genomes/mm9/bed/vega35
    cp ~/kent/src/hg/lib/ensGtp.sql .
    # 11 of the gene names for noncoding transcripts are too long for the 
    # protein ID field so change this field in ensGtp.sql to allow 40 chars 
    # instead of 20 and re-load the table.
    hgsql -e 'drop table vegaGtp;' mm9
    hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
    # Loaded successfully
    # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in 
    # doVegaGene() to add the links to Vega reports on the details pages.
    # Code was added so that there is no protein sequence link on the details
    # page if there is none available, e.g. noncoding.
    # 2009-09-01 (hartera)
    # Coding genes are displaying the message that there is no protein
    # prediction available. Need to add a vegaPep table.
    cd /hive/data/genomes/mm9/bed/vega35
    # Download the protein FASTA file for Vega35
    wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/*.tot.fa.gz"
    # from the Ensembl process:
    zcat Mus_musculus.VEGA.mar.pep.tot.fa.gz  \
        | sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
    zcat vegaPep.txt.gz \
        | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
        | sed -e '/^$/d; s/*$//' | sort > vegaPep.mm9.fa.tab
    # Load table (2009-09-03, hartera)
    hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
    # Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track 
    # in the type line for src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra.
    # Check that the vegaPep table looks ok and then check protein-coding and 
    # noncoding transcript details pages for protein links.
    
    # 2009-09-04, hartera
    # Re-load tables after upgrade to mySQL 5 as they had reverted back to 
    # tables with the previous Vega dataset.
 
    cd /hive/data/genomes/mm9/bed/vega35
    hgsql -e 'drop table vegaGene;' mm9
    hgsql -e 'drop table vegaPseudoGene;' mm9
    hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
    hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
    hgsql -e 'drop table vegaGtp;' mm9
    hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
    hgsql -e 'drop table vegaPep;' mm9
    hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
    # 2009-09-08 (hartera). Changed message in code for details page when no
    # protein sequence is available to be more explanatory. "Non-protein
    # coding gene or gene fragment, no protein prediction available." Changed
    # the colouring for the vegaGene subtrack to be darker blue so there is 
    # more of a contrast between vegaGene and vegaPseudoGene subtracks.
 
    # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins
    # that have a transcript ID in vegaGtp. 
    # all.joiner is complaining as there are about 1,000 extra proteins in 
    # vegaPep that do not have transcripts in vegaGtp. Decided to remove these
    # and e-mailed the HAVANA group to ask about the discrepancy. 
    cd /hive/data/genomes/mm9/bed/vega35
    # Unique transcript IDs: column 2 of vegaGtp.tab, column 1 of the peptide
    # tab file.
    awk '{print $2}' vegaGtp.tab | sort -u > vegaGtp.tx.ids
    awk '{print $1}' vegaPep.mm9.fa.tab | sort -u > vegaPep.tx.ids
    wc -l *.tx.ids
    # 59381 vegaGtp.tx.ids
    # 30956 vegaPep.tx.ids
    
    # Number of transcripts that have a protein ID:
    hgsql -Ne 'select transcript from vegaGtp where protein like "OTTMUSP%";' \
         mm9 | sort -u > vegaGtpWithProt.tx.ids
    wc -l vegaGtpWithProt.tx.ids        
    # 29902 vegaGtpWithProt.tx.ids
  
    # find those that are common to both. 
    comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids
    wc -l pepandGtp.tx.ids 
    # 29902 pepandGtp.tx.ids
    comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l 
    # 29902
    # Therefore all the vegaGtp transcripts with a protein ID are in the
    # protein FASTA file.
    # Select only the vegaPep columns (p.*): the output is re-loaded as the
    # vegaPep table below, so the joined vegaGtp columns must not be included.
    hgsql -Ne 'select p.* from vegaPep as p, vegaGtp as g
          where g.protein like "OTTMUSP%" and p.name = g.transcript;' mm9 \
          > vegaPepOnlyInGtp.mm9.fa.tab
    wc -l vegaPepOnlyInGtp.mm9.fa.tab 
    # 29902 vegaPepOnlyInGtp.mm9.fa.tab  
    hgsql -e 'drop table vegaPep;' mm9
    hgPepPred mm9 tab vegaPep vegaPepOnlyInGtp.mm9.fa.tab
     
 ############################################################################
 # Blastz Elephant loxAfr3 (DONE - 2009-08-12 - Hiram)
     mkdir /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
     cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
 
     cat << '_EOF_' > DEF
 # Mouse vs. Elephant
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Elephant loxAfr3
 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
 SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm9/bed/lastzLoxAfr3.2009-08-12
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real 498m44.261s
     cat fb.mm9.chainLoxAfr3Link.txt
     #	684326090 bases of 2620346127 (26.116%) in intersection
 
     #	trying syntenic nets
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
     #	about 20 minutes
 
     mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
     cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \
 	-swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	-syntenicNet > swap.log 2>&1 &
     #	real    123m9.342s
     cat fb.loxAfr3.chainMm9Link.txt 
     #	673856452 bases of 3118565340 (21.608%) in intersection
 
 #########################################################################
 ## NIA Mouse Gene Index - (DONE, Fan, 9/9/09)
+# NOTE FOR NEXT TIME: this track fails pslCheck because every row in the
+# NIAGene table has a tSize of 198000000.  Future tables should contain the
+# proper chromosome lengths in the tSize field.  (Brooke, 2/22/10)
     ssh hgwdev
     mkdir -p /cluster/data/mm9/bed/NIAGene090903
     cd /cluster/data/mm9/bed
     # NIAGene is a symlink to the dated build directory.
     ln -s NIAGene090903 NIAGene
     cd NIAGene
     # The sequence file must be T-fasta.fa.gz so that, once uncompressed, it
     # matches the T-fasta.fa name used by the /gbdb link and hgLoadSeq below.
     wget --timestamping http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-fasta.fa.gz
     wget --timestamping http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-psl.txt.gz
     gzip -d *.gz

     # Keep only the first 21 columns of the downloaded psl before loading.
     cut -f 1-21 T-psl.txt >NIAGene.tab
     hgLoadPsl mm9 NIAGene.tab

     mkdir /gbdb/mm9/NIAGene
     ln -s /cluster/data/mm9/bed/NIAGene/T-fasta.fa /gbdb/mm9/NIAGene/T-fasta.fa

     hgLoadSeq mm9 /gbdb/mm9/NIAGene/T-fasta.fa
 
 #Creating seq.tab file
 #Adding /gbdb/mm9/NIAGene/T-fasta.fa
 #257758 sequences
 #Updating seq table
 #Warning: load of seq did not go as planned: 257758 record(s), 0 row(s) skipped, 257758 warning(s) loading ./seq.tab
 #Advisory lock has been released
 #All done
 
 # not sure what the warnings are about, but the track seems working.
 
 # Create/edit/check in NIAGene.html and trackDb.ra under
     
         kent/src/hg/makeDb/trackDb/mouse/mm9
 
 #####################################################################
 # LASTZ Tetraodon TetNig2 (DONE - 2009-09-15 - Hiram)
     mkdir /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
     cd /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
 
     cat << '_EOF_' > DEF
 # mouse vs tetraodon
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/scratch/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=5
 
 # QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
 SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
 SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
 SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
 SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=50
 
 BASE=/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	about 124 minutes
     cat fb.mm9.chainTetNig2Link.txt 
     #	45642112 bases of 2620346127 (1.742%) in intersection
 
     #	running the swap
     mkdir /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
     cd /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15/DEF \
 	-qRepeats=windowmaskerSdust \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-swap > swap.log 2>&1 &
     #	real    10m34.797s
     cat fb.tetNig2.chainMm9Link.txt 
     #	41176381 bases of 302314788 (13.620%) in intersection
 
 ##############################################################################
 # BUILD REST TRACK (DONE 9/16/09, Fan)
 
     mkdir /hive/data/genomes/mm9/bed/REST
     cd /hive/data/genomes/mm9/bed/REST
 
 # Receive bed data file, REST_ChIP_PET_mm9.bed, 
 # from Rory JOHNSON [johnsonrb@gis.a-star.edu.sg].
 
     hgLoadBed mm9 REST REST_ChIP_PET_mm9.bed
 
 # Discovered mm9's extFile and history tables were out of sync.
 # Bob and Hiram fixed the problem.  Reload and it was successful.
 
 # Created REST.html based on Rory's original doc and later updates.
 # Added track definition and search term into trackDb/mouse/mm9/trackDb.ra
 
 # Fix the 0 base problem. (Fan 9/20/09, per Rory's email)
 
     hgsql mm9 -e 'update REST set chromStart = chromStart -1'
 
 ############################################################################
 # TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
 
 vertebrate-wide transMap alignments were built.  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
 
 see doc/builds.txt for specific details.
 ############################################################################
 # ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan).
 
 # Received geneNetwork ID list file, GN_mouse_RefSeq.txt, for mm9 from
 # GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].
 
     ssh hgwdev
     mkdir -p /cluster/data/mm9/bed/geneNetwork
     cd /cluster/data/mm9/bed/geneNetwork
 
     hgsql mm9 < ~/src/hg/lib/geneNetworkId.sql
     hgsql mm9 -e \
     'load data local infile "GN_mouse_RefSeq.txt" into table geneNetworkId'
 
 #########################################################################
 # LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-18 - Galt)
     # original alignment to danRer6
     cd /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17
     cat fb.danRer6.chainMm9Link.txt 
     #   77099032 bases of 1506896106 (5.116%) in intersection
 
     #	running the swap - DONE - 2009-12-18
     mkdir /hive/data/genomes/mm9/bed/blastz.danRer6.swap
     cd /hive/data/genomes/mm9/bed/blastz.danRer6.swap
     time nice +19 doBlastzChainNet.pl -verbose=2 \
         /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17/DEF \
 	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-swap >& swap.log &
     #	real    183m21.102s
     cat fb.mm9.chainDanRer6Link.txt 
     #   73444297 bases of 2620346127 (2.803%) in intersection
 
 
 #######################################################################
 # Vega gene update (DONE - 2010-01-15 - Hiram)
     #	lookup version number at the Vega WEB site:
     #	http://vega.sanger.ac.uk/index.html
     #	and FTP site:
     #	ftp://ftp.sanger.ac.uk/pub/vega/
     cd /hive/data/genomes/mm9
     #	step wise to verify operation
     doEnsGeneUpdate.pl -vegaGene -ensVersion=37 -stop=download mm9.ensGene.ra
     doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
 	-continue=process -stop=process mm9.ensGene.ra
     doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
 	-continue=load -stop=load mm9.ensGene.ra
     doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
 	-continue=cleanup mm9.ensGene.ra
     featureBits mm9 vegaGene
     # 53838752 bases of 2620346127 (2.055%) in intersection
     featureBits mm9 vegaPseudoGene
     # 3060300 bases of 2620346127 (0.117%) in intersection
 
 ######################################################################## 
 # Blastz Rabbit oryCun2 (DONE - 2010-01-15 - Hiram)
     ssh hgwdev
     screen # use screen to control this job
     mkdir /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
     cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
 
     cat << '_EOF_' > DEF
 # Mouse vs. Rabbit
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/mm9.2bit
 SEQ1_LEN=/cluster/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
 SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
 SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
 SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
 SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
 SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=400
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> do.log 2>&1 &
     cat fb.mm9.chainOryCun2Link.txt
 # 670229789 bases of 2620346127 (25.578%) in intersection
 
     #	496428446 bases of 2620346127 (18.945%) in intersection
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=syntenicNet -bigClusterHub=swarm \
 	-syntenicNet > syntenicNet.log 2>&1 &
     #	about 20 minutes
 
     #	create reciprocal best chains/nets
     ssh hgwdev
     cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
     #	this needs blastz.oryCun2 symlink to function
     time nice -n +19 doRecipBest.pl mm9 oryCun2 > rbest.log 2>&1 &
     #	real    37m32.151s
 
     mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
     cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \
 	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
 	-swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
 	> swap.log 2>&1 &
     #	real    84m6.571s
     cat fb.oryCun2.chainMm9Link.txt 
     #	669602734 bases of 2604023284 (25.714%) in intersection
 
 #########################################################################
 # ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
     mkdir /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
     cd /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
 
     cat << '_EOF_' > DEF
 # Mouse vs. Panda
 #	parameters from the Panda paper supplemental where they describe
 #	their lastz parameters
 BLASTZ_K=2200
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_H=2000
 BLASTZ_C=2
 BLASTZ_T=2
 
 # our usual M
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Panda
 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
 SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LIMIT=50
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    501m27.760s
     cat fb.mm9.chainAilMel1Link.txt 
     #	749595031 bases of 2620346127 (28.607%) in intersection
 
     mkdir /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
     cd /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04/DEF \
 	-swap -noLoadChainSplit -bigClusterHub=swarm -smallClusterHub=memk \
 	-workhorse=hgwdev \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    54m57.140s
     cat fb.ailMel1.chainMm9Link.txt 
     #	739076250 bases of 2245312831 (32.916%) in intersection
 
 ############################################################################
 # susScr1 Pig BLASTZ/CHAIN/NET (WORKING - 2010-01-21 - Hiram)
     screen # use a screen to manage this multi-day job
     mkdir /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
     cd /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
 
     cat << '_EOF_' > DEF
 # Pig vs. Mouse
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Pig SusScr1
 SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
 SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	`pwd`/DEF \
 	-noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
     #	real    875m26.114s
     cat fb.mm9.chainSusScr1Link.txt 
     #	616833828 bases of 2620346127 (23.540%) in intersection
 
     mkdir /hive/data/genomes/susScr1/bed/blastz.mm9.swap
     cd /hive/data/genomes/susScr1/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21/DEF \
 	-swap -noLoadChainSplit -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    69m27.221s
     cat fb.susScr1.chainMm9Link.txt 
     #	656445475 bases of 2231332019 (29.419%) in intersection
 
 #########################################################################
 # CRG MAPABILITY (2010-02-05 - 2010-02-09, hartera, DONE)
 # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
 # from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona
 # on 2010-02-04.
 # Data was produced using their GEM mapper aligner, taking a sliding k-mer
 # window over the mouse genome and mapping each k-mer back onto the genome
 # with up to 2 mismatches. For each window, a mappability score is computed
 # as S = 1/(number of matches found) and the BigWig index was created
 # according to this score.
 # 2010-02-09. Loaded database and added data to /gbdb/
 # Added trackDb entry for the Mapability track.
  
      mkdir -p /hive/data/genomes/mm9/bed/crgMapability
      cd /hive/data/genomes/mm9/bed/crgMapability
 cat << 'EOF' > temp
 #!/bin/tcsh -ef
 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bw.bz2
 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bw.bz2
 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bw.bz2
 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bw.bz2
 http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.bz2
 'EOF'
 
      awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
          temp > download.csh
      rm temp
      chmod +x download.csh
      ./download.csh >& download.log &
      
      # Add the data to /gbdb/ and load the file names into tables (2010-01-26)
      cd /hive/data/genomes/mm9/bed/crgMapability
      bunzip2 *.bz2
      # Add data to gbdb
      mkdir -p /gbdb/mm9/bbi/
      # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/mm9/bbi
      # and load file name into a table - one per dataset. Each table 
      # represents a subtrack.
      foreach f (`ls *.bw`)
         echo $f
         set g=`echo $f | cut -d "-" -f2`
         set num=`echo $g | cut -d "_" -f1`
         set mer=`echo "${num}mer"`
         set nf=`echo "crgMapabilityAlign${mer}.bw"`
         echo $nf
         ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf}
         hgsql mm9 -e "drop table if exists crgMapabilityAlign${mer}; \
      create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \
      insert into crgMapabilityAlign${mer} values ('/gbdb/mm9/bbi/${nf}');"
      end
 
      # Added a trackDb entry for this mapability track in
      # kent/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra
      # use bigWigInfo to check min and max values. Created a mapability.html
      # description page. 
 #####################################################################
 # tRNAs track (2010-01-15, Fan DONE)
 #
     ssh hgwdev
     cd /hive/data/genomes/mm9/bed
     mkdir tRNAs
     cd tRNAs
 
 # Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/
 
     cp -p /projects/lowelab/users/lowe/Browser/vertebrates/mm9-tRNAs.bed .
     cp -p \
     /projects/lowelab/users/lowe/Browser/vertebrates/mm9_tRNAs_images.tar.gz\
     .
 
     hgsql mm9 -e 'drop table if exists tRNAs'
     hgLoadBed -tab mm9 tRNAs mm9-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql
 
     mkdir gif
     cd gif
     gzip -d ../mm9_tRNAs_images.tar.gz
     tar -xvf ../mm9_tRNAs_images.tar
     mkdir /hive/data/gbdb/mm9/RNA-img
     cp -p * /hive/data/gbdb/mm9/RNA-img
 
 #####################################################################
 # LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram)
     #	use a screen to control this job
     screen
     mkdir /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
     cd /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
 
     cat << '_EOF_' > DEF
 # mouse vs marmoset
 BLASTZ_M=50
 
 # TARGET: Mouse Mm9
 SEQ1_DIR=/scratch/data/mm9/nib
 SEQ1_LEN=/scratch/data/mm9/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Marmoset (calJac3)
 SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
 SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
 SEQ2_LIMIT=75
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	-verbose=2 `pwd`/DEF \
 	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	> do.log 2>&1 &
     #	real    445m42.381s
     cat fb.mm9.chainCalJac3Link.txt 
     #	859869647 bases of 2620346127 (32.815%) in intersection
 
     mkdir /hive/data/genomes/calJac3/bed/blastz.mm9.swap
     cd /hive/data/genomes/calJac3/bed/blastz.mm9.swap
     time nice -n +19 doBlastzChainNet.pl -verbose=2 \
 	/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12/DEF \
 	-swap -syntenicNet \
 	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
 	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
     #	real    90m38.739s
     cat fb.calJac3.chainMm9Link.txt
     #	861811978 bases of 2752505800 (31.310%) in intersection
 
 #####################################################################