src/hg/makeDb/doc/mm8.txt 1.76

1.76 2010/01/07 20:47:49 rhead
Added note about renaming mm8 jaxQTL table to jaxQtl.
Index: src/hg/makeDb/doc/mm8.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm8.txt,v
retrieving revision 1.75
retrieving revision 1.76
diff -b -B -U 1000000 -r1.75 -r1.76
--- src/hg/makeDb/doc/mm8.txt	20 Sep 2009 17:16:45 -0000	1.75
+++ src/hg/makeDb/doc/mm8.txt	7 Jan 2010 20:47:49 -0000	1.76
@@ -1,9647 +1,9651 @@
 # for emacs: -*- mode: sh; -*-
 
 
 # This file describes browser build for the mouse
 # genome, February 2006, ncbi mouse_36 - Mm8
 #
 #	"$Id$"
 #
 
 #  NOTE:  this doc may have genePred loads that fail to include
 #  the bin column.  Please correct that for the next build by adding
 #  a bin column when you make any of these tables:
 #
 #  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
 #  +-------------+-------------------------------------+
 #  | tableName   | type                                |
 #  +-------------+-------------------------------------+
 #  | knownGene   | genePred knownGenePep knownGeneMrna |
 #  | refGene     | genePred refPep refMrna             |
 #  | xenoRefGene | genePred xenoRefPep xenoRefMrna     |
 #  | mgcGenes    | genePred                            |
 #  | ensGene     | genePred ensPep                     |
 #  | genscan     | genePred genscanPep                 |
 #  +-------------+-------------------------------------+
 
 
 #######################################################################
 # DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2006-02-14 - Hiram)
 #
 #	Examine disk space issues, find some goodly amount of space
     ssh kkstore01
     mkdir /cluster/store9/mm8
     ln -s /cluster/store9/mm8 /cluster/data/mm8
     cd /cluster/data/mm8
     mkdir ncbi
     cd ncbi
     cp -p /cluster/data/mm7/ncbi/.wgetrc .
     WGETRC=`pwd`/.wgetrc
     export WGETRC
     wget --timestamping --force-directories --directory-prefix=. \
 	--dont-remove-listing --recursive --level=4 --no-parent \
 	--no-host-directories --cut-dirs=1 \
 	ftp://ftp-private.ncbi.nih.gov/mouse_36
     #	Downloaded: 2,201,934,141 bytes in 50 files
     #	real    44m48.975s
 
     #	The pre-release sequence, Feb 27th:
     mkdir /cluster/data/mm8/pre_release
     cd /cluster/data/mm8/pre_release
     #	The .wgetrc is the anonymous user
     WGETRC=`pwd`/.wgetrc
     export WGETRC
     wget --timestamping --force-directories --directory-prefix=. \
 	--dont-remove-listing --recursive --level=4 --no-parent \
 	--no-host-directories --cut-dirs=3 \
 	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release
 
 #  Fixup the agp and seq_contig.md files to add chrM
 #  No chrM or chrMT was delivered.  Copy from previous assembly
     ssh kkstore01
     cd /cluster/data/mm8/ncbi/chrfasta
     cp -p /cluster/data/mm7/ncbi/chrfasta/chrM.fa.gz .
     cd ../contigfasta
     cp -p /cluster/data/mm7/ncbi/contigfasta/chrM.fa.gz .
 #	with a fixed up header line to be like all the others:
 #	>lcl|chrM.fa gi|34538597|ref|NC_005089.1| Mus musculus mitochondrion, complete genome
 
     cd /cluster/data/mm8
     zcat ncbi/allrefcontig.chr.agp.gz > allrefcontig.chr.agp
     echo -e "chrM\t1\t16299\t1\tF\tAY172335.1\t1\t16299\t+" >> \
 	allrefcontig.chr.agp
     gzip allrefcontig.chr.agp
     #	I don't see allcontig.agp being used anywhere else ?
     # zcat ncbi/allcontig.agp.gz > allcontig.agp
     # echo -e "NC_005089\t1\t16299\t1\tF\tAY172335\t\t1\t16299\t+" >> \
     #	    allcontig.agp
     # gzip allcontig.agp
     zcat ncbi/seq_contig.md.gz | egrep -v "Celera|129_substrain" \
 	| sed -e "238i\
 10090\tM\t0\t0\t+\tstart\t-1\tCONTIG\tC57BL/6J\t10\n\
 10090\tM\t1\t16299\t+\tNC_005089\tGI:34538597\tCONTIG\tC57BL/6J\tna\n\
 10090\tM\t16299\t16299\t+\tend\t-2\tCONTIG\tC57BL/6J\t10" > seq_contig.md
     #	(curiously, this sed command would not work on hgwdev,
     #	only when logged into kkstore01 ?)
     #	The line number 238 was found by checking the contents of
     #	ncbi/seq_contig.md.gz (after the egrep filter) and it was
     #	the line starting with:
     #	10090   Un|NT_039877    1       35798
     #	Wanted this chrM information before that line.
     #   summarize sequence counts
 
     mkdir faCounts
     time faCount ncbi/chrfasta/chr*.fa.gz > faCounts/chrfasta.faCount 2>&1 &
     #	about 1.5 minutes
     time faCount ncbi/contigfasta/chr*.fa.gz > \
 	faCounts/contigfasta.faCount 2>&1 &
     #	about 3 minutes
     time zcat ncbi/chrfasta/chr*.fa.gz | grep "^>" > \
 	faCounts/chrfasta.headers 2>&1 &
     time zcat ncbi/contigfasta/chr*.fa.gz | grep "^>" > \
 	faCounts/contigfasta.headers 2>&1 &
     #	about 2 minutes each for the above two zcat/greps
 
 #############################################################################
 #  BREAK UP SEQUENCE INTO 5 MB CHUNKS at NON-BRIDGED CONTIGS
 #			(DONE - 2006-02-14 - Hiram)
 #########  Are these necessary ?  They may no longer be needed.
 #########  TRF can run on full chroms on the kki kluster
 #	It would be better to use "." in place of /cluster/data/mm8
 #	for the outputDir argument to splitFaIntoContigs, so that this script
 #	is independent of specific locations and simply works in the
 #	current directory.
     ssh kkstore01
     cd /cluster/data/mm8
     for F in ncbi/chrfasta/chr*.fa.gz
     do
 	CHR=`basename ${F} | sed -e "s/.fa.gz//; s/chr//"`
 	echo ${CHR} ${F}
 	mkdir -p "${CHR}"
 	zcat allrefcontig.chr.agp.gz | \
 	    perl -we "while(<>){if (/^chr${CHR}\t/) {print;}}" > \
 		${CHR}/chr${CHR}.agp
 	zcat ncbi/chrfasta/chr${CHR}.fa.gz | \
 	    perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' | \
 		splitFaIntoContigs ${CHR}/chr${CHR}.agp \
 		    stdin /cluster/data/mm8 -nSize=5000000
     done
     #	The above loop takes about 5 minutes
     #	Some of these in the chr1 directory got overwritten on 2006-02-27
     #	during an attempt to verify that the pre-release directory at
     #	NCBI was the same as what we worked with here.
 
 #############################################################################
 # CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 2006-02-14 - Hiram)
     ssh kkstore01
     mkdir /cluster/data/mm8/jkStuff
     cd /cluster/data/mm8
     mkdir Un tmp
     cp -p /cluster/data/mm7/jkStuff/ncbiFixAgp ./jkStuff
     zcat allrefcontig.chr.agp.gz | ./jkStuff/ncbiFixAgp /dev/stdin > \
 	allrefcontig.chr.ordered.agp
     #	Set the appropriate release number here, this one is 36
     #	Fetch the script from the previous assembly
     sed -e "s/buildNum = 35/buildNum = 36/" \
 	/cluster/data/mm7/jkStuff/ncbiToRandomAgps > \
 	    jkStuff/ncbiToRandomAgps
     chmod +x jkStuff/ncbiToRandomAgps
     #	NOTE ! * ! This mm8 contig.idmap now includes the celera assembly
     #	Filter that out for use here.
     #	There were two broken lines that began _36 - they were removed
     #	after I reported them and the contig.idmap.gz file here was
     #	updated later.
     zcat ncbi/contig.idmap.gz | grep ref_strain | grep -v "^_36" \
 	| ./jkStuff/ncbiToRandomAgps seq_contig.md \
 		allrefcontig.chr.ordered.agp \
                         /dev/stdin . 2> dbg
     for C in ? ??
     do
 	if [ -s ${C}/chr${C}_random.ctg.agp ]; then
 	    echo "building ${C}/chr${C}_random.fa"
 	    rm -f ./tmp.fa
 	    zcat ncbi/contigfasta/chr${C}.fa.gz | \
 		perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' > ./tmp.fa
 	    agpToFa -verbose=2 -simpleMulti \
 		${C}/chr${C}_random.ctg.agp chr${C}_random \
 		    ${C}/chr${C}_random.fa ./tmp.fa
 	    rm -f ./tmp.fa
 	fi
     done > tmp/agpToFa.out 2>&1
     #	the above loop takes about 3 minutes, examine the tmp/agpToFa.out
     #	record for any errors
 
     #	We need the lift information from these random.ctg.agp files
     cp -p /cluster/data/mm7/jkStuff/agpToLift.pl ./jkStuff
     for AGP in ?/*_random.ctg.agp ??/*_random.ctg.agp
 do
     CHR=`dirname ${AGP}`
     echo ${CHR}
     mkdir -p ${CHR}/lift
     ./jkStuff/agpToLift.pl ${AGP} > ${CHR}/lift/ctg_random.lft
 done
     # Clean these up to avoid confusion later... they're easily rebuilt
     #   with the ncbiToRandomAgps script above
     rm ?/*_random.ctg.agp ??/*_random.ctg.agp
     gzip seq_contig.md allrefcontig.chr.ordered.agp
 
 #############################################################################
 # BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS
 #					(DONE 2006-02-14 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
     for C in ? ??
     do
 	if [ -s ${C}/chr${C}_random.fa ]; then
 	    splitFaIntoContigs  -nSize=5000000 ${C}/chr${C}_random.agp \
 		${C}/chr${C}_random.fa .
 	    mkdir -p ${C}/lift
 	    rm -f ${C}/lift/rOut.lst ${C}/lift/random.lft ${C}/lift/random.lst
 	    mv ${C}_random/lift/oOut.lst ${C}/lift/rOut.lst
 	    mv ${C}_random/lift/ordered.lft ${C}/lift/random.lft
 	    mv ${C}_random/lift/ordered.lst ${C}/lift/random.lst
 	    rmdir ${C}_random/lift
 	    rm ${C}_random/chr${C}_random.agp ${C}_random/chr${C}_random.fa
 	    rm -rf ${C}/chr${C}_random_*
 	    mv ${C}_random/chr${C}_random_* ${C}
 	    rmdir ${C}_random
 	fi
     done > tmp/split.out 2>&1
     #	the above loop takes less than a minute
     #	scan the tmp/split.out file for possible errors
 
 #############################################################################
 # MAKE LIFTALL.LFT (DONE - 2006-02-14 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
     cat ?/lift/*.lft ??/lift/*.lft > jkStuff/liftAll.lft
 
 #############################################################################
 # CREATING DATABASE (DONE - 2006-02-14 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
     faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
 	mm8.2bit
     twoBitInfo mm8.2bit stdout | sort -rn +1 > chrom.sizes
     grep -v random chrom.sizes | cut -f1 | sed -e "s/chr//" > chrom.lst
     twoBitInfo mm8.2bit stdout |
         awk '{printf "%s\t%s\t/gbdb/mm8/mm8.2bit\n", $1,$2}' > chromInfo.tab
 
     ssh hgwdev
     cd /cluster/data/mm8
     hgsql -e "create database mm8;" mysql
     #	Make sure we have enough room (eventually ~ 70Gb) for mysql tables:
     df -h | grep mysql
     #	/dev/sda1             472G  225G  223G  51% /var/lib/mysql2
     #	/dev/sdc1             1.8T  1.5T  190G  89% /var/lib/mysql
 
     # CREATING GRP TABLE FOR TRACK GROUPING (DONE - 2006-02-14 - Hiram)
     #   Use any of the newest databases to ensure that the organization
     #   of the grp table is up to date
     ssh hgwdev
     hgsql mm8 -e "create table grp (PRIMARY KEY(NAME)) select * from hg18.grp"
     hgsql mm8 < $HOME/kent/src/hg/lib/chromInfo.sql
     hgsql mm8 -e 'load data local infile "chromInfo.tab" into table chromInfo;'
 
     # Enter mm8 into dbDb and defaultDb so test browser knows about it:
     hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
         defaultPos, active, orderKey, genome, scientificName, \
         htmlPath, hgNearOk, hgPbOk, sourceName) \
         VALUES("mm8", "Feb 2006", "/gbdb/mm8", "Mouse", \
         "chr6:28912411-28925620", 1, 22, "Mouse", \
         "Mus musculus", "/gbdb/mm8/html/description.html", 0, 0, \
         "NCBI Build 36");' -h localhost hgcentraltest
     #	Reset default position to be like Mm7, 2006-03-09 - Hiram
     hgsql -e \
 'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm8";' \
 	hgcentraltest
     #	Do *NOT* set default genome on genome-test until ready for release
     # hgsql hgcentraltest \
     #	-e 'update defaultDb set name="mm8" where genome="Mouse";'
     # start a new entry in the trackDb hierarchy
     cd $HOME/kent/src/hg/makeDb/trackDb/mouse
     mkdir mm8
     cvs add mm8
     cd mm8
     cp ../mm7/description.html .
     vi description.html - fixup text for this assembly
     cvs add description.html
     cvs commit
     cd ../..
     vi makefile - add mm8 to the list
     mkdir /cluster/data/mm8/html
     mkdir /gbdb/mm8
     ln -s /cluster/data/mm8/html /gbdb/mm8/html
     ln -s /cluster/data/mm8/mm8.2bit /gbdb/mm8/mm8.2bit
     cp -p mouse/mm8/description.html /gbdb/mm8/html
     make DBS=mm8
 
 #############################################################################
 #  GOLD GAP tracks (DONE - 2006-02-14 - Hiram)
     ssh hgwdev
     cd /cluster/data/mm8
     #	make sure these tmp contig agp files are gone, easily generated
     #	as above with jkStuff/ncbiToRandomAgps
     mkdir ffa
     zcat ncbi/sequence.inf.gz > ffa/sequence.inf
     hgGoldGapGl -chromLst=chrom.lst mm8 /cluster/data/mm8 .
     featureBits mm8 gold
     #	2567283971 bases of 2567283971 (100.000%) in intersection
     featureBits mm7 gold
     #	2583394090 bases of 2583394090 (100.000%) in intersection
     featureBits mm6 gold
     #	2597150411 bases of 2597150411 (100.000%) in intersection
     featureBits mm5 gold
     #	2615483787 bases of 2615483787 (100.000%) in intersection
     featureBits mm4 gold
     #	2627444668 bases of 2627444668 (100.000%) in intersection
 
     featureBits mm8 gap
     #	97171117 bases of 2567283971 (3.785%) in intersection
     featureBits mm7 gap
     #	264323239 bases of 2583394090 (10.232%) in intersection
     featureBits mm6 gap
     #	482483041 bases of 2597150411 (18.577%) in intersection
     featureBits mm5 gap
     #	549468286 bases of 2615483787 (21.008%) in intersection
     featureBits mm4 gap
     #	325167539 bases of 2627444668 (12.376%) in intersection
 
 #############################################################################
 # GC5BASE (DONE - 2006-02-14 - Hiram)
     ssh kkstore01
     mkdir -p /cluster/data/mm8/bed/gc5Base
     cd /cluster/data/mm8/bed/gc5Base
     time hgGcPercent -wigOut -doGaps -file=stdout -win=5 mm8 \
         /cluster/data/mm8 | wigEncode stdin gc5Base.wig gc5Base.wib
 
     #       Calculating gcPercent with window size 5
     #       Using twoBit: /cluster/data/mm8/mm7.2bit
     #       File stdout created
     #	Converted stdin, upper limit 100.00, lower limit 0.00
 
     #	runs for about 14 minutes
 
     #	load database
     ssh hgwdev
     cd /cluster/data/mm8/bed/gc5Base
     mkdir /gbdb/mm8/wib
     ln -s `pwd`/gc5Base.wib /gbdb/mm8/wib
     time hgLoadWiggle -pathPrefix=/gbdb/mm8/wib mm8 gc5Base gc5Base.wig
     #	29 second load time
 
     #	verify index is correct:
     hgsql mm8 -e "show index from gc5Base;"
     #	should see good numbers in Cardinality column
 
 #############################################################################
 #  DISTRIBUTE SEQUENCE TO INTERMEDIATE SERVERS FOR KLUSTER RUNS
 #	(DONE - 2006-02-14 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
 
     # break up into 500,000 sized chunks for repeat masker runs
 TOP=`pwd`
 export TOP
 for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
         ??/chr??_random_[0-9]*
 do
     ctg=`basename ${CTG_DIR}`
     cd ${CTG_DIR}
     faSplit size ${ctg}.fa 500000 ${ctg}_ -lift=${ctg}.lft -maxN=500000
     cd ${TOP}
 done > tmp/ctg_split.out 2>&1
     #	about 3 minutes, check the tmp/ctg_split.out for anything unusual
 
     #	make a list of the contigs
 TOP=`pwd`
 export TOP
 for CTG_DIR in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
         ??/chr??_random_[0-9]*
 do
     ctg=`basename ${CTG_DIR}`
     cd ${CTG_DIR}
     ls ${ctg}_* | while read F
     do
         echo ${CTG_DIR}/${F}
     done
     cd ${TOP}
 done > contig500K.lst
     #	count 'em
     wc -l contig500K.lst
     #	5772   contig500K.lst
 
     mkdir /cluster/bluearc/scratch/hg/mm8
     mkdir /cluster/bluearc/scratch/hg/mm8/contigs
     rsync -a --progress --files-from=contig500K.lst . \
         /cluster/bluearc/scratch/hg/mm8/contigs/
 
     #	verify the contig copy above functioned OK
     cd /cluster/bluearc/scratch/hg/mm8
     find ./contigs -type f | wc -l
     #	 5772
 
 #############################################################################
 # SIMPLE REPEAT TRACK (DONE - 2006-02-14 Hiram)
     # TRF can be run in parallel with RepeatMasker
     #   since it doesn't require masked input sequence.
     ssh kkr1u00
     mkdir /iscratch/i/mm8
     cd /iscratch/i/mm8
     mkdir fa
     cd fa
     cp -p /cluster/data/mm8/?/*.fa .
     cp -p /cluster/data/mm8/??/*.fa .
 
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
     done
 
     ssh kki 
     mkdir /cluster/data/mm8/bed/simpleRepeat
     cd /cluster/data/mm8/bed/simpleRepeat
 
     mkdir trf
     cat << '_EOF_' > runTrf
 #!/bin/csh -fe 
 #
 set path1 = /iscratch/i/mm8/fa/$1
 set inputFN = $1  
 set outpath = $2
 set outputFN = $2:t
 mkdir -p /scratch/tmp/$outputFN
 cp $path1 /scratch/tmp/$outputFN
 pushd .
 cd /scratch/tmp/$outputFN
 /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $inputFN /dev/null -bedAt=$outputFN -tempDir=/scratch/tmp
 popd
 rm -f $outpath
 cp -p /scratch/tmp/$outputFN/$outputFN $outpath
 rm -fr /scratch/tmp/$outputFN/*
 rmdir --ignore-fail-on-non-empty /scratch/tmp/$outputFN
 '_EOF_'
     # << happy emacs
     chmod +x runTrf
 
     cat << '_EOF_' > template
 #LOOP
 ./runTrf $(path1) {check out line trf/$(root1).bed}
 #ENDLOOP
 '_EOF_'
     # << keep emacs coloring happy
 
     ls -1S /iscratch/i/mm8/fa > genome.lst
     gensub2 genome.lst single template jobList
     para create jobList
     para try ... check ... push ... etc
     para time
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:      14385s     239.75m     4.00h    0.17d  0.000 y
 # IO & Wait Time:                   794s      13.24m     0.22h    0.01d  0.000 y
 # Average job time:                 446s       7.44m     0.12h    0.01d
 # Longest finished job:            1437s      23.95m     0.40h    0.02d
 # Submission to last job:          1685s      28.08m     0.47h    0.02d
 
     # Load into the database
     ssh hgwdev
     cd /cluster/data/mm8/bed/simpleRepeat
     cat trf/chr*.bed > simpleRepeat.bed
     hgLoadBed -strict mm8 simpleRepeat simpleRepeat.bed \
       -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
     #	Loaded 1141941 elements of size 16
 
     featureBits mm8 simpleRepeat
     # 77752377 bases of 2567283971 (3.029%) in intersection
     featureBits mm7 simpleRepeat
     # 77021175 bases of 2583394090 (2.981%) in intersection
     featureBits mm6 simpleRepeat
     # 83220723 bases of 2597150411 (3.204%) in intersection
     featureBits mm5 simpleRepeat
     # 81414259 bases of 2615483787 (3.113%) in intersection
     featureBits mm4 simpleRepeat
     # 82600648 bases of 2627444668 (3.144%) in intersection
     featureBits mm3 simpleRepeat
     # 75457193 bases of 2505900260 (3.011%) in intersection
 
 ###########################################################################
 # CREATE MICROSAT TRACK (done 2006-7-5 JK)
      ssh hgwdev
      cd /cluster/data/mm8/bed
      mkdir microsat
      cd microsat
      awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' ../simpleRepeat/simpleRepeat.bed > microsat.bed 
     /cluster/bin/i386/hgLoadBed mm8 microsat microsat.bed
 
 
 #############################################################################
 # PROCESS SIMPLE REPEATS INTO MASK (DONE - 2006-02-14 - Hiram)
 
     # After the simpleRepeats track has been built, make a filtered version
     # of the trf output: keep trf's with period <= 12:
     ssh kkstore01
     cd /cluster/data/mm8/bed/simpleRepeat
     mkdir trfMask
     for F in trf/chr*.bed
     do
 	echo "${F} -> ${F/trf\//}"
 	awk '{if ($5 <= 12) print;}' ${F} > trfMask/${F/trf\//}
     done
 
 #############################################################################
 # REPEATMASKER RUN (after contigs have been distributed to bluearc FS)
 #	(DONE - 2006-02-14 - 2006-02-15 - Hiram)
 #	Record RM version used:
     cat /cluster/bluearc/RepeatMasker060120/Libraries/version
 #   RM database version 20060120
     ssh pk
 
     #- Make the run directory and job list:
     mkdir /cluster/data/mm8/RMRun
     cd /cluster/data/mm8/RMRun
     cat << '_EOF_' > ../jkStuff/RMMouse
 #!/bin/csh -fe
 set C = $1:h
 set F = $1:t
 set R = $F:r
 cd /cluster/data/mm8/$C
 /bin/mkdir -p /scratch/tmp/mm8/$R
 /bin/cp /cluster/bluearc/scratch/hg/mm8/contigs/$1 /scratch/tmp/mm8/$R
 pushd /scratch/tmp/mm8/$R
 /cluster/bluearc/RepeatMasker060120/RepeatMasker -ali -s -species mus $F
 popd
 /bin/cp /scratch/tmp/mm8/$R/$R.fa.out ./
 if (-e /scratch/tmp/mm8/$R/$R.fa.align) /bin/cp /scratch/tmp/mm8/$R/$R.fa.align ./
 if (-e /scratch/tmp/mm8/$R/$R.fa.tbl) /bin/cp /scratch/tmp/mm8/$R/$R.fa.tbl ./
 if (-e /scratch/tmp/mm8/$R/$R.fa.cat) /bin/cp /scratch/tmp/mm8/$R/$R.fa.cat ./
 /bin/rm -fr /scratch/tmp/mm8/$R/*
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8/$R
 /bin/rmdir --ignore-fail-on-non-empty /scratch/tmp/mm8
 '_EOF_'
     #	<< happy emacs
     chmod +x ../jkStuff/RMMouse
 
     cat << '_EOF_' > template
 #LOOP
 ../jkStuff/RMMouse $(path1) {check out line ../$(dir1)/$(root1).fa.out}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 ../contig500K.lst single template jobList
     para create jobList
     wc -l jobList
     #	5772 jobList
     para try ... check ... push ... etc
 # Completed: 6172 of 6172 jobs
 # CPU time in finished jobs:   26381042s  439684.03m  7328.07h  305.34d  0.837 y
 # IO & Wait Time:                 46088s     768.13m    12.80h    0.53d  0.001 y
 # Average job time:                4282s      71.36m     1.19h    0.05d
 # Longest finished job:            6370s     106.17m     1.77h    0.07d
 # Submission to last job:        127318s    2121.97m    35.37h    1.47d
 
     #- Lift up the split-contig .out's to contig-level .out's
     ssh kkstore01
     cd /cluster/data/mm8
     for D in ?/chr?_[0-9]* ??/chr??_[0-9]* ?/chr?_random_[0-9]* \
 	??/chr??_random_[0-9]*
     do
 	CONTIG=`basename ${D}`
 	liftUp ${D}/${CONTIG}.fa.out ${D}/${CONTIG}.lft error \
 		${D}/${CONTIG}_[0-9]*.fa.out
     done > tmp/RM.lift.outs 2>&1
     #	real    2m32.275s
     #	scan tmp/RM.lift.outs for unusual errors or difficulties
 
     cat << '_EOF_' > jkStuff/liftRM_out_to_chr.sh
 #!/bin/sh
 for C in ? ??
 do
     echo "lifting ${C}"
     cd ${C}
     if [ -s lift/ordered.lft ]; then
 	liftUp chr${C}.fa.out lift/ordered.lft error `cat lift/oOut.lst`
     else
 	echo "WARNING: Can not find ${C}/lift/ordered.lft"
     fi
     if [ -s lift/random.lft ]; then
 	liftUp chr${C}_random.fa.out lift/random.lft error `cat lift/rOut.lst`
     fi
     cd ..
 done
 '_EOF_'
     # << happy emacs
     chmod +x jkStuff/liftRM_out_to_chr.sh
     ./jkStuff/liftRM_out_to_chr.sh > tmp/liftRM_out_to_chr.out 2>&1
     #	real    0m24.873s
     #	scan the results tmp/liftRM_out_to_chr.out
     #	there is a single: WARNING: Can not find Un/lift/ordered.lft
     #	which is OK
     #	List the final .out files, nothing should be size 0:
     ls -og */*.fa.out | sort -k3,3nr
 
     #- Load the .out files into the database with:
     ssh hgwdev
     cd /cluster/data/mm8
     hgLoadOut mm8 ?/chr?.fa.out ??/chr??.fa.out ?/chr?_random.fa.out \
 	??/chr??_random.fa.out > tmp/hgLoadOut.out 2>&1
     #	about 7 minutes; there are always a few errors reported,
     #	see tmp/hgLoadOut.out for the messages
 
     #	verify everything seems normal compared with previous builds
 
     featureBits mm8 rmsk
     #	1087735582 bases of 2567283971 (42.369%) in intersection
     featureBits mm7 rmsk
     #	1092611581 bases of 2583394090 (42.294%) in intersection
     featureBits mm6 rmsk
     #	1110222842 bases of 2597150411 (42.748%) in intersection
     featureBits mm5 rmsk
     #	1137310280 bases of 2615483787 (43.484%) in intersection
     featureBits mm4 rmsk
     #	1130883581 bases of 2627444668 (43.041%) in intersection
     featureBits mm3 rmsk
     #	1080265553 bases of 2505900260 (43.109%) in intersection
 
 #############################################################################
 # PROCESS REPEAT MASKER AND SIMPLE REPEATS INTO MASKED SEQUENCE
 #		(DONE - 2006-02-16 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
     time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
     do
 	FA=${CHR#*\/}
 	C=${FA%.fa}
 	echo -n "repeat masking ${C} ... "
 	/cluster/bin/i386/maskOutFa -soft ${CHR} ${CHR}.out ${CHR}
 	echo -n "adding simpleRepeats ... "
 	/cluster/bin/i386/maskOutFa -softAdd ${CHR} \
 		bed/simpleRepeat/trfMask/${C}.bed ${CHR}
 	echo "done - ${CHR}"
     done > tmp/addRM_and_Simple.out 2>&1
     #	about 4 minutes
 
     # you will note the usual warnings about troublesome coordinates
     # in the repeat masker outputs - even more than when they were lifted.
 
     #	and make the hard masked sequences from these soft masked sequences
     time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
     do
 	echo "maskOutFa ${CHR} hard ${CHR}.masked"
 	/cluster/bin/i386/maskOutFa ${CHR} hard ${CHR}.masked
     done > tmp/hardMask.out 2>&1
     #	about 2 minutes
 
     #	rebuild the 2bit file
     time faToTwoBit ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa \
 	mm8Soft.2bit
     #	2 minutes
     #	verify the sequence is still the same size as before:
     twoBitInfo mm8Soft.2bit stdout | sort -rn +1 | sum -r
     #	20673     1
     sum -r chrom.sizes
     #	20673     1
     #	Let's see how much is masked:
     time twoBitToFa mm8Soft.2bit stdout | faSize stdin
     #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
     #	1089350685 lower) in 34 sequences in 1 files
     # and bc says:
     #	1089350685/2664455088 = .408845
     #	1089350685/2567283688 = .424320
 
     #	replace the former unmasked 2bit file with this new one:
     rm mm8.2bit; mv mm8Soft.2bit mm8.2bit
     #	check the browser, make sure it is functioning OK
 
     #	Generate fasta file for random contigs
     #	THIS IS OPTIONAL STUFF, not strictly needed, although it is used in
     #	genscan to make the gene names there look pretty.  This script
     #	has been checked into the source tree as hg/utils/lft2BitToFa.pl;
     #	use it from there next time.
     cp -p /cluster/data/mm7/jkStuff/lft2BitToFa.pl ./jkStuff
 
     mkdir randomContigs
     for L in ?/lift/ctg_random.lft ??/lift/ctg_random.lft
 do
     D=${L/\/lift*}
     echo $L $D
     ./jkStuff/lft2BitToFa.pl mm8.2bit ${L} \
 	> randomContigs/chr${D}_random.ctg.fa
 done
     #
     #	Verify these *.ctg.fa files have the same bases as the ordinary
     #	chr*_random.fa files:
     faSize ?/chr?_random.fa ??/chr??_random.fa
     # 20361100 bases (3250000 N's 17111100 real 7094373 upper 10016727 lower)
     #	in 12 sequences in 12 files
 
     faSize randomContigs/*.ctg.fa
     # 17111100 bases (0 N's 17111100 real 7094373 upper 10016727 lower)
     #	in 77 sequences in 12 files
     #	Note the number of real, upper and lower bases are the same
 
     #	This random contig business isn't actually needed
     #	Create a 2bit file with the full chrom sequences and these
     #	random contigs for use in blastz:
     # faToTwoBit ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
     #	    mm8Chroms_RandomContigs.2bit
 
     #	Copy to bluearc unit for kluster runs
     # cp -p mm8.2bit /cluster/bluearc/mm8
     # cp -p mm8Chroms_RandomContigs.2bit /cluster/bluearc/mm8
     #	And the lift file to go with it
     # cat ?/lift/ctg_random.lft ??/lift/ctg_random.lft \
     #	    > jkStuff/Chroms_RandomContigs.lft
     #	cp -p jkStuff/Chroms_RandomContigs.lft /cluster/bluearc/mm8
 
     #	create full chrom nibs for blastz SEQ1 target with Lin Spec Repeats
     mkdir nib
     for FA in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
 do
     B=${FA/*\/}
     B=${B/.fa/}
     echo faToNib -softMask ${FA} nib/${B}.nib
     rm -f nib/${B}.nib
     faToNib -softMask ${FA} nib/${B}.nib
 done
  
     mkdir /cluster/bluearc/scratch/hg/mm8/nib
     cp -p nib/*.nib /cluster/bluearc/scratch/hg/mm8/nib
     cp -p chrom.sizes /cluster/bluearc/scratch/hg/mm8
     cp -p mm8.2bit /cluster/bluearc/scratch/hg/mm8
     #	The contigs over there are no longer needed
     rm -fr /cluster/bluearc/scratch/hg/mm8/contigs
     #	after lineage specific repeats are created below, this business
     #	can be pushed to the kluster kk nodes and over to the Iservers
 
 #############################################################################
 # PREPARE "bigZips" files for public release
 #	(DONE 2006-02-16 - Hiram)
     ssh kkstore01
     mkdir /cluster/data/mm8/downloads
     mkdir /cluster/data/mm8/downloads/bigZips
     mkdir /cluster/data/mm8/downloads/chromosomes
     cd /cluster/data/mm8/downloads/chromosomes
     cp -p ../../?/chr?.fa ../../??/chr??.fa \
 	../../?/chr?_random.fa ../../??/chr??_random.fa .
     gzip chr*.fa
     #	12 minutes
     #	copy previous release README.txt
     scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm8/chromosomes/README.txt .
     #	edit it to bring it up to date
     cd /cluster/data/mm8/downloads/bigZips
     #	copy previous release README.txt
     scp hgwdev:/usr/local/apache/htdocs/goldenPath/mm8/bigZips/README.txt .
     # edit README.txt to indicate proper version of sequence and
     #	RepeatMasker
     cd /cluster/data/mm8
     cp -p ?/chr*.fa ??/chr*.fa downloads/chromosomes
     tar cvzf downloads/bigZips/chromAgp.tar.gz ?/chr*.agp ??/chr*.agp
     tar cvzf downloads/bigZips/chromFa.tar.gz ?/chr*.fa ??/chr*.fa
     #	12 minutes
     tar cvzf downloads/bigZips/chromFaMasked.tar.gz ?/chr*.fa.masked \
 	??/chr*.fa.masked
     tar cvzf downloads/bigZips/chromOut.tar.gz ?/chr*.fa.out ??/chr*.fa.out
     cd /cluster/data/mm8/bed/simpleRepeat
     tar cvzf ../../downloads/bigZips/chromTrf.tar.gz ./trfMask
 
     # get GenBank native mRNAs and refGene (DONE 2006-02-23)
     #	after the genbank run was complete
     ssh hgwdev
     cd /cluster/data/genbank
     time ./bin/i386/gbGetSeqs -db=mm8 -native GenBank mrna \
 	/cluster/data/mm8/downloads/bigZips/mrna.fa
     #	2 minutes
     cd /cluster/data/mm8/downloads/bigZips
     gzip mrna.fa
     cd /cluster/data/mm8/downloads/bigZips
     for I in 1000 2000 5000
     do
 	echo "upstream${I} working ... "
 	featureBits mm8 refGene:upstream:${I} -fa=stdout \
 		| gzip -c > upstream${I}.fa.gz
 	echo "upstream${I} done"
     done
     #	real    11m25.493s
 
     ssh kkstore01
     cd /cluster/data/mm8/downloads/bigZips
     cp -p ../../mm8.2bit .
     md5sum *.gz *.2bit README.txt > md5sum.txt
 
     ssh hgwdev
     mkdir -p /usr/local/apache/htdocs/goldenPath/mm8
     ln -s /cluster/data/mm8/downloads/bigZips \
 	/usr/local/apache/htdocs/goldenPath/mm8/bigZips
     ln -s /cluster/data/mm8/downloads/chromosomes \
 	/usr/local/apache/htdocs/goldenPath/mm8/chromosomes
 
 #############################################################################
 # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2006-02-16 - Hiram)
 
     ssh kkr1u00
     mkdir /iscratch/i/mm8/rmsk
     cd /cluster/data/mm8
     cp -p */chr*.fa.out /iscratch/i/mm8/rmsk
     cd /iscratch/i/mm8
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
     done
     cd rmsk
 
     ssh kki
     mkdir /cluster/data/mm8/linSpecRep
     cd /cluster/data/mm8/linSpecRep
     ls -1S /iscratch/i/mm8/rmsk > fa.list
     
     cat << '_EOF_' > mkLSR.csh
 #!/bin/csh -fe
 pushd /iscratch/i/mm8/rmsk
 /cluster/bluearc/RepeatMasker060120/DateRepeats \
 	    $1 -query mouse -comp human -comp rat -comp dog -comp cow \
 		-comp rabbit
 popd
 /bin/cp -p /iscratch/i/mm8/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus .
 '_EOF_'
     #	<< happy emacs
     chmod +x mkLSR.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./mkLSR.csh $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus_oryctolagus-cuniculus}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 fa.list single template jobList
     para try ... check ... push ... etc...
     para time
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:       1338s      22.29m     0.37h    0.02d  0.000 y
 # IO & Wait Time:                   112s       1.87m     0.03h    0.00d  0.000 y
 # Average job time:                  43s       0.71m     0.01h    0.00d
 # Longest finished job:              92s       1.53m     0.03h    0.00d
 # Submission to last job:           181s       3.02m     0.05h    0.00d
 
     ssh kkstore01
     cd /cluster/data/mm8/linSpecRep
     mkdir notInHuman notInRat notInDog notInCow notInRabbit
     for F in chr*.out_homo-sapiens*
     do
 	B=${F/.fa.out*/}
 	echo $B 
         /cluster/bin/scripts/extractRepeats 1 ${F} > \
 		notInHuman/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 2 ${F} > \
 		notInRat/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 3 ${F} > \
 		notInDog/${B}.out.spec
         /cluster/bin/scripts/extractRepeats 4 ${F} > \
 		notInCow/${B}.out.spec
         XXXXX /cluster/bin/scripts/extractRepeats 4 ${F} > \ XXXXX
 		notInRabbit/${B}.out.spec XXXXX
     done
    # NOTE: rabbit should be column 5 instead of 4.
    # This isn't a problem, as we're not using rabbit anyway (see below)
 
     #	the notInHuman, notInDog, notInCow and notInRabbit ended up being
     #	identical.  Only the notInRat differed from them.
     #	To check identical
     find . -name "*.out.spec" | \
 	while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \
 	| sort -k1,1n | sort -t"/" -k3,3
     #	Copy to scratch/hg for use in kluster runs
     mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep
     mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
     mkdir /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
     cp -p notInHuman/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
     cp -p notInRat/* /cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
     #	Request this scratch/hg/mm8 directory push to the kk nodes
 
     #	and we can do the Iservers simply:
     ssh kkr1u00
     cd /iscratch/i/mm8
     #	no longer need these two directories
     rm -fr fa rmsk
     rsync -a --progress /cluster/bluearc/scratch/hg/mm8/ .
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/mm8/ kkr${R}u00:/iscratch/i/mm8/
     done
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2006-02-16 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm8", "blat17", "17784", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm8", "blat17", "17785", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 #########################################################################
 # CPGISLANDS (DONE - 2006-02-16 - Hiram)
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/cpgIsland
     cd /cluster/data/mm8/bed/cpgIsland
 
     # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
     cvs co hg3rdParty/cpgIslands
     cd hg3rdParty/cpgIslands
     make
     #	gcc readseq.c cpg_lh.c -o cpglh.exe
     cd ../..
     ln -s hg3rdParty/cpgIslands/cpglh.exe .
     
     # cpglh.exe requires hard-masked (N) .fa's.  
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     ssh kkstore01
     cd /cluster/data/mm8/bed/cpgIsland
     for F in ../../*/chr*.fa.masked
     do
 	FA=${F/*\/}
 	C=${FA/.fa.masked/}
 	echo "./cpglh.exe ${FA} > ${C}.cpg"
 	./cpglh.exe ${F} > ${C}.cpg
     done > cpglh.out 2>&1 &
     #	about 3 minutes 20 seconds
 
     #	Several chroms have 0 results:
     #	-rw-rw-r--  1     0 Feb 16 15:19 chr10_random.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:20 chr15_random.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:22 chr8_random.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:22 chr9_random.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:22 chrM.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:22 chrX_random.cpg
     #	-rw-rw-r--  1     0 Feb 16 15:22 chrY.cpg
 
     # Transform cpglh output to bed +
     cat << '_EOF_' > filter.awk
 # filter.awk - rewrite each cpglh output line as one tab-separated bed row.
 {
 # Shift the start coordinate down by one, then compute the island width
 # from the adjusted start.
 $2 = $2 - 1;
 width = $3 - $2;
 # Columns written: $1, start, $3, "$5 $6" (one field), width, $6,
 # width*$7/100 (no decimals), 200*$6/width (one decimal), $7, $9.
 printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
        $1, $2, $3, $5,$6, width,
        $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
 }
 '_EOF_'
     #	<< happy emacs
     awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/cpgIsland
     hgLoadBed -strict mm8 cpgIslandExt -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
     #	Reading cpgIsland.bed
     #	Loaded 15963 elements of size 10
     featureBits mm8 cpgIslandExt
     #	10456823 bases of 2567283971 (0.407%) in intersection
     featureBits mm7 cpgIslandExt
     #	10439328 bases of 2583394090 (0.404%) in intersection
     featureBits mm6 cpgIslandExt
     #	10432360 bases of 2597150411 (0.402%) in intersection
     featureBits mm5 cpgIslandExt
     #	10422989 bases of 2615483787 (0.399%) in intersection
     featureBits mm4 cpgIsland
     #	11109692 bases of 2627444668 (0.423%) in intersection
     featureBits mm3 cpgIsland
     #	10102968 bases of 2505900260 (0.403%) in intersection
 
 #########################################################################
 # ANDY LAW CPGISLANDS (DONE - 2006-02-16 - Hiram)
     # See notes in makeGalGal2.doc and makeCanFam2.doc
     ssh kkstore01
     mkdir /cluster/data/mm8/bed/cpgIslandGgfAndy
     cd /cluster/data/mm8/bed/cpgIslandGgfAndy
 
     #	Build the preProcGgfAndy program in
     #	kent/src/oneShot/preProcGgfAndy into your ~/bin/$MACHTYPE
 
     # Use masked sequence since this is a mammal...
     for F in ../../*/chr*.fa.masked
     do
 	FA=${F/*\/}
 	C=${FA/.fa.masked/}
 	echo preproc and run on masked "${C} ${F}" 1>/dev/stderr
 	~/bin/$MACHTYPE/preProcGgfAndy ${F} \
 	| /cluster/home/angie/ggf-andy-cpg-island.pl \
 	| perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g1,$oE) = split("\t"); $s--;
                    $gc=$c+$g1;  $pCpG=(100.0 * 2 * $cpg / $n);
                    $pGc=(100.0 * $gc / $n);
                    $_="'${C}'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" .
                         "$pCpG\t$pGc\t$oE\n";'
     done | sort -k1,1 -k2,2n > cpgIslandGgfAndyMasked.bed
 
     # load into database:
     ssh hgwdev
     cd /cluster/data/mm8/bed/cpgIslandGgfAndy
     sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
       $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
     hgLoadBed -strict mm8 cpgIslandGgfAndyMasked -tab -noBin \
       -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
     #	Loaded 67442 elements of size 10
     featureBits mm8 cpgIslandExt
     #	10456823 bases of 2567283971 (0.407%) in intersection
     featureBits mm7 cpgIslandExt
     #	10439328 bases of 2583394090 (0.404%) in intersection
     featureBits mm8 cpgIslandGgfAndyMasked
     #	38850121 bases of 2567283971 (1.513%) in intersection
     featureBits mm7 cpgIslandGgfAndyMasked
     #	38774242 bases of 2583394090 (1.501%) in intersection
     wc -l ../cpgIsland/cpgIsland.bed *bed
     #	15963 ../cpgIsland/cpgIsland.bed
     #	67442 cpgIslandGgfAndyMasked.bed
 
 #########################################################################
 # BLASTZ HUMAN Hg18 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzHg18.2006-02-16
     cd /cluster/data/mm8/bed
     ln -s blastzHg18.2006-02-16 blastz.hg18
     cd blastzHg18.2006-02-16
     #	Started this before the rsync to /scratch/hg/mm8/ had completed,
     #	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
     #	here.
 
     cat << '_EOF_' > DEF
 # mouse vs human
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
 SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Human Hg18 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/scratch/hg/hg18/nib
 SEQ2_SMSK=/scratch/hg/hg18/linSpecRep/notInMouse
 SEQ2_LEN=/scratch/hg/hg18/chrom.sizes
 SEQ2_CHUNK=300000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzHg18.2006-02-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	Started 2006-02-16 16:15
     #	failed due to pk node difficulties, finish the run.blastz
     #	manually
 # Completed: 3724 of 3724 jobs
 # CPU time in finished jobs:    5190293s   86504.89m  1441.75h   60.07d  0.165 y
 # IO & Wait Time:                259150s    4319.16m    71.99h    3.00d  0.008 y
 # Average job time:                1463s      24.39m     0.41h    0.02d
 # Longest finished job:           10621s     177.02m     2.95h    0.12d
 # Submission to last job:         74153s    1235.88m    20.60h    0.86d
 
     #	continuing
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     #	Done 2006-02-17 15:02
 
     #	Then to swap over to Hg18
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > swap.out 2>&1 &
     #	Started 2006-02-17 15:30
 
     ssh hgwdev
     time nice -n +19 featureBits mm8 chainHg18Link
     #	984380268 bases of 2567283971 (38.343%) in intersection
     time nice -n +19 featureBits hg18 chainMm8Link
     #	994530182 bases of 2881515245 (34.514%) in intersection
 
 #########################################################################
 # BLASTZ RAT Rn4 (DONE - 2006-02-16 - 2006-02-18 - Hiram)
     ssh kkr1u00
     cd /iscratch/i/rn4
     rsync -a --progress /cluster/data/rn4/linSpecRep.notInMouse/ \
 	./linSpecRep.notInMouse
     rsync -a --progress /cluster/data/rn4/nib/ ./nib/
     cp -p /cluster/data/rn4/chrom.sizes .
     for R in 2 3 4 5 6 7 8
     do
 	rsync -a --progress /iscratch/i/rn4/ kkr${R}u00:/iscratch/i/rn4/
     done
 
     ssh kk
     mkdir /cluster/data/mm8/bed/blastzRn4.2006-02-16
     cd /cluster/data/mm8/bed
     ln -s blastzRn4.2006-02-16 blastz.rn4
     cd blastzRn4.2006-02-16
     #	Started this before the rsync to /scratch/hg/mm8/ had completed,
     #	hence the /cluster/bluearc/scratch/hg/mm8/ location is used
     #	here.
 
     cat << '_EOF_' > DEF
 # mouse vs rat
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/cluster/bluearc/scratch/hg/mm8/nib
 SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInRat
 SEQ1_LEN=/cluster/bluearc/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/iscratch/i/rn4/nib
 SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
 SEQ2_LEN=/iscratch/i/rn4/chrom.sizes
 SEQ2_CHUNK=300000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzRn4.2006-02-16
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	Started 2006-02-16 16:15
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainRn4Link
     #	1770319811 bases of 2567283971 (68.957%) in intersection
     time nice -n +19 featureBits rn4 chainMm8Link
     #	1791093685 bases of 2571531505 (69.651%) in intersection
 
 ##############################################################################
 # CLONE ENDS - BACEND TRACK (DONE - 2006-02-17 - Hiram)
     ssh kkstore01
     cd /cluster/data/mm8
     # check disk space: 73Gb free
     df -h .
 # Filesystem            Size  Used Avail Use% Mounted on
 # /export/cluster/store5
 #                       1.5T  1.3T   73G  95% /cluster/store5
     mkdir -p bed/cloneend/ncbi
     cd bed/cloneend/ncbi
 
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*
 
     cd /cluster/data/mm8/bed/cloneend
     # seems like the *.mfa files were split just for convenience
     # concatenate
     for F in ncbi/*.mfa.gz
     do
 	zcat ${F}
     done | gzip > all.mfa.gz
 
     # Convert the title line of the all.mfa file
     cat << '_EOF_' > convert.pl
 #!/usr/bin/env perl
 # convert.pl - simplify FASTA header lines read from stdin or file arguments.
 # Sequence lines are passed through unchanged.  Each header line is split
 # on '|'; the field following the first "gb" or "dbj" field is taken as the
 # accession, anything after its first '.' is dropped, and the header is
 # rewritten as ">accession".  Dies on a header with no "gb"/"dbj" field.
 
 use strict;
 use warnings;
 
 while (my $line = <>) {
     if ($line !~ m/^>/) {
 	print $line
     } else {
         my @fields = split('\|', $line);
 	my $fieldCount = scalar(@fields);
         my $printed = 0;
         for (my $i = 0; $i < $fieldCount; $i++) {
                 if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
                         # keep only the part before the first '.'
                         (my $name, my $vers) = split(/\./,$fields[$i+1]);
                         print ">$name\n";
                         # force the loop to end after the first match
                         $i= $fieldCount;
                         $printed = 1;
                 }
         }
         if (!$printed) {
                 die("Failed for $line\n");
         }
     }
 }
 '_EOF_'
     # << happy emacs
     chmod +x convert.pl
     zcat all.mfa.gz | ./convert.pl | gzip > cloneEnds.fa.gz
 
     #	make sure nothing got broken:
     faSize all.mfa.gz
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
 # lower) in 789466 sequences in 1 files
 
     faSize cloneEnds.fa.gz
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
 # lower) in 789466 sequences in 1 files
     #	identical numbers, curiously, these are exactly the same numbers
     #	as were seen during the build of Mm7.  Do these things not
     #	change with time ?
 
     # concatenate the text files, too
     for F in ncbi/*.txt.gz
     do
 	zcat ${F}
     done | gzip > all.txt.gz
 
     # generate cloneEndPairs.txt and cloneEndSingles.txt
     cp -p /cluster/data/mm7/bed/cloneend/convertTxt.pl .
     zcat all.txt.gz | ./convertTxt.pl stdin
     # Reading in end info
     # Writing out pair info
     # Writing out singleton info
     # 354485 pairs and 78423 singles
 
     #	faSplit does not function correctly if given a .gz source file
     #	AND, we need the unzipped file for sequence loading below
     gunzip cloneEnds.fa.gz
     # split
     mkdir splitdir
     cd splitdir
     faSplit sequence ../cloneEnds.fa 100 cloneEnds
     #	Check to ensure no breakage:
     cat *.fa | faSize stdin
 # 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
 # lower) in 789466 sequences in 1 files
     #	same numbers as before
 
     #	Copy to san for cluster runs
     mkdir /san/sanvol1/scratch/mm8/cloneEnds
     cp -p *.fa /san/sanvol1/scratch/mm8/cloneEnds
     rm *
     cd ..
     rmdir splitdir
     #	may as well remove the previous assembly copy:
     rm -fr /san/sanvol1/scratch/mm7/cloneEnds
 
     # load sequences
     ssh hgwdev
     mkdir /gbdb/mm8/cloneend
     cd /gbdb/mm8/cloneend
     ln -s /cluster/data/mm8/bed/cloneend/cloneEnds.fa .
     cd /tmp
     hgLoadSeq mm8 /gbdb/mm8/cloneend/cloneEnds.fa
     #  Advisory lock created
     # Creating .tab file
     # Adding /gbdb/mm8/cloneend/cloneEnds.fa
     # 789466 sequences
     # Updating seq table
     # Advisory lock has been released
     # All done
 
 ############################################################################
 # BACEND SEQUENCE ALIGNMENTS (DONE - 2006-02-17 - 2006-02-22 - Hiram)
     ssh kkstore01
     mkdir /cluster/data/mm8/noMask
     cd /cluster/data/mm8/
     #	Need an unmasked sequence for this work
     for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
     do
 	C=`basename ${CHR}`
 	echo -n "working ${C} ... "
 	head -1 ${CHR} > noMask/${C}
 	tail +2 ${CHR} | tr [:lower:] [:upper:] >> noMask/${C}
 	echo "done"
     done
     mkdir /san/sanvol1/scratch/mm8/noMask
     time cp --verbose -p noMask/chr*.fa /san/sanvol1/scratch/mm8/noMask
 
     #	Size of mouse non-gap genome: 2567283971
     #	Size of  Hg18 non-gap genome: 2881515245
     #	Adjusting the 1024 number from typical human ooc generation:
     #	1024 * (2567283971 / 2881515245) = 912
 
     time blat mm8.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
     #	Wrote 29643 overused 11-mers to 11.ooc
     #	real    2m13.206s
     # Copy over to the san
     cp -p 11.ooc /san/sanvol1/scratch/mm8
 
     
     #	and for the kluster run
     ssh pk
     mkdir /cluster/data/mm8/bed/bacends
     cd /cluster/data/mm8/bed/bacends
     mkdir out
 
     # allow blat to run politely in /tmp while it writes output, then
     # copy results to results file:
     cat << '_EOF_' > runBlat.sh
 #!/bin/sh
 # runBlat.sh - blat one cloneEnds split file against one unmasked chrom.
 #   $1 : root name of the target file in /san/sanvol1/scratch/mm8/noMask
 #   $2 : root name of the query file in /san/sanvol1/scratch/mm8/cloneEnds
 #   $3 : path of the final .psl result
 # NOTE(review): pushd/popd are not POSIX sh builtins; this presumably
 # relies on /bin/sh being bash on the cluster nodes - confirm.
 root1=$1
 root2=$2
 result=$3
 # start from a clean per-job scratch directory
 rm -fr /scratch/tmp/${root1}_${root2}
 mkdir /scratch/tmp/${root1}_${root2}
 pushd /scratch/tmp/${root1}_${root2}
 # blat writes its output into the local scratch directory
 /cluster/bin/x86_64/blat /san/sanvol1/scratch/mm8/noMask/${root1}.fa \
 	/san/sanvol1/scratch/mm8/cloneEnds/${root2}.fa \
 	-ooc=/san/sanvol1/scratch/mm8/11.ooc ${root1}.${root2}.psl
 popd
 # back in the starting directory: move the result into place and clean up
 mkdir -p out/${root2}
 rm -f ${result}
 mv /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
 rm -fr /scratch/tmp/${root1}_${root2}
 '_EOF_'
     #	<< happy emacs
     chmod +x runBlat.sh
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat.sh $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << emacs happy
 
     ls -1S /san/sanvol1/scratch/mm8/cloneEnds/cloneEnds???.fa > bacEnds.lst
     ls -1S /san/sanvol1/scratch/mm8/noMask/chr*.fa > contig.lst
     gensub2 contig.lst bacEnds.lst template jobList
     para create jobList
     # 3332 jobs written to batch
     para try, check, push, etc ...
 # Completed: 3332 of 3332 jobs
 # CPU time in finished jobs:     649465s   10824.42m   180.41h    7.52d  0.021 y
 # IO & Wait Time:                 11633s     193.88m     3.23h    0.13d  0.000 y
 # Average job time:                 198s       3.31m     0.06h    0.00d
 # Longest finished job:            1326s      22.10m     0.37h    0.02d
 # Submission to last job:        429201s    7153.35m   119.22h    4.97d
 
     ssh kkstore01
     cd /cluster/data/mm8/bed/bacends
     screen
 
     mkdir temp
     time pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
     #	real    22m4.019s
     #	-rw-rw-r--    1 8422362557 Feb 22 15:35 raw.psl
 
     time pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 -noIntrons \
 	raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
     #	real    6m15.981s
     #	-rw-rw-r--    1  197029888 Feb 22 15:37 bacEnds.psl
 
     #	utilize the scripts from the previous build
     cp -p /cluster/data/mm7/bed/bacends/split.pl .
     cp -p /cluster/data/mm7/bed/bacends/header .
 
     time ./split.pl header < bacEnds.psl
     #	real    0m26.983s
 
     mv bacEnds.psl bacEnds.psl.save
     time pslSort dirs bacEnds.psl temp split
     #	real    2m19.131s
     #	-rw-rw-r--    1 1227866614 Feb 22 15:48 bacEnds.psl
 
     # Copy files to final destination and remove
     mkdir /cluster/data/mm8/bacends
     cp -p bacEnds.psl /cluster/data/mm8/bacends
 
 ############################################################################
 # BACEND PAIRS TRACK (DONE - 2006-02-22 - Hiram)
 
     ssh kolossus
     cd /cluster/data/mm8/bacends
 
 time /cluster/bin/x86_64/pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
 -max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
 -mismatch -verbose bacEnds.psl \
 	../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
     #	real    0m47.401s
 
 
     # create header required by "rdb" tools
     echo -e \
 "chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
     echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header
 
     cat header bacEnds.pairs | \
 	/cluster/bin/scripts/row score ge 300 | \
 	/cluster/bin/scripts/sorttbl chr start | \
 	/cluster/bin/scripts/headchg -del > bacEndPairs.bed
     #	-rw-rw-r--  1   23816801 Feb 22 15:52 bacEndPairs.bed
 
 
     cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
 	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
 	/cluster/bin/scripts/sorttbl chr start | \
 	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
     #	-rw-rw-r--  1    6843775 Feb 22 15:54 bacEndPairsBad.bed
 
     /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
 	bacEndPairsBad.bed >j1.out
     cat j1.out| /cluster/bin/scripts/sorttbl tname tstart >j2.out
     cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
     #	-rw-rw-r--  1  983668200 Feb 22 16:04 bacEnds.load.psl
 
     rm j1.out j2.out
 
     #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
     awk '{print $5}' bacEndPairs.bed | sort -u
     #	result should be the scores, no extraneous strings:
 #	1000
 #	300
 #	375
 #	500
 #	750
     #	edit the file and fix it if it has a bad name.
 
     # load into database
     ssh hgwdev
     cd /cluster/data/mm8/bacends
     hgLoadBed -strict -notItemRgb mm8 bacEndPairs bacEndPairs.bed \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
     #	Loaded 235440 elements of size 11
 
     # note - this track isn't pushed to RR, just used for assembly QA
     hgLoadBed -strict -notItemRgb mm8 bacEndPairsBad bacEndPairsBad.bed \
 	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
     #	Loaded 95099 elements of size 11
 
     # NOTE: truncates file to 0 if -nobin is used
     time hgLoadPsl mm8 -table=all_bacends bacEnds.load.psl
 # load of all_bacends did not go as planned: 8132116 record(s), 0 row(s)
 # skipped, 1 warning(s) loading psl.tab
 #	real    20m45.055s
 
     featureBits mm8 all_bacends
 # 327086559 bases of 2567283971 (12.741%) in intersection
     featureBits mm7 all_bacends
 # 334161740 bases of 2583394090 (12.935%) in intersection
     featureBits mm6 all_bacends
 # 336981828 bases of 2597150411 (12.975%) in intersection
     featureBits mm5 all_bacends
 # 268502414 bases of 2615483787 (10.266%) in intersection
     featureBits mm4 all_bacends
 # 243096171 bases of 2627444668 (9.252%) in intersection
 
     featureBits mm8 bacEndPairs
 # 2572527283 bases of 2567283971 (100.204%) in intersection
     featureBits mm7 bacEndPairs
 # 2578837424 bases of 2583394090 (99.824%) in intersection
     featureBits mm6 bacEndPairs
 # 2570768812 bases of 2597150411 (98.984%) in intersection
     featureBits mm5 bacEndPairs
 # 2567958504 bases of 2615483787 (98.183%) in intersection
     featureBits mm4 bacEndPairs
 # 2549945356 bases of 2627444668 (97.050%) in intersection
 
     featureBits mm8 bacEndPairsBad
 # 879222026 bases of 2567283971 (34.247%) in intersection
     featureBits mm7 bacEndPairsBad
 # 954662115 bases of 2583394090 (36.954%) in intersection
     featureBits mm6 bacEndPairsBad
 # 1006314997 bases of 2597150411 (38.747%) in intersection
     featureBits mm5 bacEndPairsBad
 # 541027882 bases of 2615483787 (20.686%) in intersection
     featureBits mm4 bacEndPairsBad
 # 1074505863 bases of 2627444668 (40.895%) in intersection
 
 #########################################################################
 # GENBANK auto update (DONE - 2006-02-17 - 2006-02-23 - Hiram)
     # align with revised genbank process. drop xeno ESTs.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvs update -d -P etc
     # edit etc/genbank.conf to add mm8, it is a copy of mm7 with changes:
 
 # mm8
 mm8.serverGenome = /cluster/data/mm8/mm8.2bit
 mm8.clusterGenome = /scratch/hg/mm8/mm8.2bit
 mm8.ooc = /cluster/data/mm8/11.ooc
 mm8.align.unplacedChroms = chrUn_random
 mm8.lift = /cluster/data/mm8/jkStuff/liftAll.lft
 mm8.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
 mm8.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
 mm8.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
 mm8.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
 mm8.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
 mm8.downloadDir = mm8
 mm8.refseq.mrna.xeno.load  = yes
 mm8.refseq.mrna.xeno.loadDesc = yes
 mm8.mgcTables.default = full
 mm8.mgcTables.mgc = all
 
     #	check that into CVS, then
     # update /cluster/data/genbank/
     make etc-update
 
     ssh kkstore04
     cd /cluster/data/genbank
     nice bin/gbAlignStep -initial mm8 &
     #	var/build/logs/2006.02.17-16:10:17.mm8.initalign.log
     #	the parasol batch job on kk broke down in:
     #	/cluster/bluearc/genbank/work/initial.mm8/align
     #	go to kk and this directory and get the batch finished
     nice bin/gbAlignStep -continue=finish -initial mm8 &
     #	var/build/logs/2006.02.22-20:26:54.mm8.initalign.log
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     nice ./bin/gbDbLoadStep -drop -initialLoad  mm8 &
     #	var/dbload/hgwdev/logs/2006.02.23-10:21:36.dbload.log
     #	real    228m59.734s
 
 #########################################################################
 # BLASTZ rheMac2 (DONE - 2006-02-17 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
     cd /cluster/data/mm8/bed
     ln -s blastz.rheMac2.2006-02-17 blastz.rheMac2
     cd blastz.rheMac2
 
     cat << '_EOF_' > DEF
 # mouse vs macaca mulatta
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
 
 ALIGN=blastz-run
 BLASTZ=blastz.v7.x86_64
 
 # TARGET - mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY - macaca mulatta - big enough chunk to do whole chroms at once
 SEQ2_DIR=/san/sanvol1/scratch/rheMac2/rheMac2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/rheMac2/rheMac2.sizes 
 SEQ2_CHUNK=250000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastz.rheMac2.2006-02-17
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	Started 2006-02-17 16:42
     #	crashed due to no copies of mm8 in /scratch/hg/mm8/ on the
     #	Iservers.  Fix that up and get the chain run done.  Continuing.
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
     #	failed during a san hiccup,  finish that off, then:
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap -continue=load `pwd`/DEF > swap.load.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainRheMac2Link
     #	891310108 bases of 2567283971 (34.718%) in intersection
     time nice -n +19 featureBits rheMac2 chainMm8Link
     #	877906099 bases of 2646704109 (33.170%) in intersection
 
 #########################################################################
 # BLASTZ canFam2 (DONE - 2006-02-18 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.canFam2.2006-02-18
     cd /cluster/data/mm8/bed
     ln -s blastz.canFam2.2006-02-18 blastz.canFam2
     cd blastz.canFam2
 
     cat << '_EOF_' > DEF
 # mouse vs dog
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Dog CanFam2 - chunk big enough to do all chroms in single whole pieces
 SEQ2_DIR=/scratch/hg/canFam2/nib
 SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
 SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzCanFam2.2006-02-18
 TMPDIR=/scratch/tmp
 '_EOF_'
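     #	<< happy emacs
     #	NOTE(review): BASE in the DEF above names blastzCanFam2.2006-02-18,
     #	but the directory created for this run is blastz.canFam2.2006-02-18;
     #	confirm which path was actually used.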
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainCanFam2Link
     #	828741604 bases of 2567283971 (32.281%) in intersection
     time nice -n +19 featureBits canFam2 chainMm8Link
     #	816262344 bases of 2384996543 (34.225%) in intersection
 
 #########################################################################
 # BLASTZ bosTau2 (DONE - 2006-02-18 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.bosTau2.2006-02-18
     cd /cluster/data/mm8/bed
     ln -s blastz.bosTau2.2006-02-18 blastz.bosTau2
     cd blastz.bosTau2
 
     cat << '_EOF_' > DEF
 # mouse vs cow
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Cow (bosTau2)
 #  large enough chunk to do chroms in one piece
 SEQ2_DIR=/scratch/hg/bosTau2/bosTau2.noBin0.2bit
 SEQ2_LEN=/scratch/hg/bosTau2/noBin0.sizes
 SEQ2_CHUNK=150000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzBosTau.2006-02-18
 TMPDIR=/scratch/tmp
 '_EOF_'
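     #	<< happy emacs
     #	NOTE(review): BASE in the DEF above names blastzBosTau.2006-02-18,
     #	but the directory created for this run is blastz.bosTau2.2006-02-18;
     #	confirm which path was actually used.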
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainBosTau2Link
     #	688859641 bases of 2567283971 (26.832%) in intersection
     time nice -n +19 featureBits bosTau2 chainMm8Link
     #	683178156 bases of 2812203870 (24.293%) in intersection
 
 #########################################################################
 # BLASTZ galGal2 (DONE - 2006-02-18 - Hiram)
     ssh kk
     mkdir /cluster/data/mm8/bed/blastz.galGal2.2006-02-18
     cd /cluster/data/mm8/bed
     ln -s blastz.galGal2.2006-02-18 blastz.galGal2
     cd blastz.galGal2
 
     cat << '_EOF_' > DEF
 # mouse vs chicken
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal2 - single chunk big enough for whole chroms at once
 SEQ2_DIR=/scratch/hg/galGal2/nib
 SEQ2_LEN=/scratch/hg/galGal2/chrom.sizes
 SEQ2_SMSK=/scratch/hg/galGal2/linSpecRep
 SEQ2_IN_CONTIGS=0
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzGalGal2.2006-02-18
 TMPDIR=/scratch/tmp
 '_EOF_'
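     #	<< happy emacs
     #	NOTE(review): BASE in the DEF above names blastzGalGal2.2006-02-18,
     #	but the directory created for this run is blastz.galGal2.2006-02-18;
     #	confirm which path was actually used.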
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net `pwd`/DEF > net.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainGalGal2Link
     #	65517358 bases of 2567283971 (2.552%) in intersection
     time nice -n +19 featureBits galGal2 chainMm8Link
     #	57074100 bases of 1054197620 (5.414%) in intersection
 
 #########################################################################
 # BLASTZ dasNov1 (DONE - 2006-02-19 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.dasNov1.2006-02-19
     cd /cluster/data/mm8/bed
     ln -s blastz.dasNov1.2006-02-19 blastz.dasNov1
     cd blastz.dasNov1
 
     cat << '_EOF_' > DEF
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 
 # QUERY - Armadillo dasNov1
 SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
 SEQ2_LEN=/scratch/hg/dasNov1/chrom.sizes
 SEQ2_LIMIT=100
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzDasNov1.2006-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainDasNov1Link
     #	431944142 bases of 2567283971 (16.825%) in intersection
 
 #########################################################################
 # BLASTZ echTel1 (DONE - 2006-02-19 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.echTel1.2006-02-19
     cd /cluster/data/mm8/bed
     ln -s blastz.echTel1.2006-02-19 blastz.echTel1
     cd blastz.echTel1
 
     cat << '_EOF_' > DEF
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 
 # QUERY - Tenrec echTel1
 SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
 SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
 SEQ2_LIMIT=100
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzEchTel1.2006-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=chainRun `pwd`/DEF > chain.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainEchTel1Link
     #	292970406 bases of 2567283971 (11.412%) in intersection
 
 #########################################################################
 # BLASTZ fr1 (DONE - 2006-02-19 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.fr1.2006-02-19
     cd /cluster/data/mm8/bed
     ln -s blastz.fr1.2006-02-19 blastz.fr1
     cd blastz.fr1
 
     cat << '_EOF_' > DEF
 # mouse vs. fugu
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
  
 BLASTZ=blastz.v7
 
 # Reuse parameters from human-chicken, except L=6000 (more relaxed)
 BLASTZ_H=2000
 BLASTZ_Y=3400 
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Fugu - chunk big enough to run the whole chrom at once
 SEQ2_DIR=/san/sanvol1/scratch/fr1/nib
 SEQ2_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
 SEQ2_CHUNK=400000000
 SEQ2_LAP=0
  
 BASE=/cluster/data/mm8/bed/blastzFr1.2006-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainFr1Link
     #	48949500 bases of 2567283971 (1.907%) in intersection
     time nice -n +19 featureBits fr1 chainMm8Link
     #	42671288 bases of 315518167 (13.524%) in intersection
 
 #########################################################################
 # BLASTZ loxAfr1 (DONE - 2006-02-19 - Hiram)
     ssh kk
     mkdir /cluster/data/mm8/bed/blastz.loxAfr1.2006-02-19
     cd /cluster/data/mm8/bed
     ln -s blastz.loxAfr1.2006-02-19 blastz.loxAfr1
     cd blastz.loxAfr1
 
     cat << '_EOF_' > DEF
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/parasol/bin
 
 BLASTZ=blastz.v7
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=30000000
 SEQ1_LAP=10000
 
 # QUERY - Elephant loxAfr1
 SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
 SEQ2_LEN=/scratch/hg/loxAfr1/chrom.sizes
 SEQ2_LIMIT=100
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzLoxAfr1.2006-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	failed during the cat, fixed the script
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=chainRun `pwd`/DEF > chain.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainLoxAfr1Link
     #	472168702 bases of 2567283971 (18.392%) in intersection
 
 #########################################################################
 # BLASTZ tetNig1 (DONE - 2006-02-19 - Hiram)
     ssh kk
     mkdir /cluster/data/mm8/bed/blastz.tetNig1.2006-02-19
     cd /cluster/data/mm8/bed
     ln -s blastz.tetNig1.2006-02-19 blastz.tetNig1
     cd blastz.tetNig1
 
     cat << '_EOF_' > DEF
 # Mouse vs tetraodon
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7
 
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Tetraodon TetNig1 - single chunk big enough to run whole chroms
 SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.2bit
 SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzTetNig1.2006-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net `pwd`/DEF > net.out 2>&1 &
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap -continue=net `pwd`/DEF > swap-net.out 2>&1 &
 
 
     time nice -n +19 featureBits mm8 chainTetNig1Link
     #	50358792 bases of 2567283971 (1.962%) in intersection
     time nice -n +19 featureBits tetNig1 chainMm8Link
     #	47024263 bases of 342403326 (13.734%) in intersection
 
 #########################################################################
 # BLASTZ oryCun1 (DONE - 2006-02-21 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.oryCun1.2006-02-21
     cd /cluster/data/mm8/bed
     ln -s blastz.oryCun1.2006-02-21 blastz.oryCun1
     cd blastz.oryCun1
 
     cat << '_EOF_' > DEF
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY - Rabbit oryCun1
 SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
 SEQ2_LEN=/scratch/hg/oryCun1/chrom.sizes
 SEQ2_LIMIT=100
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzOryCun1.2006-02-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainOryCun1Link
     #	496060619 bases of 2567283971 (19.322%) in intersection
 
 #########################################################################
 # BLASTZ xenTro1 (DONE - 2006-02-21 - Hiram)
     ssh kk
     mkdir /cluster/data/mm8/bed/blastz.xenTro1.2006-02-21
     cd /cluster/data/mm8/bed
     ln -s blastz.xenTro1.2006-02-21 blastz.xenTro1
     cd blastz.xenTro1
 
     cat << '_EOF_' > DEF
 # mouse vs. frog
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7
 
 # Specific settings for chicken (per Webb email to Brian Raney), used here for frog
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Frog xenTro1 - single chunk big enough to run two of the
 #               largest scaffolds in one job
 SEQ2_DIR=/scratch/hg/xenTro1/xenTro1.2bit
 SEQ2_LEN=/scratch/hg/xenTro1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/cluster/data/mm8/bed/blastzXenTro1.2006-02-21
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainXenTro1Link
     #	62015601 bases of 2567283971 (2.416%) in intersection
     time nice -n +19 featureBits xenTro1 chainMm8Link
     #	59307185 bases of 1381238994 (4.294%) in intersection
 
 #########################################################################
 # BLASTZ monDom4 (DONE - 2006-02-23 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.monDom4.2006-02-23
     cd /cluster/data/mm8/bed
     ln -s blastz.monDom4.2006-02-23 blastz.monDom4
     cd blastz.monDom4
 
     cat << '_EOF_' > DEF
 # Mouse vs. opossum
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # Specific settings for chicken (per Webb email to Brian Raney), used here for opossum
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_M=20
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse (mm8)
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=100000000
 SEQ1_LAP=10000
 
 # QUERY: Opossum monDom4
 SEQ2_DIR=/cluster/bluearc/scratch/hg/monDom4/monDom4.2bit
 SEQ2_LEN=/cluster/bluearc/scratch/hg/monDom4/chrom.sizes
 SEQ2_CHUNK=50000000
 SEQ2_LIMIT=100
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzMonDom4.2006-02-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainMonDom4Link
     #	211663336 bases of 2567283971 (8.245%) in intersection
     time nice -n +19 featureBits monDom4 chainMm8Link
     #	210933035 bases of 3501643220 (6.024%) in intersection
 
     #	Something caused the loaded chains and nets on Mm8 to disappear.
     #	to reload them  (DONE - Hiram - 2006-07-18)
     #	recover the individual chain files
     ssh kkstore04
     cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
     nice chainSplit chain mm8.monDom4.all.chain.gz
     ssh hgwdev
     cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain/chain
     foreach f (*.chain)
 	set c = $f:r
 	echo hgLoadChain mm8 ${c}_chainMonDom4 $f
 	hgLoadChain mm8 ${c}_chainMonDom4 $f
     end
 
     time netFilter -minGap=10 mm8.monDom4.net.gz \
 	| hgLoadNet -verbose=0 mm8 netMonDom4 stdin
 
     #	clean up
     ssh kkstore04
     cd /cluster/data/mm8/bed/blastzMonDom4.2006-02-23/axtChain
     rm -fr chain
 
 #########################################################################
 # BLASTZ panTro1 (DONE - 2006-02-23 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.panTro1.2006-02-23
     cd /cluster/data/mm8/bed
     ln -s blastz.panTro1.2006-02-23 blastz.panTro1
     cd blastz.panTro1
 
     cat << '_EOF_' > DEF
 # mouse vs chimp
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_M=50
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chimp PanTro1
 SEQ2_DIR=/scratch/hg/panTro1/nib
 SEQ2_LEN=/scratch/hg/panTro1/chrom.sizes
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzPanTro1.2006-02-23
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainPanTro1Link
     #	901276629 bases of 2567283971 (35.106%) in intersection
     time nice -n +19 featureBits panTro1 chainMm8Link
     #	901976621 bases of 2733948177 (32.992%) in intersection
 
 #########################################################################
 # BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
     cd /cluster/data/mm8/bed
     ln -s blastzDanRer4.2006-04-26 blastz.danRer4
     cd blastz.danRer4
 
     cat << '_EOF_' > DEF
 # mouse vs zebrafish
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer4)
 #  large enough chunk to do complete chroms at once
 SEQ2_DIR=/san/sanvol1/scratch/danRer4/chromNib
 SEQ2_LEN=/san/sanvol1/scratch/danRer4/chromNib.sizes
 SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
 SEQ2_CHUNK=100000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-04-26
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     cd /cluster/data/mm8/bed/blastzDanRer4.2006-04-26
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net `pwd`/DEF > net.out 2>&1 &
 
     #	swap, see also makeDanRer4.doc
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
     time nice -n +19 featureBits mm8 chainDanRer4Link \
 	> fb.mm8.chainDanRer4Link 2>&1 &
     cat fb.mm8.chainDanRer4Link
     #	54036008 bases of 2567283971 (2.105%) in intersection
     time nice -n +19 featureBits danRer4 chainMm8Link \
 	> fb.danRer4.chainDanRer4Link 2>&1 &
     cat fb.danRer4.chainDanRer4Link
     #	58145856 bases of 1626093931 (3.576%) in intersection
 
 #########################################################################
 # BLASTZ danRer4 (DONE - 2006-04-26 - 2006-04-28 - Hiram)
 # REMAKE THIS USING ALL CHROMS FOR danRer4 (2006-05-22 -  ).
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
     cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
    # ln -s blastzDanRer4.2006-04-26 blastz.danRer4
 
     cat << '_EOF_' > DEF
 # mouse vs zebrafish
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer4)
 #  large enough chunk to do complete chroms at once
 SEQ2_DIR=/san/sanvol1/scratch/danRer4/nib
 SEQ2_LEN=/san/sanvol1/scratch/danRer4/chrom.sizes
 SEQ2_SMSK=/san/sanvol1/scratch/danRer4/linSpecRep.notInOthers
 SEQ2_CHUNK=300000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzDanRer4.2006-05-22
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
     chmod +x DEF
     cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF >& blastz.out &
     # 0.118u 0.107s 4:05:08.71 0.0%   0+0k 0+0io 0pf+0w
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-continue=net `pwd`/DEF >& net.out &
     # 0.121u 0.072s 4:48.04 0.0%      0+0k 0+0io 0pf+0w
     cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
     #	swap, see also makeDanRer4.doc
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF >& swap.out &
     # 0.129u 0.109s 5:02.55 0.0%      0+0k 0+0io 0pf+0w 
     ssh hgwdev
     cd /cluster/data/mm8/bed/blastzDanRer4.2006-05-22
     featureBits mm8 chainDanRer4Link >& fb.mm8.chainDanRer4Link &
     cat fb.mm8.chainDanRer4Link
     # 55147954 bases of 2567283971 (2.148%) in intersection
     featureBits danRer4 chainMm8Link >& fb.danRer4.chainDanRer4Link &
     cat fb.danRer4.chainDanRer4Link
     # 60721886 bases of 1626093931 (3.734%) in intersection
     featureBits -chrom=chr1 mm8 refGene:cds chainDanRer4Link -enrichment
     # refGene:cds 0.856%, chainDanRer4Link 1.867%, both 0.584%, 
     # cover 68.16%, enrich 36.51x
     featureBits -chrom=chr1 mm8 refGene:cds chainDanRer3Link -enrichment
     # refGene:cds 0.856%, chainDanRer3Link 1.760%, both 0.492%, cover 57.49%, 
     # enrich 32.67x
     featureBits -chrom=chr1 danRer4 refGene:cds chainMm8Link -enrichment
     # refGene:cds 0.746%, chainMm8Link 3.807%, both 0.566%, cover 75.86%, 
     # enrich 19.93x
     featureBits -chrom=chr1 danRer3 refGene:cds chainMm8Link -enrichment
     # refGene:cds 0.786%, chainMm8Link 4.581%, both 0.612%, cover 77.88%, 
     # enrich 17.00x
     # Higher coverage than for danRer3 chains on mm8 and similar coverage
     # for mm8 chains on danRer4 as on danRer3 so that is good.
  
 #########################################################################
 # BLASTZ danRer3 (DONE - 2006-02-28 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.danRer3.2006-02-28
     cd /cluster/data/mm8/bed
     ln -s blastz.danRer3.2006-02-28 blastz.danRer3
     cd blastz.danRer3
 
     cat << '_EOF_' > DEF
 # mouse vs zebrafish
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn/x86_64:/cluster/bin/x86_64:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 
 # Reuse parameters from hg16-fr1, danRer-hg17 and mm5-danRer
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: Zebrafish (danRer3)
 #  large enough chunk to do complete chroms at once
 SEQ2_DIR=/san/sanvol1/scratch/danRer3/chromNib
 SEQ2_LEN=/san/sanvol1/scratch/danRer3/chromNib.sizes
 SEQ2_CHUNK=100000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzDanRer3.2006-02-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	real    216m23.425s
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap `pwd`/DEF > swap.out 2>&1 &
 
 
     time nice -n +19 featureBits mm8 chainDanRer3Link
     #	53125783 bases of 2567283971 (2.069%) in intersection
     time nice -n +19 featureBits danRer3 chainMm8Link
     #	54831876 bases of 1630323462 (3.363%) in intersection
 
 #############################################################################
 # STS MARKERS DATA DOWNLOAD (DONE - 2006-02-23 - 2006-02-28 - Hiram)
 ###   *** PLEASE NOTE - STS markers redone 2006-08-29 - look for section:
 ##  redoing STS markers track to get them more correct
 ###	later in this file
     ssh kkstore01
     mkdir -p /cluster/data/mm8/bed/STSmarkers/downloads
     cd /cluster/data/mm8/bed/STSmarkers/downloads
     # these files appear to be new almost every day
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
 
     #	A new feature of the .aliases file this time is names with
     #	spaces in them !  This changes our parsing business below,
     #	hopefully the spaces in the names won't cause trouble elsewhere.
 
     wget --timestamping \
 ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*
 
     # these reports from jax.org appear to be changing daily
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
     wget --timestamping \
 	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
     ls -ogrt
 #	-rw-rw-r--  1      676 Mar 11  2004 README
 #	-rw-rw-r--  1   396858 Jan 28  2005 10090.MGI.txt
 #	-rw-rw-r--  1   390139 Mar 16  2005 10090.WI_MRC_RH.txt
 #	-rw-rw-r--  1   240688 Mar 16  2005 10090.WI-YAC.txt
 #	-rw-rw-r--  1   173344 Mar 16  2005 10090.WI-Genetic.txt
 #	-rw-rw-r--  1 25691253 Jan 13 16:42 UniSTS.aliases
 #	-rw-rw-r--  1  4140920 Feb 22 18:43 UniSTS_mouse.sts
 #	-rw-rw-r--  1  4576611 Feb 23 02:22 MRK_Dump2.rpt
 #	-rw-rw-r--  1  2549974 Feb 23 02:23 PRB_PrimerSeq.rpt
 #	-rw-rw-r--  1  4531489 Feb 23 02:23 MRK_Sequence.rpt
     #	 I note the UniSTS.aliases file is over twice as big as was in
     #	 Mm7 build.  I wonder what got into it ...
     #	What got into it was that it was completely broken.  It appeared
     #	to have a vast section of itself duplicated again in the file.
     #	It was cleaned up via:
     echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
     grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
     mv UniSTS.aliases UniSTS.aliases.broken
     mv uniqueSTS.aliases UniSTS.aliases
 
     # back to our work area, update the bed file
     #	to do this we need a new UniSTS_mouse.alias file
     # it is created by a combination of information from several
     # of the above files ! AND ! the previous stsInfoMouse.bed file
 
     cd /cluster/data/mm8/bed/STSmarkers/downloads
     cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.sh .
     cp -p /cluster/data/mm7/bed/STSmarkers/downloads/*.pl .
     #	There is a line in the fetchAllAliases.sh script that needs to
     #	be updated, it must point to the previous bed file:
     #   BEDFile=/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
     #	Next time, this should read:
     #   BEDFile=/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed
 
     #	This process has been captured in the script:
     #	/cluster/data/mm5/bed/STSmarkers/downloads/fetchAllAliases.sh
     # which uses a couple of perl scripts in that same directory.
     # briefly it is:
     
     # ./UniSTSParse.pl UniSTS_mouse.sts UniSTS.aliases > UniSTS_mouse_alias.0
     # grep MGI: UniSTS.aliases > MGI.aliases
     # ./stsInfoMouseParse.pl /cluster/store5/mouseMarker/stsInfoMouse.bed > \
     #	stsInfoAliases.txt
     # ./UniSTSParse.pl stsInfoAliases.txt UniSTS.aliases > stsInfo.aliases
     # cat UniSTS_mouse_alias.0 MGI.aliases stsInfo.aliases | sort -u \
     #    | sort -n > UniSTS_mouse.alias
 
     time ./fetchAllAliases.sh > fetchAllAliases.out 2>&1
 
     #	Here is a normal set of errors:
 # processing UniSTS_mouse.sts to find aliases
 # #       ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
 # #       2384
 # processing MGI.aliases
 # fetching existing aliases from previous stsInfoMouse.bed file
 # found 27648 potential errors in
 #	/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed
 # to see the errors: grep ERROR stsInfoAliases.txt
 # verify those stsInfoMouse.bed aliases with UniSTS.aliases
 
     #	those errors in the previous stsInfoMouse.bed file are an
     #	accumulation of errors from a long long time ago in this chain
     #	of processing.  Some day it might be nice to fix them, but they
     #	don't seem to bother anything, so they continue to be carried
     #	forward, and a couple of new ones are added with each assembly.
 
     # with that, we can create a new stsInfoMouse.bed file:
     #	Update the m m 7 directory name here to m m 8
     #	for the next build of m m 9
     cd /cluster/data/mm8/bed/STSmarkers
     /cluster/store5/mouseMarker/code/updateBed.pl \
 	/cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
 	downloads/MRK_Dump2.rpt downloads/PRB_PrimerSeq.rpt \
 	downloads/MRK_Sequence.rpt downloads/UniSTS_mouse.alias \
 	downloads/UniSTS_mouse.sts | sed -e "s/\t*$//" > newbedfile
 
     # Yontao updated /cluster/store5/mouseMarker/code/cleanInfo.pl 8/10/04
     /cluster/store5/mouseMarker/code/cleanInfo.pl newbedfile > stsInfoMouse.bed
 	
     # copy the stsInfoMouse.bed file from the working dir to the marker
     #	info storage directory (2 new steps added by Yontao).
     #	be wary of the archive name here, check the directory and get
     #	the name right here.
     mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
     cp -p stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
 
     # comparing to previous, numbers increase slightly each time
     wc /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
     #	60440   801181  6871232 /cluster/store5/mouseMarker/stsInfoMouse.bed
     #	59843   794642  6802825 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
     #	58980   784786  6690105 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
     #	58493   778055  6524821 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
 
     # and from that, create new primer fa, epcr, etc:
     /cluster/store5/mouseMarker/code/luConvertPrimerToFa \
 	stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
     # the mouseC.fa file will be empty; the mouseP.* counts should be more than last time
     wc mouse?.*
     #	     0       0       0 mouseC.fa
     #	305991  305937 6910111 mouseP.fa
     #	 34475  172467 2195057 mouseP.info
     #	340466  478404 9105168 total
 
     #	the equivalent Mm7 files:
     #      0       0       0 mouseC.fa
     # 300968  300914 6798466 mouseP.fa
     #  33838  169275 2153113 mouseP.info
     # 334806  470189 8951579 total
     #	the equivalent Mm6 files:
     #	     0       0       0 mouseC.fa
     #	293305  293251 6624638 mouseP.fa
     #	 32890  164528 2087271 mouseP.info
     #	326195  457779 8711909 total
     #	the equivalent Mm5 files:
     #	     0       0       0 mouseC.fa
     #	286740  286686 6474893 mouseP.fa
     #	 32232  161234 2044810 mouseP.info
     #	318972  447920 8519703 total
 
     #	copy the primers over to some filesystem close to the klusters
     #	and split them up to have a small number of sequences in one file
     
 
     mkdir /cluster/bluearc/mm8/stsMarkers
     cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers
     cd /cluster/bluearc/mm8/stsMarkers
     cp -p /cluster/data/mm8/11.ooc .
     mkdir split
     #	400 files for 34,475 sequences, == about 80 sequences per file
     faSplit sequence mouseP.fa 400 split/mm_
 
 
     # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
     #	This process could convert to a modern version of blat with the
     #	filters as described, for example, in the STS markers build in Hg18
 
     #  CLUSTER RUN FOR THE STS PRIMERS
     ssh kk
     mkdir /cluster/data/mm8/bed/STSmarkers/primer
     mkdir /cluster/data/mm8/bed/STSmarkers/ePCR
     cd /cluster/data/mm8/bed/STSmarkers/primer
     mkdir out
 
     #	interestingly, this blat.2 binary did not function correctly
     #	when given nib files.  It finds only about 1/4th of the number of
     #	alignments that it gets when it uses fa files for the target
     #	sequence.
 
     ls -1S /cluster/bluearc/mm8/stsMarkers/split > primers.list
     ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
 
     cat << '_EOF_' > runBlat2.csh
 #!/bin/csh -fe
 set primer = /cluster/bluearc/mm8/stsMarkers/split/$1
 set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
 set ooc = /cluster/bluearc/mm8/stsMarkers/11.ooc
 set root2 = $2:r
 mkdir -p out/${root2}
 set out = $3
 
 /cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
         -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
 '_EOF_'
     #	<< happy emacs
     chmod +x runBlat2.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 primers.list chr.list template jobList
     para create jobList
     para try ... check ... push ... etc ...
 # Completed: 12104 of 12104 jobs
 # CPU time in finished jobs:    1075037s   17917.28m   298.62h   12.44d  0.034 y
 # IO & Wait Time:               7444257s  124070.95m  2067.85h   86.16d  0.236 y
 # Average job time:                 704s      11.73m     0.20h    0.01d
 # Longest finished job:           61869s    1031.15m    17.19h    0.72d
 # Submission to last job:        168538s    2808.97m    46.82h    1.95d
     #	some of the jobs got stuck for unknown reasons.  Had to find
     #	them and kill them on their nodes.  Their blat.2 process was
     #	stuck and would not kill.  Don't know what happened there.
 
     # on the file server
     ssh kkstore01
     cd /cluster/data/mm8/bed/STSmarkers/primer
     time pslSort dirs primers.raw.psl temp out/chr*
     #	-rw-rw-r--   1 586124177 Feb 26 21:28 primers.raw.psl
 
     #	Filter alignments by comparing (qEnd-qStart) with (tEnd-tStart):
     #	the two spans should not differ by more than 100 bases.
     #	In Mm7 this filtered out 1,028,202 alignments, or
     #	17.4% = 100.0 * 1028202 / 5921712
     time pslSort dirs stdout temp out/chr* | awk -F"\t" '
 { if (((($13 - $12) - ($17 - $16)) > -100) &&
 	((($13 - $12) - ($17 - $16)) < 100)) {print}
 }
 ' > primers.psl.100
 
     rmdir temp
 
     wc -l *.100 *.psl
     #	5445367 primers.raw.psl
     #	4500528 primers.psl.100
     #	 944839 difference
 
     # a rough comparison with previous results:
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.100
     #	4893510  102763628  510563575 primers.psl.100
     wc primers.psl  (unfiltered, Mm7)
     #	5921712 124355891 636898117 primers.psl
     wc /cluster/data/mm6/bed/STSmarkers/primer/primers.psl
     #	5724127 120206606 615248041
     wc /cluster/data/mm5/bed/STSmarkers/primer/primers.psl
     #	5719969 120119288 590806241
     wc /cluster/data/mm4/bed/STSmarkers/primer/primers.psl
     #	5745617 120657896 592135728 
 
     # another kluster run for the ePCR
     ssh pk
     cd /cluster/data/mm8/bed/STSmarkers/ePCR
     ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
 
     #	pick up e-PCR source from
     #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
     #	version 2.3.1 11 Feb 2005
     #	Had to add the following to both re-PCR_main.cpp and
     #	e-PCR_main.cpp to get them to compile on kolossus:
 // max and min Copied from /usr/include/mysql/my_global.h
 #define max(a, b)       ((a) >? (b))
 #define min(a, b)       ((a) <? (b))
 
     mkdir out
     cat << '_EOF_' > runPCR
 #!/bin/csh -fe
 /cluster/bin/x86_64/e-PCR /cluster/data/mm8/bed/STSmarkers/mouseP.info \
 	/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
 '_EOF_'
     # << happy emacs
     chmod +x runPCR
 
     cat << '_EOF_' > template
 #LOOP
 ./runPCR $(path1) {check out line+ out/$(num1).epcr}
 #ENDLOOP
 '_EOF_'
     # the mouseP.info was created above
     gensub2 chr.list single template jobList
     para create jobList
     para try
     para check
     para push
     ... etc ...
     # STARTED 2006-02-27 16:24
     #	There is a single job that produces no output:
     ./runPCR chrX_random.fa out/30.epcr
     #	WARNING: 96 STSs have primer shorter than W
     #	WARNING: 21 STSs have ambiguities within W of 3' end
     #	Not sure what's up with that
 # Completed: 33 of 34 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:      67601s    1126.69m    18.78h    0.78d  0.002 y
 # IO & Wait Time:                  1028s      17.13m     0.29h    0.01d  0.000 y
 # Average job time:                2080s      34.66m     0.58h    0.02d
 # Longest finished job:            5134s      85.57m     1.43h    0.06d
 # Submission to last job:          5134s      85.57m     1.43h    0.06d
 
     ssh kkstore01
     cd /cluster/data/mm8/bed/STSmarkers/ePCR
     # all those results become all.epcr
     cat out/*.epcr > all.epcr
 
     # comparing to previous results:
     wc -l all.epcr
     #	58088 all.epcr
     wc -l /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
     #	57709 /cluster/data/mm7/bed/STSmarkers/ePCR/all.epcr
     wc -l /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
     #	55871 /cluster/data/mm6/bed/STSmarkers/ePCR/all.epcr
     wc /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
     #	55677  222708 2945623 /cluster/data/mm5/bed/STSmarkers/ePCR/all.epcr
     wc /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
     #	74705  298820 3971712 /cluster/data/mm4/bed/STSmarkers/ePCR/all.epcr
     #	Mm4 seems to be out of whack
 
     cd /cluster/data/mm8/bed/STSmarkers/primer
 
     /cluster/bin/scripts/filterSTSPrimers \
     -mouse ../stsInfoMouse.bed primers.psl.100 \
         ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
 
     #  The output should show an increasing count:
     #	Reading name info
     #	Reading primer info
     #	Processing file
     #	100000
     #	200000
     #	300000
     #	...
     #	4500000
     #	Determining ePCR not found
     #
     wc -l primers.psl.filter.blat
     #	34026 primers.psl.filter.blat
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
     #	33986 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.blat
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
     #	33128 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.blat
     wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
     #	33476 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.blat
 
     # create accession_info.rdb
     touch empty_sequence.inf
     /cluster/bin/scripts/compileAccInfo -mouse \
 	/cluster/data/mm8 empty_sequence.inf
     # works with errors on missing randoms, etc...:
     # cat: /cluster/data/mm5/11/chr11_random.agp: No such file or directory
     # cat: /cluster/data/mm5/M/chrM_random.agp: No such file or directory
     mv accession_info.rdb accession_info.rdb.tmp
     /cluster/bin/scripts/sorttbl Chr Ord Start < accession_info.rdb.tmp > \
 	accession_info.rdb
     rm accession_info.rdb.tmp
     # comparing results to previous
     #	Continuing the trend that began with Mm7, the numbers in
     #	accession_info.rdb continue to decrease.  Mm8 has far fewer
     #	fragments than Mm7 did:
     #	e.g.:
     [hiram@kkstore01 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
     #	21910 total
     [hiram@kkstore01 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
     #	70125 total
     [hiram@kkstore01 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
     #	170812 total
 
     wc -l accession_info.rdb
     #	20385 accession_info.rdb
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/accession_info.rdb
     #	44046  484510 3112816 accession_info.rdb
     wc /cluster/data/mm6/bed/STSmarkers/primer/accession_info.rdb
     #	93052 1023576 6824900 accession_info.rdb
     wc /cluster/data/mm5/bed/STSmarkers/primer/accession_info.rdb
     #	131845 1450299 9681940
     wc /cluster/data/mm4/bed/STSmarkers/primer/accession_info.rdb
     #	86935  956289 6374930 
 
     # creates epcr.not.found.nomatch and epcr.not.found.psl
     #	/cluster/bin/scripts/epcrToPsl
     #	Fixed this script (in mm7) to make it not look for contigs in the usual
     #	manner, we don't have those for this assembly	
     sed -e "s/mm7/mm8/g" /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl \
 	> ./epcrToPsl
     chmod +x epcrToPsl
     ./epcrToPsl -mouse \
 	epcr.not.found ../mouseP.info \
 	accession_info.rdb /cluster/data/mm8
 
     # Comparing results to previous:
     wc -l epcr*
     #	 501 epcr.not.found
     #	   0 epcr.not.found.nomatch
     #	 501 epcr.not.found.psl
     #	 158 epcrToPsl
     #	1160 total
 
     # Mm7 wc epcr*
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
     #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
     #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
     #	1106 total
 
     # Mm6 wc epcr*
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
     #	 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
     #	  63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
     #	 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
     #	 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
     #	1097 total
 
     cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
     wc -l primers.psl.filter
     #	34527 primers.psl.filter
 
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
     #	34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
 
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
     #	33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
 
     wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
     # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
 
     # create primers.psl.filter.initial
     #	if you do not run with scripts in your path, add the PATH business
     PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/extractPslInfo \
 	primers.psl.filter
     wc -l  primers.psl.filter.initial
     #	34513 primers.psl.filter.initial
 
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
     #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
     #	33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
     wc -l \
        /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
     # 33689 
 
     # create primers.psl.filter.initial.acc
     /cluster/bin/scripts/findAccession -agp \
 	-mouse primers.psl.filter.initial /cluster/data/mm8
     #	it complains about missing _random items, it is OK
     wc -l primers.psl.filter.initial.acc
     #	34513 primers.psl.filter.initial.acc
 
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial.acc
     #	34443
 
     # this needs to be -rat as that specifies how to scan the
     # stsInfoMouse.bed file and it does not work if you use -mouse
     /cluster/bin/scripts/getStsId -rat \
 	../stsInfoMouse.bed  primers.psl.filter.initial.acc \
 	| sort -k4,4n > primers.final
     wc -l primers.final
     #	34513 primers.final
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.final
     #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.final
 
     cd /cluster/data/mm8/bed/STSmarkers
     # stsMarkers.final is empty for mouse
     touch stsMarkers.final dummy
     #	if you do not run with scripts in your path, add the PATH business
     PATH=/cluster/bin/scripts:$PATH /cluster/bin/scripts/combineSeqPrimerPos \
 	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
     wc -l stsMarkers_pos.rdb
     #	33075 stsMarkers_pos.rdb
     wc -l /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb
     #	32869 /cluster/data/mm7/bed/STSmarkers/stsMarkers_pos.rdb
 
     wc -l /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
     #	31889 /cluster/data/mm6/bed/STSmarkers/stsMarkers_pos.rdb
     wc -l /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
     #	32085 /cluster/data/mm5/bed/STSmarkers/stsMarkers_pos.rdb
     wc -l /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
     #	31270 /cluster/data/mm4/bed/STSmarkers/stsMarkers_pos.rdb
 
     /projects/cc/hg/ytlu/bin/script/perl/createStsBed \
 	stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
 	| sort -k1,1 -k2,2n > stsMapMouse.bed
     #	Fixup --- 2006-04-12 - Hiram - it was found that column 12 had blanks
     #	as the first character of the field.  This isn't what is needed
     #	here.  Let's take those blanks out, turns out these were the
     #	only blanks in the file:
     mv stsMapMouse.bed stsMapMouse_withBlanks.bed
     sed -e "s/ //" stsMapMouse_withBlanks.bed > stsMapMouse.bed
 
     wc stsMapMouse.bed
     #	29888  308263 2087726 stsMapMouse.bed
     wc /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed
     #	29079  301678 2097544 stsMapMouse.bed
     wc /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
     #	29069  301535 2123622 /cluster/data/mm5/bed/STSmarkers/stsMapMouse.bed
 
     #  loading STS markers tables
     ssh hgwdev
     cd /cluster/data/mm8/bed/STSmarkers
     cp -p /cluster/data/mm7/bed/STSmarkers/ucscAlias.pl .
     ./ucscAlias.pl stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
     #	this does leave messages in ucscStsAlias.warnings but they seem
     #	to be very similar to Mm6 with just a few new ones
      
     wc ucscStsAlias.tab  (after applying filter to primers.psl above)
     #	144570  433667 3366815 ucscStsAlias.tab
     wc ucscStsAlias.tab  (before applying filter to primers.psl above)
     #	144570  433667 3366815 ucscStsAlias.tab
     wc /cluster/data/mm7/bed/STSmarkers/ucscStsAlias.tab
     #	141585  424725 3284106 ucscStsAlias.tab
     wc /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
     # 126624  379859 3037850 /cluster/store6/mm5/bed/STSmarkers/ucscStsAlias.tab
      
     #	Use the drop tables if reloading
     #	hgsql -e "drop table stsAlias;" mm8
     hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
     hgsql -e \
 	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
     #   reloaded stsMapMouseNew 2006-04-12 to remove blanks in col 12 - Hiram
     #	hgsql -e "drop table stsMapMouseNew;" mm8
     hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
     hgsql -e \
 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
     #	hgsql -e "drop table stsInfoMouseNew;" mm8
     hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
     hgsql -e \
      'load data local infile "stsInfoMouse.bed" into table stsInfoMouseNew;' mm8
 
     hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
     # load of all_sts_primer did not go as planned: 34527 record(s), 0
     # row(s) skipped, 19 warning(s) loading primer/primers.psl.filter
 
     # load primer sequences	
     mkdir /gbdb/mm8/stsMarker
     ln -s /cluster/data/mm8/bed/STSmarkers/mouseP.fa \
 	/gbdb/mm8/stsMarker/mouseP.fa
     # PLEASE NOTE: if you are going to reload this business, use the
     #	-replace option on this hgLoadSeq:
     #	hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
     # otherwise the seq and extFile tables will be out of sync.
     hgLoadSeq mm8 /gbdb/mm8/stsMarker/mouseP.fa
     #  Adding /gbdb/mm8/stsMarker/mouseP.fa
     #  33838 sequences
 
     featureBits mm8 all_sts_primer
     #	3746196 bases of 2567283971 (0.146%) in intersection
     featureBits mm7 all_sts_primer
     #	3757119 bases of 2583394090 (0.145%) in intersection
     featureBits mm6 all_sts_primer
     #	3677372 bases of 2597150411 (0.142%) in intersection
     featureBits mm8 stsMapMouseNew
     #	4801964 bases of 2567283971 (0.187%) in intersection
     featureBits mm7 stsMapMouseNew
     #	4805958 bases of 2583394090 (0.186%) in intersection
     featureBits mm6 stsMapMouseNew
     #	4638338 bases of 2597150411 (0.179%) in intersection
 
     hgsql -N mm8 -e "select count(*) from stsAlias;"
     #	141981
     hgsql -N mm7 -e "select count(*) from stsAlias;"
     #	140649
     hgsql -N mm6 -e "select count(*) from stsAlias;"
     #	137738
     hgsql -N mm5 -e "select count(*) from stsAlias;"
     #	122944
     hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
     #	60440
     hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
     #	59843
     hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
     #	58980
     hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
     #	58493
 
     #	compare old and new name lists:
     awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
     awk '{print $4}' /cluster/data/mm7/bed/STSmarkers/stsMapMouse.bed | \
 	sort -u > mm7.nameList
     comm -12 mm?.nameList | wc -l
     #	28253   <- 28,253 names in common
     comm -23 mm7.nameList mm8.nameList | wc -l
     #	174     <- 174 unique to mm7 list
     comm -13 mm7.nameList mm8.nameList | wc -l
     #	445     <- 445 unique to mm8 list
 
     #	previously, Mm6 vs Mm7:
     #	27320   <- 27,320 names in common
     #	188     <- 188 unique to mm6 list
     #	1107    <- 1,107 unique to mm7 list
 
 ####################################################################################
 # BUILD KNOWN GENES TABLES (STARTED 2/25/06, PART I DONE 2/27/06 Fan)
 
 # First build protein databases, sp060115 and proteins060115
 # See makeProteins060115.doc for details.
 
 # Create working subdirectories and temporary databases (kgMm8A)
 
   ssh hgwdev
   mkdir /cluster/store9/kg
   cd /cluster/store9/kg
   mkdir kgMm8A  
   ln -s /cluster/store9/kg/kgMm8A /cluster/store6/kgDB/bed/kgMm8A
   ln -s /cluster/store9/kg/kgMm8A /cluster/data/mm8/bed/kgMm8A
 
   hgsql mm8 -e "create database kgMm8A"   
   hgsql mm8 -e "create database kgMm8ATemp"
 
   mkdir /cluster/bluearc/kgDB/kgMm8A
   mkdir /cluster/bluearc/kgDB/kgMm8A/protBlat
   ln -s /cluster/bluearc/kgDB/kgMm8A/protBlat /cluster/store9/kg/kgMm8A/protBlat
   cd /cluster/store9/kg/kgMm8A/protBlat
 
 # Get all mouse protein sequences
 
   hgsql -N sp060115 -e \
   'select p.acc, p.val from protein p, accToTaxon x where x.taxon=10090 and p.acc=x.acc'\
   |awk '{print ">" $1;print $2}' >mouseProt.fa
 
   hgsql -N sp060115 -e \
   'select v.varAcc, p.val from varAcc v, protein p, accToTaxon x where v.parAcc = p.acc and x.taxon=10090   and v.parAcc=x.acc'\
   |awk '{print ">" $1;print $2}' \
   >mouseVarProt.fa
 
 # append var proteins to mouseProt.fa
   cat mouseVarProt.fa >>mouseProt.fa
 
 # Prepare and perform cluster run for protein/genome alignment
 
   ssh pk
   cd /cluster/data/mm8/bed/kgMm8A/protBlat
   mkdir prot
   faSplit sequence mouseProt.fa 2000 prot/prot
   ls /cluster/bluearc/kgDB/kgMm8A/protBlat/prot/* > prot.lis
 
   ssh hgwdev
   cd /cluster/data/mm8/bed/kgMm8A/protBlat
   hgsql mm8 -N -e 'select chrom from chromInfo' > chrom.lis
   exit
   
   cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/x86_64/blat -t=dnax -q=prot /cluster/data/mm8/nib/$(path1).nib $(path2) {check out line+ /cluster/bluearc/kgDB/kgMm8A/protBlat/result/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 
   mkdir result
   gensub2 chrom.lis prot.lis gsub jobList
 
   para create jobList
   para try
   para check
   para push
   para check ...
 
 # started 8:15 AM 2/25/06, done 3:12 AM 2/26/06.
 # Two jobs crashed due to empty results; they were pushed again and finished OK in a few minutes.
 
 # Completed: 67354 of 67354 jobs
 # CPU time in finished jobs:   12580047s  209667.46m  3494.46h  145.60d  0.399 y
 # IO & Wait Time:                237270s    3954.49m    65.91h    2.75d  0.008 y
 # Average job time:                 190s       3.17m     0.05h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:           19991s     333.18m     5.55h    0.23d
 # Submission to last job:         68128s    1135.47m    18.92h    0.79d
 
 # collect BLAT results
 
    pslSort -nohead dirs raw.psl temp result
    pslReps -nohead -minCover=0.80 -minAli=0.80 -nearTop=0.002 raw.psl protBlat.psl /dev/null
 
    ssh hgwdev
    cd /cluster/bluearc/kgDB/kgMm8A/protBlat
    hgLoadPsl mm8 protBlat.psl
 
 # create all_mrna.psl and tight_mrna.psl
 
    hgsql mm8 -N -e "select * from all_mrna" |cut -f 2-22 >all_mrna.psl
 
    pslReps -minCover=0.40 -minAli=0.97 -nearTop=0.002 \
            all_mrna.psl tight_mrna.psl /dev/null
 
 # Save a copy of the following mm8 tables, to be used later to construct 
 # kgMore and kgEvenmore
 
 all_mrna
 gbCdnaInfo
 gbExtFile
 gbLoaded
 gbSeq
 gbStatus
 refFlat
 refGene
 refLink
 refSeqAli
 refSeqStatus
 refSeqSummary
 xenoMrna
 xenoRefFlat
 xenoRefGene
 xenoRefSeqAli
 
 # Use overlapSelect to get protein and mRNA alignment overlaps   
    overlapSelect  -statsOutput  -dropped=protOut.psl -overlapThreshold=0.90 \
    -selectFmt=psl -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.stat
 
    overlapSelect  -mergeOutput  -dropped=protOut.psl -overlapThreshold=0.90 -selectFmt=psl \
    -inFmt=psl tight_mrna.psl  protBlat.psl protMrna.out
 
 # Create protein/mRNA pair and protein lists
    cut -f 10,31 protMrna.out|sort -u >spMrna.tab
    cut -f 10    protMrna.out|sort -u >protein.lis
    cp -p protein.lis /cluster/data/mm8/bed/kgMm8A
 
 # Load spMrna.tab into spMrna table in temp DB.
    hgsql kgMm8ATemp < ~/src/hg/lib/spMrna.sql
    hgsql kgMm8ATemp -e 'load data local infile "spMrna.tab" into table spMrna'
    hgsql kgMm8ATemp -e 'create index mrnaID on spMrna(mrnaID)'
 
 # Prepare and perform cluster run of protein/mRNA alignment
 
 # Get mRNA fa file.
    cd /cluster/data/mm8/bed/kgMm8A
    /cluster/data/genbank/bin/i386/gbGetSeqs -native -db=mm8 \
    -gbRoot=/cluster/data/genbank genbank mrna mrna.fa
 
 # Create mrnaSeq table in kgMm8ATemp DB.
 
    faToTab mrna.fa mrnaSeq.tab
 
    hgsql kgMm8ATemp -e 'drop table mrnaSeq'
    hgsql kgMm8ATemp <~/src/hg/lib/mrnaSeq.sql
    hgsql kgMm8ATemp -e 'load data local infile "mrnaSeq.tab" into table mrnaSeq'
 
 # Prepare files for cluster run
 
    cd /cluster/bluearc/kgDB/kgMm8A
    ~/src/hg/protein/KG2.sh kgMm8A mm8 060115
 
 # Perform cluster run of protein/mRNA alignment
    ~/src/hg/protein/KG3.sh kgMm8A mm8 060115
 
 # Collect cluster run results
    cd kgBestMrna
 
    ls out | sed -e 's/prot/do1 prot/g' >doall
 
 # create do1 with the following 2 lines:
    cat << '_EOF_' > do1
 echo processing $1
 cat out/$1/*.out >>protMrnaRaw.psl
 '_EOF_'
 
    chmod +x do*
    doall
 
 # Filter out low quality alignments
    pslReps -nohead -singleHit -minAli=0.9 protMrnaRaw.psl protMrnaBlat.psl /dev/null
    cut -f 10,14 protMrnaBlat.psl |sort -u >protMrna.lis
    wc protMrna.lis
 
 # Load BLAT results into temp DB.
    ssh hgwdev
    cd /cluster/store9/kg/kgMm8A/kgBestMrna
    hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaBlat.sql
    hgsql kgMm8ATemp -e 'load data local infile "protMrnaBlat.psl" into table protMrnaBlat'
    hgsql kgMm8ATemp -e 'create index tName on protMrnaBlat(tName)'
 
 # Create CDS files from protein/mRNA alignment results.
    hgsql kgMm8ATemp -N -e \
    'select qName,"_",tName,tStart+1,":",tEnd+3 from protMrnaBlat order by qName,tName,tEnd-tStart desc'\
    |sed 's/\t_\t/_/g'|sed 's/\t:\t/../g' >protMrna.cds
 
 # Create protMrna.psl with proteinID_mrnaID as query ID.
    cut -f 22-30 ../protBlat/protMrna.out > j1.tmp
    cut -f 32-42 ../protBlat/protMrna.out > j2.tmp
    cut -f 10,31 ../protBlat/protMrna.out|sed -e 's/\t/_/g' >j3.tmp
    paste j1.tmp j3.tmp j2.tmp >protMrna.psl
    rm j1.tmp j2.tmp j3.tmp
 
 # Run mrnaToGene to create protMrna.gp
    bash
    mrnaToGene -cdsFile=protMrna.cds protMrna.psl protMrna.gp 2>protMrna.err >protMrna.log
    exit
 
 # move kgBestMrna/clusterRun to /san/sanvol1 to save space on store9, leaving a symlink in its place
 
    mv /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna
    ln -s /san/sanvol1/scratch/fan/mm8/kgMm8A/kgBestMrna/clusterRun \
    /cluster/store9/kg/kgMm8A/kgBestMrna/clusterRun
 
 # Prepare refGene and all_mrna gp files.
 
    cd ..
    cp -p base/refGene.tab ref.gp
 
    hgsql mm8 -N -e \
    'select gbCdnaInfo.acc,cds.name from gbCdnaInfo,cds,all_mrna where all_mrna.qName=gbCdnaInfo.acc and   gbCdnaInfo.cds=cds.id' \
    |sort -u > all_mrna.cds
 
    cat base/all_mrna.tab |cut -f 2-22 >all_mrna.psl
    bash
    mrnaToGene -cdsFile=all_mrna.cds all_mrna.psl all_mrna.gp 2>all_mrna.err > all_mrna.log
    exit
 
 # Align proteins to RefSeq.
 
    overlapSelect -inCds -statsOutput -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
    protBlat/protBlat.psl ref.gp ref.stat
    overlapSelect -inCds -dropped=refOut1.gp -overlapThreshold=0.90 -selectFmt=psl -inFmt=genePred\
    protBlat/protBlat.psl ref.gp protRef.gp
 
    overlapSelect -mergeOutput -selectCds -dropped=protOut1.psl -overlapThreshold=0.80 -inFmt=psl\
    -selectFmt=genePred ref.gp protBlat/protBlat.psl protRef.out
 
    cut -f 10,22 protRef.out | sort -u >spRef.tab
    cut -f 10 protRef.out    | sort -u >protRef.lis
 
    hgsql kgMm8ATemp -e 'drop table spRef'
    hgsql kgMm8ATemp <~/src/hg/lib/spRef.sql
    hgsql kgMm8ATemp -e 'load data local infile "spRef.tab" into table spRef'
 
 # Prepare and perform cluster runs for protein/RefSeq alignments
 
    ~/src/hg/protein/KGRef2.sh kgMm8A mm8 060115
    ~/src/hg/protein/KGRef3.sh kgMm8A mm8 060115
 
    cd kgBestRef
    ls out | sed -e 's/prot/do1 prot/g' >doall
 
    cat << '_EOF_' > do1
 echo processing $1
 cat out/$1/*.out >>protRefRaw.psl
 '_EOF_'
 
    chmod +x do*
    doall
 
 # Filter out low quality alignments.
    pslReps -nohead -singleHit -minAli=0.9 protRefRaw.psl protRefBlat.psl /dev/null
    cut -f 10,14 protRefBlat.psl |sort -u >protRef.lis
    wc protRef.lis
 
    hgsql kgMm8ATemp -e 'drop table protRefBlat'
    hgsql kgMm8ATemp < ~/src/hg/lib/protRefBlat.sql
    hgsql kgMm8ATemp -e 'load data local infile "protRefBlat.psl" into table protRefBlat'
    hgsql kgMm8ATemp -e 'create index tName on protRefBlat(tName)'
 
 # Run gene-check to filter out invalid gp entries
    cd /cluster/data/mm8/bed/kgMm8A
    cat ref.gp kgBestMrna/protMrna.gp all_mrna.gp >kgCandidate0.gp
    gene-check  -incl-ok -ok-genepred-out kgCandidate0.passed.gp -nib-dir /cluster/data/mm8/nib kgCandidate0.gp kgCandidate0.check
 
    hgsql kgMm8ATemp -e 'drop table kgCandidate0'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate0.sql 
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidate0.gp" into table kgCandidate0'
 
    hgsql kgMm8ATemp -e 'drop table geneCheck'
    hgsql kgMm8ATemp < ~/src/hg/lib/geneCheck.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidate0.check" into table geneCheck ignore 2 lines'
 
 # Run kgCheck to get all KG candidates that pass the KG gene check criteria
 
    kgCheck kgMm8ATemp mm8 kgCandidate0 geneCheck kgCandidate.tab
    hgsql kgMm8ATemp -e  'drop table kgCandidate'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidate.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidate.tab" into table kgCandidate'
    hgsql kgMm8ATemp -e 'create index alignID on kgCandidate(alignID)'
 
 # Construct the kgCandidateX table that has alignID in the name field. 
    cut -f 2-10 kgCandidate.tab >j2.tmp
    cut -f 11 kgCandidate.tab >j1.tmp
    paste j1.tmp j2.tmp >kgCandidateX.tab
 
    hgsql kgMm8ATemp -e  'drop table kgCandidateX'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateX.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidateX.tab" into table kgCandidateX'
 
 # Score protein/mRna and protein/RefSeq alignments
 
    kgResultBestMrna2 060115 kgMm8ATemp mm8 protMrnaBlat|sort -u >protMrnaBlatScore.tab
    kgResultBestRef2  060115 kgMm8ATemp mm8 protRefBlat|sort -u >protRefScore.tab
 
 # Combine scoring results and load them into temp DB.
    cat protMrnaBlatScore.tab protRefScore.tab >protMrnaScore.tab
    hgsql kgMm8ATemp -e 'drop table protMrnaScore'
    hgsql kgMm8ATemp < ~/src/hg/lib/protMrnaScore.sql
    hgsql kgMm8ATemp -e 'load data local infile "protMrnaScore.tab" into table protMrnaScore'
    hgsql kgMm8ATemp -e 'create index mrnaAcc on protMrnaScore(mrnaAcc)'
 
 # Run kgGetCds to get CDS structure of each gene
 
    kgGetCds kgMm8ATemp 060115 kgCandidateX jY.tmp
 # G171564 does not have cds.
 # G171565 does not have cds.
    cat jY.tmp |sort -u >kgCandidateY.tab
    rm jY.tmp
    hgsql kgMm8ATemp -e  'drop table kgCandidateY'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateY.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidateY.tab" into table kgCandidateY'
 
 # Run kgPickPrep to replace long cds structure string with cdsId.
    kgPickPrep kgMm8ATemp kgCandidateZ.tab
    hgsql kgMm8ATemp -e  'drop table kgCandidateZ'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgCandidateZ.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgCandidateZ.tab" into table kgCandidateZ'
    hgsql kgMm8ATemp -e 'create index cdsId on kgCandidateZ(cdsId)'
 
 # Run kgPick to pick a representative mRNA/protein pair for each unique CDS structure.
 
    kgPick kgMm8ATemp mm8 sp060115 kg3.tmp dupSpMrna.tmp
    sort -u dupSpMrna.tmp >dupSpMrna.tab
 
 # Create put back list
 
 # gbGetSeqs2, a modified version of gbGetSeqs, outputs the RefSeq IDs at the beginning of each output line.
 
    gbGetSeqs2 -gbRoot=/cluster/data/genbank db=mm8 -get=ra RefSeq mrna ref.ra
    cat ref.ra | sed -e 's/ /\t/' | sort -u >refRa.tab
 
    hgsql mm8 -e  'drop table refRa'
    hgsql mm8 < ~/src/hg/lib/refRa.sql
    hgsql mm8 -e  'load data local infile "refRa.tab" into table refRa ignore 1 lines'
 
     hgsql mm8 -N -e \
 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="selenocysteine" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and  r3.val="Mus musculus"' \
     >kgPutBack2.tab
 
     hgsql mm8 -N -e \
 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%ribosomal frameshift%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
     >>kgPutBack2.tab
 
     hgsql mm8 -N -e \
 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="cno" and r.val like "%non-AUG%" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
     >>kgPutBack2.tab
 
     hgsql mm8 -N -e \
     'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="translExcept" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
     >>kgPutBack2.tab 
 
     hgsql mm8 -N -e \
 'select r.acc, r.attr, r.val from refRa r, refRa r2, refRa r3 where r.attr="exception" and r.acc=r2.acc and r2.attr="rss" and r2.val="rev" and r2.val="rev" and r3.acc=r.acc and r3.attr="org" and r3.val="Mus musculus"' \
     >>kgPutBack2.tab
 
    hgsql kgMm8ATemp -e 'drop table kgPutBack2'
    hgsql kgMm8ATemp < ~/src/hg/lib/kgPutBack2.sql
    hgsql kgMm8ATemp -e  'load data local infile "kgPutBack2.tab" into table kgPutBack2'
 
    kgPutBack kgMm8ATemp mm8 sp060115 kgPutBack2 kgPutBack2.gp
 # No matching protein found for NM_008523.
 # No matching protein found for NM_194444.
 # No matching protein found for NM_206941.
 
 # Sort KG genes to make the kg4.gp table file.
    cat kgPutBack2.gp kg3.tmp > kg4.tmp
    ~/kent/src/hg/protein/sortKg.pl kg4.tmp >knownGene.tab
 
    hgsql kgMm8ATemp -e  'drop table knownGene'
    hgsql kgMm8ATemp < ~/src/hg/lib/knownGene.sql
    hgsql kgMm8ATemp -e  'load data local infile "knownGene.tab" into table knownGene'
 
 # Load data into mm8 knownGene table.
    hgsql mm8 -e  'drop table knownGene'
    hgsql mm8 < ~/src/hg/lib/knownGene.sql
    hgsql mm8 -e  'load data local infile "knownGene.tab" into table knownGene'
   
 # Load dupSpMrna table after knownGene table is loaded so that joinerCheck does not complain.
 
    hgsql mm8 -e  'drop table dupSpMrna'
    hgsql mm8 < ~/src/hg/lib/dupSpMrna.sql
    hgsql mm8 -e  'load data local infile "dupSpMrna.tab" into table dupSpMrna'
 
 # Perform analysis on KG
 
 nice featureBits mm8 knownGene
 # 54684224 bases of 2567283971 (2.130%) in intersection
 nice featureBits mm8 knownGene:cds
 # 28459053 bases of 2567283971 (1.109%) in intersection
   
 nice featureBits mm8 refGene
 # 46256526 bases of 2567283971 (1.802%) in intersection
 nice featureBits mm8 refGene:cds
 # 27221018 bases of 2567283971 (1.060%) in intersection
   
 nice featureBits mm8 refGene knownGene
 # 43441486 bases of 2567283971 (1.692%) in intersection
 nice featureBits mm8 refGene:cds knownGene:cds
 # 25164531 bases of 2567283971 (0.980%) in intersection
 
 nice featureBits mm7 knownGene
 # 53165921 bases of 2583394090 (2.058%) in intersection
 nice featureBits mm7 knownGene:cds
 # 27531524 bases of 2583394090 (1.066%) in intersection
  
 nice featureBits mm7 refGene
 # 46425940 bases of 2583394090 (1.797%) in intersection
 nice featureBits mm7 refGene:cds
 # 27319308 bases of 2583394090 (1.057%) in intersection
  
 nice featureBits mm7 refGene knownGene
 # 41777202 bases of 2583394090 (1.617%) in intersection
 nice featureBits mm7 refGene:cds knownGene:cds
 # 24297646 bases of 2583394090 (0.941%) in intersection
 
 # Build knownGeneMrna and knownGenePep tables.
 
    kgPepMrna kgMm8ATemp mm8 060115
    hgsql mm8 -e  'drop table knownGeneMrna'
    hgsql mm8 < ~/src/hg/lib/knownGeneMrna.sql
    hgsql mm8 -e  'load data local infile "knownGeneMrna.tab" into table knownGeneMrna'
    hgsql mm8 -e  'drop table knownGenePep'
    hgsql mm8 < ~/src/hg/lib/knownGenePep.sql
    hgsql mm8 -e  'load data local infile "knownGenePep.tab" into table knownGenePep'
 
 # Build kgXref table
 
    kgXref2 kgMm8ATemp 060115 mm8
 
    hgsql mm8 -e  'drop table kgXref'
    hgsql mm8 < ~/src/hg/lib/kgXref.sql
    hgsql mm8 -e  'load data local infile "kgXref.tab" into table kgXref'
 
 # Build spMrna table
 
    hgsql mm8 -N -e 'select proteinID, name from knownGene' >kgSpMrna.tab
 
    hgsql mm8 -e  'drop table spMrna'
    hgsql mm8 <~/src/hg/lib/spMrna.sql
    hgsql mm8 -e 'load data local infile "kgSpMrna.tab" into table spMrna'
 
 # Build kgProtMap table
 
     ssh hgwdev
     cd /cluster/store9/kg/kgMm8A
     ln -s protBlat/tight_mrna.psl .
     ~/src/hg/protein/kgProtMap2.sh kgMm8A mm8 060115
 
 #####################################
 # Build alias tables. (DONE 2/28/06, Fan)		
 
    ssh hgwdev
    cd /cluster/store9/kg/kgMm8A
    mkdir alias
    cd alias
    kgAliasM mm8 proteins060115
 
 #	kgAliasKgXref reads from mm8.knownGene.proteinID,
 #	mm8.knownGene.name, mm8.kgXref.geneSymbol
 #	to create kgAliasKgXref.tab
 
    kgAliasKgXref mm8
 
 #	kgAliasRefseq reads from mm8.knownGene.name,
 #	mm8.knownGene.proteinID, mm8.kgXref.refseq
 #	to create kgAliasRefseq.tab
 
    kgAliasRefseq mm8
 
    hgsql sp060115 -N -e 'select name,gene.val from mm8.knownGene,displayId,gene where displayId.val=proteinID and displayId.acc=gene.acc' \
    | sort -u  > kgAliasP.tab
 
    hgsql mm8 -N -e 'select name, name from knownGene' >kgAliasDup.tab
    hgsql mm8 -N -e 'select mrnaID, dupMrnaID from dupSpMrna' >>kgAliasDup.tab
    
    cat kgAliasM.tab kgAliasRefseq.tab kgAliasKgXref.tab kgAliasP.tab kgAliasDup.tab| \
    sort |uniq > kgAlias.tab
 
    hgsql -e "drop table kgAlias;" mm8 
    hgsql mm8 < ~/kent/src/hg/lib/kgAlias.sql
    hgsql mm8 -e 'LOAD DATA local INFILE "kgAlias.tab" into table kgAlias' 
 
 #	kgProtAlias reads from mm8.knownGene.name,
 #	mm8.knownGene.proteinID, mm8.knownGene.alignID,
 #	proteins060115.spXref3.accession, proteins060115.spSecondaryID, proteins060115.pdbSP.pdb
 #	to create kgProtAlias.tab
 
    kgProtAlias mm8 060115
 
    hgsql mm8 -N -e \
    'select kgID, spDisplayID, protAcc from kgXref where protAcc != ""'\
    | sort -u >kgProtAliasNCBI.tab
 
 # include variant splice protein IDs
    
    hgsql mm8 -N -e \
    'select name, proteinID, parAcc from knownGene,sp060115.varAcc where varAcc=proteinID'\
    |sort -u >kgProtAliasDup.tab
 
 # include duplicate protein IDs from dupSpMrna table
    hgsql mm8 -N -e \
    'select name, knownGene.proteinID, dupProteinID from knownGene, dupSpMrna where name=mrnaID'\
    |sort -u >>kgProtAliasDup.tab
 
 # catch parent acc from dupProteinID too
    hgsql mm8 -N -e\
    'select name, knownGene.proteinID, parAcc from knownGene,dupSpMrna,sp060115.varAcc where name=mrnaID and dupProteinID=varAcc.varAcc'\
    |sort -u >>kgProtAliasDup.tab
     cat kgProtAliasNCBI.tab kgProtAlias.tab kgProtAliasDup.tab | sort -u > kgProtAliasAll.tab
 
     echo "`date` creating table kgProtAlias"
     hgsql mm8 -e "drop table kgProtAlias;"
     hgsql mm8 <~/src/hg/lib/kgProtAlias.sql; 
     hgsql mm8 -e 'LOAD DATA local INFILE "kgProtAliasAll.tab" into table kgProtAlias;'  
 
 # Build kgSpAlias table
 
     hgsql mm8 -e \
     'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
     hgsql mm8 -e \
     'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
     >>j.tmp
     cat j.tmp|sort -u |grep -v 'kgID' >mm8.kgSpAlias.tab
     rm j.tmp
 
     hgsql mm8 -e 'drop table kgSpAlias';
     hgsql mm8 < ~/src/hg/lib/kgSpAlias.sql
     hgsql mm8 -e 'load data local infile "mm8.kgSpAlias.tab" into table kgSpAlias'
 
 #############################################################################
 # 17-WAY VAR_MULTIZ - ALIGNMENTS (DONE - 2006-02-28 - 2006-03-02 - Hiram)
 #	Re-DONE with panTro2 in place of panTro1 - 2006-04-19 - Hiram)
 #	And again with xenTro2 in place of xenTro1 - 2006-04-24
 #	And again with danRer4 in place of danRer3 - 2006-05-02
     ssh kkstore04
     mkdir /cluster/data/mm8/bed/multiz17way
     cd /cluster/data/mm8/bed/multiz17way
 
     #	create tree diagram to guide work below.
     #	This tree was constructed from one that Adam is using for
     #	ENCODE work and a 27-way alignment.  Took that file and
     #	removed some of the entries, adding together the appropriate
     #	distances.
 
     cat << '_EOF_' > 17way.nh
 (((((((((
 (human_hg18:0.006690,chimp_panTro2:0.007571):0.024272,
   macaque_rheMac2:0.0592):0.023960,
 
   ((rat_rn4:0.081728,mouse_mm8:0.077017):0.229273,
       rabbit_oryCun1:0.206767):0.1065):0.023026,
 
 (cow_bosTau2:0.159182,dog_canFam2:0.147731):0.039450):0.028505,
 
 armadillo_dasNov1:0.149862):0.015994,
 
 (elephant_loxAfr1:0.104891,tenrec_echTel1:0.259797):0.040371):0.218400,
 
 monodelphis_monDom4:0.371073):0.189124,
 
 chicken_galGal2:0.454691):0.123297,
 
 xenopus_xenTro2:0.782453):0.156067,
 
 ((tetraodon_tetNig1:0.199381,fugu_fr1:0.239894):0.492961,
     zebrafish_danRer4:0.782561):0.156067);
 '_EOF_'
     #	<< happy emacs
 
     #	Render the guide tree and dump its pairwise distances, then list
     #	the lines mentioning mm8 ordered numerically by column 3.
     /cluster/bin/phast/draw_tree 17way.nh > 17way.ps
     /cluster/bin/phast/all_dists 17way.nh > 17way.distances.txt
     #	-i (ignore case) replaces the obsolete -y spelling of the same option
     grep -i mm8 17way.distances.txt | sort -k3,3n
     #	Print out that file for reference, and use the calculated
     #	distances in the table below to order the organisms and check
     #	the button order on the browser.  Zebrafish ends up before
     #	tetraodon and fugu on the browser despite its distance.
     #	And if you can fill in the table below entirely, you have
     #	succeeded in finishing all the alignments required.
     #
 #                         featureBits chainLink measures
 #                                           chainMm8Link   chain   linearGap
 #    distance                       on Mm8      on other   minScore
 #  1  0.1587 - rat rn4            (% 68.957)  (% 69.651)   3000     medium
 #  2  0.4677 - human hg18         (% 38.343)  (% 34.514)   3000     medium
 #  3  0.4686 - chimp panTro2      (% 37.549)  (% 33.614)   3000     medium
 #  4  0.4960 - macaque rheMac2    (% 34.718)  (% 33.170)   3000     medium
 #  5  0.5131 - rabbit oryCun1     (% 19.322)  (no swap )   3000     medium
 #  6  0.6142 - armadillo dasNov1  (% 16.825)  (no swap )   3000     medium
 #  7  0.6230 - dog canFam2        (% 32.281)  (% 34.255)   3000     medium
 #  8  0.6256 - elephant loxAfr1   (% 18.392)  (no swap )   3000     medium
 #  9  0.6344 - cow bosTau2        (% 26.832)  (% 24.293)   3000     medium
 # 10  0.7805 - tenrec echTel1     (% 11.412)  (no swap )   5000     loose
 # 11  1.0698 - opossum monDom4    (%  8.245)  (%  6.024)   5000     loose
 # 12  1.3425 - chicken galGal2    (%  2.552)  (%  5.414)   5000     loose
 # 13  1.7936 - frog xenTro2       (%  2.651)  (%  5.358)   5000     loose
 # 14  2.0157 - tetraodon tetNig1  (%  1.962)  (% 13.734)   5000     loose
 # 15  2.0562 - fugu fr1           (%  1.907)  (% 13.524)   5000     loose
 # 16  2.1059 - zebrafish danRer4  (%  2.105)  (%  3.576)   5000     loose
 
     #	Collect the pairwise net MAFs of every aligned species under
     #	mafLinks/<db>/ as symlinks (bash shell syntax here ...)
     cd /cluster/data/mm8/bed/multiz17way
     export H=/cluster/data/mm8/bed
     mkdir mafLinks
     for G in rn4 hg18 panTro2 rheMac2 oryCun1 dasNov1 canFam2 \
 	loxAfr1 bosTau2 echTel1 monDom4 galGal2 xenTro2 tetNig1 fr1 danRer4
     do
 	mkdir mafLinks/$G
 	#	stop right away when a pairwise alignment has no mafNet output
 	[ -d ${H}/blastz.${G}/mafNet ] || {
 	    echo "missing directory blastz.${G}/mafNet"
 	    exit 255
 	}
 	ln -s ${H}/blastz.$G/mafNet/*.maf.gz ./mafLinks/$G
     done
 
     #	Copy MAFs to some appropriate NFS server for kluster run
     ssh kkstore04
     mkdir /san/sanvol1/scratch/mm8/multiz17way
     cd /san/sanvol1/scratch/mm8/multiz17way
     time rsync -a --copy-links --progress \
 	/cluster/data/mm8/bed/multiz17way/mafLinks/ .
 
     #	We have about 5.9 Gb of data here, takes ~ 10 minutes to copy
 
     mkdir penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/multiz penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/maf_project penn
     cp -p /cluster/bin/penn/multiz.v11.x86_64/multiz-tba/autoMZ penn
 
     # the autoMultiz cluster run
     ssh pk
     cd /cluster/data/mm8/bed/multiz17way/
 
     # create species list and stripped down tree for autoMZ
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
 	17way.nh > tmp.nh
     echo `cat tmp.nh` > tree-commas.nh
     echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
     sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
 
     #	the maf directory here is a symlink to a /cluster/store8 
     #	directory to even out the data load on store9 and store8 on kkstore04
     mkdir /cluster/store8/mm8/bed/multiz17way/maf
     ln -s /cluster/store8/mm8/bed/multiz17way/maf ./maf
     mkdir run
     cd run
 
     #	NOTE: you need to set the db properly in this script
 
     #	autoMultiz <chrom> <result.maf> - per-chromosome cluster job script.
     #	What the csh script written below does:
     #	  1. makes a fresh scratch directory /scratch/tmp/$db/multiz.<chrom>
     #	     and copies ../tree.nh and ../species.lst into it (so the job
     #	     must be started from the run directory)
     #	  2. for every species in species.lst other than $db, provides
     #	     $db.<species>.sing.maf: unzipped from $pairs/<species>/<chrom>.maf.gz,
     #	     or copied from the plain .maf, or, when neither file exists,
     #	     a file holding only a maf header line
     #	  3. puts $binDir first in the path and runs autoMZ with the tree
     #	     from tree.nh to produce <chrom>.maf
     #	  4. copies <chrom>.maf to <result.maf> and removes the scratch dir
     #	The -e flag on the #! line makes csh exit as soon as a command fails.
     cat > autoMultiz << '_EOF_'
 #!/bin/csh -ef
 set db = mm8
 set c = $1
 set maf = $2
 set binDir = /san/sanvol1/scratch/$db/multiz17way/penn
 set tmp = /scratch/tmp/$db/multiz.$c
 set pairs = /san/sanvol1/scratch/$db/multiz17way
 rm -fr $tmp
 mkdir -p $tmp
 cp ../{tree.nh,species.lst} $tmp
 pushd $tmp
 foreach s (`cat species.lst`)
     set in = $pairs/$s/$c.maf
     set out = $db.$s.sing.maf
     if ($s == $db) then
 	continue
     endif
     if (-e $in.gz) then
 	zcat $in.gz > $out
     else if (-e $in) then
 	cp $in $out
     else
 	echo "##maf version=1 scoring=autoMZ" > $out
     endif
 end
 set path = ($binDir $path); rehash
 $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
 popd
 cp $tmp/$c.maf $maf
 rm -fr $tmp
 '_EOF_'
 # << happy emacs
     chmod +x autoMultiz
 
 cat  << '_EOF_' > template
 #LOOP
 autoMultiz $(root1) {check out line+ /cluster/store8/mm8/bed/multiz17way/maf/$(root1).maf}
 #ENDLOOP
 '_EOF_'
 # << happy emacs
 
     awk '{print $1}' /cluster/data/mm8/chrom.sizes > chrom.lst
     gensub2 chrom.lst single template jobList
     para create jobList
     # 34 jobs
     para try ... check ... push ... etc ...
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:     210573s    3509.55m    58.49h    2.44d  0.007 y
 # IO & Wait Time:                  4870s      81.17m     1.35h    0.06d  0.000 y
 # Average job time:                6337s     105.61m     1.76h    0.07d
 # Longest finished job:           17786s     296.43m     4.94h    0.21d
 # Submission to last job:         41755s     695.92m    11.60h    0.48d
 
     #	combine results into a single file for loading and gbdb reference
     ssh kkstore04
     cd /cluster/data/mm8/bed/multiz17way
     #	There used to be a mafFilter here with a minScore of 500, but it
     #	turns out that the scores in these maf files are pretty much
     #	useless.  They range from very large negatives to very large
     #	positives.
     time catDir maf > multiz17way.maf
     #	real    10m17.400s
     #	makes a 17 Gb file:
     #	-rw-rw-r--   1 17334936245 Apr 20 10:31 multiz17way.maf
 
     #	Create per-chrom individual maf files for downloads
     #	These are actually done after the annotation mafs are made
     ##	re-done with corrected annotated mafs 2007-03-28 - Hiram
     ssh kkstore04
     cd /cluster/data/mm8/bed/multiz17way
     mkdir mafDownloads
     time for M in anno/maf/chr*.maf
     do
 	B=`basename $M`
 	nice -n +19 cp -p ${M} mafDownloads/${B}
 	nice -n +19 gzip mafDownloads/${B}
 	echo ${B} done
     done
     #	real    59m16.415s
     cd mafDownloads
     md5sum *.gz > md5sum.txt
 
     #	deliver to downloads
     ssh hgwdev
     ln -s /cluster/data/mm8/bed/multiz17way/mafDownloads \
 	/usr/local/apache/htdocs/goldenPath/mm8/multiz17way
 
     # Load into database, actually annotation mafs are loaded later
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way
     mkdir /gbdb/mm8/multiz17way
     ln -s /cluster/data/mm8/bed/multiz17way/multiz17way.maf \
 	/gbdb/mm8/multiz17way
     time nice -n +19 hgLoadMaf mm8 multiz17way
     #	Loaded 11601035 mafs in 1 files from /gbdb/mm8/multiz17way
     #	real    27m29.960s
 
     time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
 	-maxSize=50000 mm8 multiz17waySummary multiz17way.maf
     #	Created 5782229 summary blocks from 65123362 components and
     #	11601035 mafs from multiz17way.maf
     #	real    32m34.791s
 
     # Dropped unused indexes (2006-05-09 kate)
     # NOTE: this is not required in the future, as the loader
     # has been fixed to not generate these indexes
     hgsql mm8 -e "alter table multiz17waySummary drop index chrom_2"
     hgsql mm8 -e "alter table multiz17waySummary drop index chrom_3"
 
     #	This was done for Mm7, same image can be reused
     # create tree image:
     #	cat << '_EOF_' > species.nh
 # ((((((human,(mouse,rat)),(dog,cow)),opossum),chicken),frog),(tetraodon,zebrafish))
 # '_EOF_'
 #    /cluster/bin/phast/draw_tree -b -s species.nh > species10.ps
     # photoshop to enhance, reduce the amount of whitespace to make it
     # smaller, then save as jpg
 #    cp species10.jpg /usr/local/apache/htdocs/images/phylo/Mm7_17way.jpg
 
     # creating upstream mafs (DONE - 2006-07-31 - Hiram)
     ssh hgwdev
     #	data load balancing in the kkstore04 filesystems
     mkdir /cluster/store8/mm8/bed/multiz17way/upstreamMafs
     cd /cluster/data/mm8/bed/multiz17way
     ln -s /cluster/store8/mm8/bed/multiz17way/upstreamMafs ./upstreamMafs
     #	rebuilt 2007-12-21 to fix difficulty in mafFrags when species.lst
     #	did not have mm8 as the first one
 # One gzipped maf per upstream size: featureBits writes the refGene
 # upstream regions as bed, perl rewrites the "_up..." name suffix into a
 # tab plus 0, the bed is sorted by chrom,start and mafFrags pulls the
 # matching multiz17way alignments in species.lst order.
 for S in 1000 2000 5000; do
     echo "making upstream${S}.maf"
     nice -n +19 $HOME/bin/$MACHTYPE/featureBits -verbose=2 mm8 \
             refGene:upstream:${S} -fa=/dev/null -bed=stdout |
         perl -wpe 's/_up[^\t]+/\t0/' |
         sort -k1,1 -k2,2n |
         $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm8 multiz17way \
             stdin stdout -orgs=species.lst |
         gzip -c > upstreamMafs/upstream${S}.maf.gz
     echo "done upstream${S}.maf.gz"
 done
 
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/mm8/multiz17way
     ln -s /cluster/data/mm8/bed/multiz17way/upstreamMafs/upstream*.maf.gz .
 ############################################################################
 # ANNOTATE MULTIZ17WAY MAF AND LOAD TABLES (DONE - 2006-04-24 - Hiram)
 #	RE-DONE 2006-05-03 with danRer4 in place of danRer3
 ## Redone to correct usage of nBeds and sizes file (2007-03-28 - Hiram)
     ssh kolossus
     mkdir /cluster/data/mm8/bed/multiz17way/anno
     cd /cluster/data/mm8/bed/multiz17way/anno
     mkdir maf run
     cd run
     rm -f sizes nBeds
     twoBitInfo -nBed /cluster/data/mm8/mm8.{2bit,N.bed}
     for DB in `cat /cluster/data/mm8/bed/multiz17way/species.lst`
     do
         ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
         ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
 	echo ${DB}.bed  >> nBeds
 	echo ${DB}.len  >> sizes
 	echo $DB
     done
 
     echo '#!/bin/csh -ef' > jobs.csh
     echo date >> jobs.csh
     # do smaller jobs first so you can see some progress immediately:
     for F in `ls -1rS ../../maf/*.maf`
     do
       echo nice mafAddIRows -nBeds=nBeds -sizes=sizes $F \
         /cluster/data/mm8/mm8.2bit ../maf/`basename $F` >> jobs.csh
       echo "echo $F" >> jobs.csh
     done 
     echo date >> jobs.csh
     chmod +x jobs.csh
     time ./jobs.csh > jobs.log 2>&1 &
     #	to watch progress;
     tail -f jobs.log
     #	real    218m16.272s
 
     # Load anno/maf
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way/anno/maf
     mkdir -p /gbdb/mm8/multiz17way/anno/maf
     ln -s /cluster/data/mm8/bed/multiz17way/anno/maf/*.maf \
       /gbdb/mm8/multiz17way/anno/maf
     time nice -n +19 hgLoadMaf \
 	-pathPrefix=/gbdb/mm8/multiz17way/anno/maf mm8 multiz17way
     #	Loaded 12484442 mafs in 34 files from /gbdb/mm8/multiz17way/anno/maf
     #	real    8m14.757s
 
     # Do the computation-intensive part of hgLoadMafSummary on a workhorse 
     # machine and then load on hgwdev:
     ssh hgwdev64
     cd /cluster/data/mm8/bed/multiz17way/anno/maf
     time cat *.maf | \
     	nice -n +19 hgLoadMafSummary mm8 -minSize=30000 -mergeGap=1500 \
 	-maxSize=200000 -test multiz17waySummary stdin
     #	Created 3153839 summary blocks from 65123362 components
     #	and 12484442 mafs from stdin
     #	real    13m25.961s
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way/anno/maf
     time nice -n +19 hgLoadSqlTab mm8 multiz17waySummary \
 	~/kent/src/hg/lib/mafSummary.sql multiz17waySummary.tab
     #	real    0m53.525s
     rm *.tab
 
 #######################################################################
 # MULTIZ17WAY MAF FRAMES (DONE - 2006-04-24 - 2006-04-25 - Hiram)
 #	RE-DONE 2006-05-03 to replace danRer3 with danRer4
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/multiz17way/frames
     cd /cluster/data/mm8/bed/multiz17way/frames
     # The following is adapted from MarkD's Makefile used for mm7...
 
     #------------------------------------------------------------------------
     # get the genes for all genomes
     # mRNAs with CDS.  single select to get cds+psl, then split that up and
     # create genePred
     # using mrna table as genes:
     mkdir genes
     #	Build genes/<db>.gp.gz for each species whose genes come from the
     #	all_mrna table.  Only one "for" header can be active in front of
     #	the do/done body: the full list below is the original run, and the
     #	danRer4-only header used for the re-run is kept commented out.
     for qDB in oryCun1 panTro2 rheMac2 canFam2 bosTau2 danRer4 loxAfr1 \
 	tetNig1 fr1
     #	single danRer4 re-run 2006-05-03, removed danRer3; that re-run
     #	used this header in place of the one above:
     #	for qDB in danRer4
     do
       # mktemp creates a uniquely named placeholder file; the name is
       # reused as the suffix of this iteration's scratch files
       tmpExt=`mktemp temp.XXXXXX`
       tmpMrnaCds=${qDB}.mrna-cds.${tmpExt}
       tmpMrna=${qDB}.mrna.${tmpExt}
       tmpCds=${qDB}.cds.${tmpExt}
       echo $qDB
       # one select returns: qName, cds, then the complete all_mrna row
       hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
                    from all_mrna,gbCdnaInfo,cds \
                    where (all_mrna.qName = gbCdnaInfo.acc) and \
                      (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
        ${qDB} > ${tmpMrnaCds}
       # columns 1-2 become the cds file, columns 4 onward the alignments
       cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
       cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
       mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} \
         stdout \
       | genePredSingleCover stdin stdout | gzip -2c \
         > /scratch/tmp/$qDB.tmp.gz
       rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
       # the result is written to /scratch/tmp first and then moved into genes/
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
     #	tried to use monDom4 in the above loop, but got this error:
     #	(450211944 450214274) out of range (0 400000000) in binKeeperAdd
     #	Which is interesting.  This should be looked into to see why
     #	this is here.
 
     # using knownGene for rn4 mm8 hg18
     # using refGene for galGal2
     # using mgcGenes for xenTro2
     # no genes for monDom4 dasNov1 echTel1
     # genePreds; (must keep only the first 10 columns for knownGene)
     for qDB in rn4 mm8 hg18 galGal2 xenTro2
     do
       # choose the gene table for this assembly
       case $qDB in
         xenTro2) geneTbl=mgcGenes ;;
         galGal2) geneTbl=refGene ;;
         *)       geneTbl=knownGene ;;
       esac
       echo hgsql -N -e 'select * from '"$geneTbl ${qDB}"
       # only the first 10 columns are kept before single-cover filtering
       hgsql -N -e "select * from $geneTbl" ${qDB} |
         cut -f 1-10 |
         genePredSingleCover stdin stdout |
         gzip -2c > /scratch/tmp/$qDB.tmp.gz
       mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
       rm -f $tmpExt
     done
 
     #------------------------------------------------------------------------
     # create frames
     #	beware, BASH syntax here ...
     # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
     clusterDir=/cluster/bluearc/mm8/multiz17wayFrames
     multizDir=/cluster/data/mm8/bed/multiz17way
     mafDir=$multizDir/mafDownloads
     geneDir=$multizDir/frames/genes
     clusterMafDir=${clusterDir}/maf
     clusterGeneDir=${clusterDir}/genes
     clusterFramesDir=${clusterDir}/mafFrames.kki
 
     # copy mafs to cluster storage
     mkdir $clusterDir
     ssh -x kkstore04 "rsync -av $mafDir/*.maf.gz $clusterMafDir/"
 
     # copy genes to cluster storage
     ssh -x kkstore04 "rsync -av $geneDir/*.gp.gz $clusterGeneDir/"
 
     # run cluster jobs
     tmpExt=`mktemp temp.XXXXXX`
     paraDir=$multizDir/frames/para.${tmpExt}
     cd /cluster/data/mm8/bed/multiz17way/frames
     mkdir mafFrames $paraDir
     mkdir ${clusterFramesDir}
     for qDB in `cat /cluster/data/mm8/bed/multiz17way/species.lst`
     do
       mkdir ${clusterFramesDir}/${qDB}
       for C in `awk '{print $1;}' /cluster/data/mm8/chrom.sizes`
       do
         if [ -e ${clusterGeneDir}/${qDB}.gp.gz ]; then
           echo /cluster/bin/scripts/mkMafFrames.pl ${qDB} mm8 \
             ${clusterGeneDir}/${qDB}.gp.gz ${clusterMafDir}/$C.maf.gz \
             ${clusterFramesDir}/${qDB}/$C.mafFrames \
             >> $paraDir/jobList
         fi
       done
     done
     rm -f $tmpExt
     ssh -x kki "cd ${paraDir} && para make jobList && para time"
 # Completed: 476 of 476 jobs
 # CPU time in finished jobs:       6235s     103.91m     1.73h    0.07d  0.000 y
 # IO & Wait Time:                 13538s     225.64m     3.76h    0.16d  0.000 y
 # Average job time:                  42s       0.69m     0.01h    0.00d
 # Longest finished job:             237s       3.95m     0.07h    0.00d
 # Submission to last job:          1242s      20.70m     0.34h    0.01d
 
     # combine results from cluster
     for qDB in \
 	`sed -e "s/ dasNov1//; s/ echTel1//; s/ monDom4//;" ../species.lst`
     do
       ssh -x kolossus "cat ${clusterFramesDir}/${qDB}/*.mafFrames | gzip -2c > ${multizDir}/frames/mafFrames/${qDB}.mafFrames.gz"
       echo "${qDB}"
     done
 
     #------------------------------------------------------------------------
     # load the database
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way/frames
     time nice -n +19 hgLoadMafFrames mm8 multiz17wayFrames \
 	mafFrames/*.mafFrames.gz
     #	real    1m11.457s
 
     #------------------------------------------------------------------------
     # clean up
     rm -rf ${clusterDir}
 
     ###
     # rebuild frames to get bug fix, using 1-pass maf methodology (2006-06-09 markd)
     ssh kkstore04
     cd /cluster/data/mm8/bed/multiz17way/frames
     #	keep the previous per-species results out of the way
     mv mafFrames/ mafFrames.old
     nice tcsh # easy way to get process niced
     #	One pass over all the mafs, one "db genePredFile" pair per species
     #	that has genes.  The command as first recorded listed the bosTau2
     #	pair a second time at the end; each species is now given once.
     #	NOTE(review): the effect of the repeated pair on the earlier
     #	output was not checked.
     #	tcsh syntax: >& sends stdout and stderr to the file "log"
     (cat  ../maf/*.maf | time genePredToMafFrames mm8 stdin stdout bosTau2 genes/bosTau2.gp.gz canFam2 genes/canFam2.gp.gz danRer4 genes/danRer4.gp.gz fr1 genes/fr1.gp.gz galGal2 genes/galGal2.gp.gz hg18 genes/hg18.gp.gz loxAfr1 genes/loxAfr1.gp.gz mm8 genes/mm8.gp.gz oryCun1 genes/oryCun1.gp.gz panTro2 genes/panTro2.gp.gz rheMac2 genes/rheMac2.gp.gz rn4 genes/rn4.gp.gz tetNig1 genes/tetNig1.gp.gz xenTro2 genes/xenTro2.gp.gz |  gzip >multiz17way.mafFrames.gz)>&log&
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way/frames
 
     #	load the table; the loader's output is mailed (tcsh |& pipes stderr too)
     hgLoadMafFrames mm8 multiz17wayFrames multiz17way.mafFrames.gz |&mail markd&
 
 
 ############################################################################
 # CREATE CONSERVATION WIGGLE WITH PHASTCONS
 #		(DONE - 2006-03-02 - Hiram)
 #	(RE-DONE - 2006-04-25 with panTro2 and xenTro2 - Hiram)
 #	(RE-DONE - 2006-05-03 with danRer4 instead of danRer3 - Hiram)
 
     #	Will skip this estimate for Mm8 since it was well done in Mm7
     #	and in Hg17, skip to the creation of the SS files
 # Estimate phastCons parameters
     ssh kkstore01
     mkdir /cluster/data/mm8/bed/multiz17way/cons
     cd /cluster/data/mm8/bed/multiz17way/cons
 
     # Create a starting-tree.mod based on chr2 (the largest one)
     /cluster/bin/phast/$MACHTYPE/msa_split ../maf/chr2.maf \
 	--refseq ../../../2/chr2.fa --in-format MAF \
 	--windows 100000000,1000 --out-format SS \
 	--between-blocks 5000 --out-root s1
     #	10 minutes
 
     /cluster/bin/phast/$MACHTYPE/phyloFit -i SS s1.*.ss \
 --tree "((((((((((hg18,panTro2),rheMac2),((rn4,mm8),oryCun1)),(bosTau2,canFam2)),dasNov1),(loxAfr1,echTel1)),monDom4),galGal2),xenTro2),((tetNig1,fr1),danRer4))" \
     --out-root starting-tree
     #	real    840m53.157s
     #	That is 14 hours !
 
     rm s1.*.ss
     # add up the C and G:
     grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
     #	0.407
     #	This 0.407 is used in the --gc argument below
 
     #	CONTINUE HERE, no estimation required
     # Create big bad bloated SS files on san filesystem (takes ~ 2h 20m)
     #	Increasing their size this time from 1,000,000 to 10,000,000 to
     #	slow down the phastCons pk jobs
     ssh kkstore04
     mkdir -p  /san/sanvol1/scratch/mm8/cons/ss
     cd  /san/sanvol1/scratch/mm8/cons/ss
     time for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes`
     do
       if [ -s /cluster/data/mm8/bed/multiz17way/maf/${C}.maf ]; then
 	mkdir ${C}
 	echo msa_split $C
 	chrN=${C/chr/}
 	chrN=${chrN/_random/}
 	/cluster/bin/phast/$MACHTYPE/msa_split \
 	    /cluster/data/mm8/bed/multiz17way/maf/${C}.maf \
 	    --refseq /cluster/data/mm8/${chrN}/${C}.fa \
 	    --in-format MAF --windows 4000000,0 --between-blocks 5000 \
 	    --out-format SS --out-root ${C}/${C}
       fi
     done &
     #	real    94m49.273s
 
     #  Again, going to SKIP this tuning business this time and use the
     #  previous numbers.
 
     # Create a random list of 50 1 mb regions  (do not use the _randoms)
     cd /san/sanvol1/scratch/mm8/cons/ss
     ls -1l chr*/chr*.ss | grep -v random | \
 	awk '$5 > 4000000 {print $9;}' | randomLines stdin 50 ../randomSs.list
 
     # Set up parasol directory to calculate trees on these 50 regions
     ssh pk
     mkdir /san/sanvol1/scratch/mm8/cons/treeRun1
     cd /san/sanvol1/scratch/mm8/cons/treeRun1
     mkdir tree log
 
     #	Tuning this loop should come back to here to recalculate 
     # Create little script that calls phastCons with right arguments
     #	--target-coverage of 0.20 is about right for mouse, will be
     #	tuned exactly below
     cat > makeTree.csh << '_EOF_'
 #!/bin/csh -fe
 set C=$1:h
 mkdir -p log/${C} tree/${C}
     /cluster/bin/phast/$MACHTYPE/phastCons ../ss/$1 \
       /cluster/data/mm8/bed/multiz17way/cons/starting-tree.mod \
       --gc 0.407 --nrates 1,1 --no-post-probs --ignore-missing \
       --expected-lengths 12 --target-coverage 0.17 \
       --quiet --log log/$1 --estimate-trees tree/$1
 '_EOF_'
     #	<< happy emacs
     chmod a+x makeTree.csh
 
     # Create gensub file
     cat > template << '_EOF_'
 #LOOP
 makeTree.csh $(path1)
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     # Make cluster job and run it
     gensub2 ../randomSs.list single template jobList
     para create jobList
     para try/push/check/etc
 # Completed: 50 of 50 jobs
 # CPU time in finished jobs:     354644s    5910.74m    98.51h    4.10d  0.011 y
 # IO & Wait Time:                   352s       5.86m     0.10h    0.00d  0.000 y
 # Average job time:                7100s     118.33m     1.97h    0.08d
 # Longest finished job:           29358s     489.30m     8.15h    0.34d
 # Submission to last job:         29446s     490.77m     8.18h    0.34d
 
     # Now combine parameter estimates.  We can average the .mod files
     # using phyloBoot.  This must be done separately for the conserved
     # and nonconserved models
     ssh kkstore01
     cd /san/sanvol1/scratch/mm8/cons/treeRun1
     #	average the conserved models; this one runs in the background
     ls -1 tree/chr*/*.cons.mod > cons.list
     time /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*cons.list' \
 	--output-average ../ave.cons.mod > cons_summary.txt 2>&1 &
     #	average the nonconserved models in the foreground meanwhile
     ls -1 tree/chr*/*.noncons.mod > noncons.list
     /cluster/bin/phast/$MACHTYPE/phyloBoot --read-mods '*noncons.list' \
 	--output-average ../ave.noncons.mod > noncons_summary.txt
     #	the cons job above was started with &, make sure it has finished
     #	writing ../ave.cons.mod before the averages are copied
     wait
     cd ..
     cp -p ave.*.mod /cluster/data/mm8/bed/multiz17way/cons
 
     #	measuring entropy
     #	consEntropy <target coverage> <expected lengths>
     #		 ave.cons.mod ave.noncons.mod --NH 9.78
     #	never stops with the --NH argument
     /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 \
                         ave.cons.mod ave.noncons.mod
 # XXXX - does not work:  2005-11-28
 # [hiram@kkstore01 /san/sanvol1/scratch/mm8/cons] /cluster/bin/phast/$MACHTYPE/consEntropy .17 12 ave.cons.mod ave.noncons.mod
 # ERROR: with no separate source alignment, ss_from_msas expects sequences of positive length and no SS object.
 
 #Transition parameters:gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
 # Relative entropy: H=1.454874 bits/site
 # Required length: N=7.596943 sites
 # Total entropy: NH=11.052595 bits
 
 # consEntropy .20 12 ave.cons.mod.1 ave.noncons.mod.1
 # Transition params: gamma=0.200000, omega=12.000000, mu=0.083333, nu=0.020833
 # Relative entropy: H=1.454874 bits/site
 # Required length: N=6.629337 sites
 # Total entropy: NH=9.644850 bits
 
 # consEntropy .10 12 ave.cons.mod.2 ave.noncons.mod.2
 # Transition params: gamma=0.100000, omega=12.000000, mu=0.083333, nu=0.009259
 # Relative entropy: H=1.527815 bits/site
 # Required length: N=7.205526 sites
 # Total entropy: NH=11.008713 bits
 
 # consEntropy .20 8 ave.cons.mod.3 ave.noncons.mod.3
 # Transition params: gamma=0.200000, omega=8.000000, mu=0.125000, nu=0.031250
 # Relative entropy: H=1.654878 bits/site
 # Required length: N=5.146793 sites
 # Total entropy: NH=8.517313 bits
 
 ### !!! ***  This one with .17 and 12 is the one that was finally used
 # consEntropy .17 12 ave.cons.mod.4 ave.noncons.mod.4
 # Transition params: gamma=0.170000, omega=12.000000, mu=0.083333, nu=0.017068
 # Relative entropy: H=1.478838 bits/site
 # Required length: N=6.753382 sites
 # Total entropy: NH=9.987159 bits
 
     #	SKIP to here passing by the tuning numbers
     ssh pk
     # Create cluster dir to do main phastCons run
     mkdir /san/sanvol1/scratch/mm8/cons/consRun3
     cd /san/sanvol1/scratch/mm8/cons
     cp /san/sanvol1/scratch/mm7/cons/elliotsEncode.mod .
     #	edit, change monDom2 to monDom4, hg17 to hg18, rheMac1 to
     #	rheMac2, rn3 to rn4, mm7 to mm8
     #	danRer3 to danRer4
     #	It looks like:
 ALPHABET: A C G T
 ORDER: 0
 SUBST_MOD: REV
 TRAINING_LNL: -988246.132962
 BACKGROUND: 0.295 0.205 0.205 0.295
 RATE_MAT:
   -1.165221    0.315494    0.589884    0.259843
    0.189778   -0.878194    0.208718    0.479698
    0.444622    0.261535   -0.885604    0.179447
    0.234867    0.720815    0.215191   -1.170872
 TREE: (((((((((((((hg18:0.006690,panTro2:0.007571):0.024272,(colobus_monkey:0.015404,(baboon:0.008258,rheMac2:0.028617):0.008519):0.022120):0.023960,(dusky_titi:0.025662,(owl_monkey:0.012151,marmoset:0.029549):0.008236):0.027158):0.066101,(mouse_lemur:0.059024,galago:0.121375):0.032386):0.017073,((rn4:0.081728,mm8:0.077017):0.229273,oryCun1:0.206767):0.023340):0.023026,(((bosTau2:0.159182,canFam2:0.147731):0.004946,rfbat:0.138877):0.010150,(hedgehog:0.193396,shrew:0.261724):0.054246):0.024354):0.028505,dasNov1:0.149862):0.015994,(loxAfr1:0.104891,echTel1:0.259797):0.040371):0.218400,monDom4:0.371073):0.065268,platypus:0.468116):0.123856,galGal2:0.454691):0.123297,xenTro2:0.782453):0.156067,((tetNig1:0.199381,fr1:0.239894):0.492961,danRer4:0.782561):0.156067);
 
 
     cd /san/sanvol1/scratch/mm8/cons/consRun3
     mkdir ppRaw bed
 
     # Create script to run phastCons with right parameters
     #	These parameters:
     #	--rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
     #	were taken from Kate's 17-way in Hg17, removing the
     #	--not-informative panTro2 since that isn't relevant here, nor
     #	would be --not-informative rn4 - Jim says rn4 is far enough away
     #	from mm8 that it is informative.
     #	This job is I/O intensive in its output files, thus it is all
     #	working over in /scratch/tmp/
     cat > doPhast << '_EOF_'
 #!/bin/csh -fe
 mkdir /scratch/tmp/${2}
 cp -p ../ss/${1}/${2}.ss ../elliotsEncode.mod /scratch/tmp/${2}
 pushd /scratch/tmp/${2} > /dev/null
 /cluster/bin/phast/${MACHTYPE}/phastCons ${2}.ss elliotsEncode.mod \
    --rho 0.28 --expected-length 14 --target-coverage 0.008 --quiet \
 	--seqname ${1} --idpref ${1} --viterbi ${2}.bed --score > ${2}.pp
 popd > /dev/null
 mkdir -p ppRaw/${1}
 mkdir -p bed/${1}
 mv /scratch/tmp/${2}/${2}.pp ppRaw/${1}
 mv /scratch/tmp/${2}/${2}.bed bed/${1}
 rm /scratch/tmp/${2}/elliotsEncode.mod
 rm /scratch/tmp/${2}/${2}.ss
 rmdir /scratch/tmp/${2}
 '_EOF_'
     # << happy emacs
     chmod a+x doPhast
 
     #	root1 == chrom name, file1 == ss file name without .ss suffix
     # Create gsub file
     cat > template << '_EOF_'
 #LOOP
 doPhast $(root1) $(file1)
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     # Create parasol batch and run it
     ls -1 ../ss/chr*/chr*.ss | sed 's/.ss$//' > in.list
 
     gensub2 in.list single template jobList
     para create jobList
     para try/check/push/etc.
     #	These jobs are very fast and very I/O intensive, even on the san
     #	they will hang it up as they work at full tilt.
 # Completed: 689 of 689 jobs
 # CPU time in finished jobs:      12806s     213.44m     3.56h    0.15d  0.000 y
 # IO & Wait Time:                 16079s     267.98m     4.47h    0.19d  0.001 y
 # Average job time:                  42s       0.70m     0.01h    0.00d
 # Longest finished job:              94s       1.57m     0.03h    0.00d
 # Submission to last job:           350s       5.83m     0.10h    0.00d
 
     # combine predictions and transform scores to be in 0-1000 interval
     #	it uses a lot of memory, so on kolossus:
     ssh kolossus
     cd /san/sanvol1/scratch/mm8/cons/consRun3
     #	The sed's and the sort get the file names in chrom,start order
     #	You might like to verify it is correct by first looking at the
     #	list it produces:
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | less
     #	if that looks right, then let it run:
     #	FOR NEXT TIME - the result file should be named:
     #	phastConsElements17way.bed since that is the name of the DB
     #	table that it is loaded into.  (instead of mostConserved.bed)
     find ./bed -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
 	| /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
     #	~ 1 minute
     cp -p mostConserved.bed /cluster/data/mm8/bed/multiz17way
 
     # Figure out how much is actually covered by the bed file as so:
     #	Get the non-n genome size from faSize on all chroms:
     ssh kkstore01
     cd /cluster/data/mm8
     faSize ?{,?}/chr*.fa
     #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
     #	1089350685 lower) in 34 sequences in 34 files
 
     cd /san/sanvol1/scratch/mm8/cons/consRun3
     #	The 2567283688 comes from the non-n genome as counted above.
     awk '
 {sum+=$3-$2}
 END{printf "%% %.2f = 100.0*%d/2567283688\n",100.0*sum/2567283688,sum}' \
 	mostConserved.bed
     #	--rho 0.28 --expected-length 14 --target-coverage 0.008
     #	% 5.40 = 100.0*138575691/2567283688 danRer4 instead of danRer3
     #	% 5.43 = 100.0*139309333/2567283688 panTro2 and xenTro2
     #	% 5.39 = 100.0*138300407/2567283688 panTro1 and xenTro1
 
     #	Aiming for 70% coverage in
     #	the following featureBits measurement on CDS:
     # Beware of negative scores when too high.  The lodToBedScore
     # will output an error on any negative scores.
 
     HGDB_CONF=~/.hg.conf.read-only time nice -n +19 featureBits mm8 \
 	-enrichment refGene:cds mostConserved.bed
     #	--rho 0.28 --expected-length 14 --target-coverage 0.008
     #	with danRer4 instead of danRer3:
     #	refGene:cds 1.062%, mostConserved.bed 5.398%, both 0.743%, cover
     #	69.99%, enrich 12.97x
     #	with panTro2 and xenTro2:
     #	refGene:cds 1.060%, mostConserved.bed 5.426%, both 0.740%, cover
     #	69.85%, enrich 12.87x
     #	with panTro1 and xenTro1:
     #	refGene:cds 1.060%, mostConserved.bed 5.387%, both 0.739%, cover
     #	69.71%, enrich 12.94x
 
     # Load most conserved track into database
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way
     #	the copy was already done above
     # cp -p /san/sanvol1/scratch/mm8/cons/consRun3/mostConserved.bed .
     time nice -n +19 hgLoadBed -strict mm8 phastConsElements17way \
 	mostConserved.bed
     #	Loaded 1883370 elements of size 5
     #	real    2m54.033s
 
     #	should measure the same as above
     time nice -n +19 featureBits mm8 -enrichment refGene:cds \
 	phastConsElements17way
     #	with danRer4 in place of danRer3:
     #	refGene:cds 1.062%, phastConsElements17way 5.398%, both 0.743%,
     #	cover 69.99%, enrich 12.97x
     #	with panTro2 and xenTro2:
     #	refGene:cds 1.060%, phastConsElements 5.426%, both 0.740%, cover
     #	69.85%, enrich 12.87x
     #	with panTro1 and xenTro1:
     #	refGene:cds 1.060%, phastConsElements 5.387%, both 0.739%, cover
     #	69.71%, enrich 12.94x
 
     # Create merged posterior probability file and wiggle track data files
     ssh kkstore04
     cd /san/sanvol1/scratch/mm8/cons/consRun3
     # the sed business gets the names sorted by chromName, chromStart
     #	so that everything goes in numerical order into wigEncode
     #	This was verified above to be correct
     time nice -n +19 find ./ppRaw -type f \
 	| sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat \
 	    | $HOME/bin/$MACHTYPE/wigEncode -noOverlap stdin \
 		phastCons17.wig phastCons17.wib
     #	real    15m59.846s
     #	-rw-rw-r--   1 1961998053 May  3 12:22 phastCons17.wib
     #	-rw-rw-r--   1  237229239 May  3 12:22 phastCons17.wig
 
     time nice -n +19 cp -p phastCons17.wi? /cluster/data/mm8/bed/multiz17way/
     #	real    1m21.329s
 
     #	prepare compressed copy of ascii data values for downloads
     ssh pk
     cd /san/sanvol1/scratch/mm8/cons/consRun3
     cat << '_EOF_' > gzipAscii.sh
 #!/bin/sh
 # For each ppRaw/chr* directory, concatenate its score files in the same
 # sed/sort order used for the wigEncode step and gzip the result into
 # phastCons17Scores/<chrom>.data.gz
 
 TOP=`pwd`
 export TOP
 
 mkdir -p phastCons17Scores
 
 for D in ppRaw/chr*
 do
     # strip the leading "ppRaw/" to get the chrom name; POSIX prefix
     # removal is used because ${D/ppRaw\/} is a bash-only substitution
     # and this script runs under /bin/sh
     C=${D#ppRaw/}
     out=phastCons17Scores/${C}.data.gz
     echo "========================== ${C} ${D}"
     find ./${D} -type f | sed -e "s#/# x #g; s#\.# y #g; s#-# z #g" \
 	| sort -k7,7 -k9,9n \
 	| sed -e "s# z #-#g; s# y #\.#g; s# x #/#g" | xargs cat |
 	    gzip > ${out}
 done
 '_EOF_'
     #	<< happy emacs
     chmod +x gzipAscii.sh
     time nice -n +19 ./gzipAscii.sh
     #	real    18m15.212s
 
     #	copy them for downloads
     ssh kkstore04
     #	this directory is actually a symlink from store9 to store8 to
     #	avoid the data full problem on store9
     mkdir /cluster/data/mm8/bed/multiz17way/phastCons17Scores
     cd /cluster/data/mm8/bed/multiz17way/phastCons17Scores
     cp -p  /san/sanvol1/scratch/mm8/cons/consRun3/phastCons17Scores/* .
 
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/mm8
     ln -s /cluster/data/mm8/bed/multiz17way/phastCons17Scores .
 
     # Load gbdb and database with wiggle.
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way
     ln -s `pwd`/phastCons17.wib /gbdb/mm8/wib/phastCons17.wib
     time nice -n +19 hgLoadWiggle mm8 phastCons17 phastCons17.wig
     #	real    2m55.836s
 
     #  Create histogram to get an overview of all the data
     ssh hgwdev
     cd /cluster/data/mm8/bed/multiz17way
     time nice -n +19 hgWiggle -doHistogram \
 	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
 	    -db=mm8 phastCons17 > histogram.data 2>&1
     #	real    28m24.388s
 
     #	create plot of histogram:
 
     cat << '_EOF_' | gnuplot > histo.png
 set terminal png small color \
         x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
 set size 1.4, 0.8
 set key left box
 set grid noxtics
 set grid ytics
 set title " Mouse Mm8 Histogram phastCons17 track"
 set xlabel " phastCons17 score"
 set ylabel " Relative Frequency"
 set y2label " Cumulative Relative Frequency (CRF)"
 set y2range [0:1]
 set y2tics
 set yrange [0:0.02]
 
 plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
         "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
 '_EOF_'
     #	<< happy emacs
 
     display histo.png &
 
 # QA NOTE: (ASZ: 5/1/2006) changed name of phastConsElements table to phastConsElements17way
 # QA NOTE: (ASZ: 5/1/2006) changed name of phastCons17 table to phastCons17way
 # Hiram Note:  phastCons17 never changed to phastCons17way at any time
 
 #########################################################################
 # MAKE FOLDUTR TABLES (DONE 2006-02-28, Fan)
 # First set up directory structure and extract UTR sequence on hgwdev
     ssh hgwdev
     cd /cluster/data/mm8/bed
     rm rnaStruct
 
     mkdir /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28
     ln -s /san/sanvol1/scratch/mm8/rnaStruct.2006-02-28 rnaStruct
     cd rnaStruct
     mkdir -p utr3/split utr5/split utr3/fold utr5/fold
     utrFa mm8 knownGene utr3 utr3/utr.fa
     utrFa mm8 knownGene utr5 utr5/utr.fa
 
 # Split up files and make files that define job.
     ssh pk
     cd /cluster/data/mm8/bed/rnaStruct
     faSplit sequence utr3/utr.fa 4000 utr3/split/s
     faSplit sequence utr5/utr.fa 4000 utr5/split/s
     ls -1 utr3/split > utr3/in.lst
     ls -1 utr5/split > utr5/in.lst
     cd utr3
     cat > gsub <<end
 #LOOP
 rnaFoldBig split/\$(path1) fold
 #ENDLOOP
 end
     cp gsub ../utr5
 
 # Do cluster run for 3' UTRs
     gensub2 in.lst single gsub spec
     para create spec
     para try
     para push
 # Completed: 3897 of 3897 jobs
 # CPU time in finished jobs:     227530s    3792.17m    63.20h    2.63d  0.007 y
 # IO & Wait Time:                 44046s     734.10m    12.23h    0.51d  0.001 y
 # Average job time:                  70s       1.16m     0.02h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            1337s      22.28m     0.37h    0.02d
 # Submission to last job:          1886s      31.43m     0.52h    0.02d
 
 # Do cluster run for 5' UTRs 
     cd ../utr5
     gensub2 in.lst single gsub spec
     para create spec
     para try
     para push
 # Completed: 3762 of 3762 jobs
 # CPU time in finished jobs:      42244s     704.07m    11.73h    0.49d  0.001 y
 # IO & Wait Time:                 10250s     170.83m     2.85h    0.12d  0.000 y
 # Average job time:                  14s       0.23m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            2014s      33.57m     0.56h    0.02d
 # Submission to last job:          2083s      34.72m     0.58h    0.02d
 
 # Load database
     ssh hgwdev
     cd /cluster/data/mm8/bed/rnaStruct/utr5
     hgLoadRnaFold mm8 foldUtr5 fold
     cd ../utr3
     hgLoadRnaFold mm8 foldUtr3 fold
 
 # Clean up
     rm -r split fold err batch.bak
     cd ../utr5
     rm -r split fold err batch.bak
 
 # Build KEGG pathway tables.  (DONE 3/8/06.  Fan)
    ssh hgwdev
    cd /cluster/store9/kg/kgMm8A
    # create and enter the work directory for the KEGG tables
    # (mkdir spelled out; "md" is not a standard command)
    mkdir kegg
    cd kegg
 
    ~/src/hg/protein/KGpath.sh kgMm8A mm8 060115
 
    hgsql mm8 -e "drop table keggMapDesc"
    hgsql mm8 -e "drop table keggPathway"
    hgsql mm8 <~/src/hg/lib/keggMapDesc.sql
    hgsql mm8 <~/src/hg/lib/keggPathway.sql
    hgsql mm8 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'
    hgsql mm8 -e 'load data local infile "keggPathway.tab" into table keggPathway'
   
 # Build CGAP pathway tables
 
    cd ..
    ~/src/hg/protein/KGcgap.sh kgMm8A mm8 060115
 
    cat cgapBIOCARTAdesc.tab |sort -u > cgapBIOCARTAdescSorted.tab
    hgsql mm8 -e "drop table cgapAlias"
    hgsql mm8 -e "drop table cgapBiocDesc"
    hgsql mm8 -e "drop table cgapBiocPathway"
    hgsql mm8 <~/src/hg/lib/cgapAlias.sql
    hgsql mm8 <~/src/hg/lib/cgapBiocDesc.sql
    hgsql mm8 <~/src/hg/lib/cgapBiocPathway.sql
 
    hgsql mm8 -e 'load data local infile "cgapAlias.tab" into table cgapAlias'
    hgsql mm8 -e 'load data local infile "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc'
    hgsql mm8 -e 'load data local infile "cgapBIOCARTA.tab" into table cgapBiocPathway'
 
 ####################################################################################
 
 # BUILD PROTEOME BROWSER TABLES FOR mm8 (DONE 3/8/06, Fan) 
 
 # These are instructions for building tables needed for the Proteome Browser. 
  
 # DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap table
 # ARE REBUILT.  
 # This build is based on proteins DBs dated 060115.
 
 # Create the working directory
 
     ssh hgwdev
     mkdir /cluster/store9/kg/kgMm8A/pb-2006-03-08
     cd /cluster/data/mm8/bed
     rm pb
     ln -s /cluster/store9/kg/kgMm8A/pb-2006-03-08 pb
     cd pb
 
 # Define pep* tables in mm8 DB
 
 	cat ~/kent/src/hg/lib/pep*.sql > pepAll.sql
 
 #  First edit out pepPred table definition, then
 
 	hgsql mm8 < pepAll.sql
 
 # Build the pepMwAa table
 
   hgsql proteins060115 -N -e \
 "select info.acc, molWeight, aaSize from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab
 
 hgsql mm8 -e 'load data local infile "pepMwAa.tab" into table pepMwAa'
 
 # Build the pepPi table
 
     # List the accessions of all taxon 10090 proteins.  -N suppresses the
     # column-name header so protAcc.lis holds accessions only, matching
     # the other hgsql extractions in this section.
     hgsql proteins060115 -N -e \
     "select info.acc from sp060115.info, sp060115.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.lis
 
     # Append the knownGene proteinIDs that contain a "-"
     hgsql mm8 -N -e 'select proteinID from knownGene where proteinID like "%-%"' | sort -u >> protAcc.lis
 
     pbCalPi protAcc.lis sp060115 pepPi.tab
     hgsql mm8 -e 'delete from pepPi'
     hgsql mm8 -e 'load data local infile "pepPi.tab" into table mm8.pepPi'
 
 # Calculate and load pep distributions
 
     pbCalDist sp060115 proteins060115 10090 mm8 >pbCalDist.out
     wc  pbCalDist.out
 
     hgsql mm8
     load data local infile "pepExonCntDist.tab" into table mm8.pepExonCntDist;
     load data local infile "pepCCntDist.tab" into table mm8.pepCCntDist;
     load data local infile "pepHydroDist.tab" into table mm8.pepHydroDist;
     load data local infile "pepMolWtDist.tab" into table mm8.pepMolWtDist;
     load data local infile "pepResDist.tab" into table mm8.pepResDist;
     load data local infile "pepIPCntDist.tab" into table mm8.pepIPCntDist;
     load data local infile "pepPiDist.tab" into table mm8.pepPiDist;
     quit
 
 # Calculate frequency distributions
 
     pbCalResStd sp060115 10090 mm8
 
 # Create pbAnomLimit and pbResAvgStd tables
 
    hgsql mm8 -e "drop table pbAnomLimit"
    hgsql mm8 -e "drop table pbResAvgStd"
    hgsql mm8 < ~/src/hg/lib/pbAnomLimit.sql
    hgsql mm8 < ~/src/hg/lib/pbResAvgStd.sql
 
    hgsql mm8 -e 'load data local infile "pbResAvgStd.tab" into table mm8.pbResAvgStd;'
    hgsql mm8 -e 'load data local infile "pbAnomLimit.tab" into table mm8.pbAnomLimit;'
 
 # Create pbStamp table for PB
   hgsql mm8 -e "drop table pbStamp"
   hgsql mm8 < ~/src/hg/lib/pbStamp.sql
   hgsql mm7 -N -e 'select * from pbStamp' > pbStamp.tab
   hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'
 
 # ENABLE PB FOR mm8 IN HGCENTRALTEST 
 
     echo " insert into gdbPdb values('mm8', 'proteins060115')" \
       | hgsql -h genome-testdb hgcentraltest
 
     echo "update dbDb set hgPbOk = 1 where name = 'mm8';" \
       | hgsql -h genome-testdb hgcentraltest
 
 # Adjust drawing parameters for Proteome Browser stamps
 
 #  Now invoke the Proteome Browser and adjust various drawing parameters
 #  (mostly the ymax of each stamp) if necessary, by updating the
 #  pbStamp.tab file and then deleting and reloading the pbStamp table.
 
   hgsql mm8 -e "drop table pbStamp"
   hgsql mm8 < ~/src/hg/lib/pbStamp.sql
   hgsql mm8 -e 'load data local infile "pbStamp.tab" into table mm8.pbStamp'
 
 # Perform preliminary review of Proteome Browser for mm8, then
 # notify QA for formal review.
 
 # BUILD MISC STUFF FOR KG
 
 # Build mrnaRefseq table
 
 # First make sure the entrez DB is updated. (recently updated on 2/8/06).
 
     ssh hgwdev
     cd /cluster/store9/kg/kgMm8A
 
     # Pair each mm8.all_mrna accession with the RefSeq accession(s) that
     # share its Entrez geneID.  The output name must be mrnaRefseq1.tab,
     # which is the file the cat below reads.
     hgsql entrez -N -e \
      'select mrna, refseq from entrezRefseq, entrezMrna, mm8.all_mrna where qName=mrna and    entrezRefseq.geneID=entrezMrna.geneID' \
       >mrnaRefseq1.tab
 
     # Every refGene name maps to itself
     hgsql mm8 -N -e 'select name, name from refGene' >mrnaRefseq2.tab
 
     # Merge both lists, dropping duplicate rows
     cat mrnaRefseq1.tab mrnaRefseq2.tab |sort -u >mrnaRefseq.tab
 
     hgsql mm8 -e 'drop table mrnaRefseq'
     hgsql mm8 < ~/src/hg/lib/mrnaRefseq.sql
     hgsql mm8 -e 'load data local infile "mrnaRefseq.tab" into table mrnaRefseq'
 
 # CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 3/8/06 Fan)
 # This depends on the go and uniProt databases as well as 
 # the kgAlias and kgProAlias tables.  The hgKgGetText takes
 # about 5 minutes when the database is not too busy.  The rest
 # is real quick.
      ssh hgwdev
      cd /cluster/store9/kg/kgMm8A
      mkdir index
      cd index
      hgKgGetText mm8 knownGene.text
      ixIxx knownGene.text knownGene.ix knownGene.ixx
      ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ix  /gbdb/mm8/knownGene.ix
      ln -s /cluster/store9/kg/kgMm8A/index/knownGene.ixx /gbdb/mm8/knownGene.ixx
 
 # BUILD KNOWN GENE LIST FOR GOOGLE.  
 # make knownGeneLists.html mm8GeneList.html mm5GeneList.html rm3GeneList.html
 
     cd /cluster/data/mm8/bed
     rm -rf knownGeneList/mm8
 
 # Run hgKnownGeneList to generate the tree of HTML pages
 # under ./knownGeneList/mm8
 
     hgKnownGeneList mm8
 
 # copy over to /usr/local/apache/htdocs
 
     rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
     mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
     cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8
 
 ##################################################################################
 # Create description.html for mm8
 
 mkdir -p ~/kent/src/hg/makeDb/trackDb/mouse/mm8
 cd ~/kent/src/hg/makeDb/trackDb/mouse/mm8
 cp ../hg17/description.html .
 
 vi description.html
 # Change release date and build number and change hg17 to mm8
 # Check it into CVS
 
 mkdir -p /cluster/data/mm8/html
 cp -p description.html /cluster/data/mm8/html
 
 ln -s /cluster/data/mm8/html/description.html /gbdb/mm8/html/description.html
 
 # BUILD GENE SORTER TABLES (AKA: FAMILY BROWSER) (STARTED 2006-03-08, DONE 2006-02-14 - Fan)
 #	This should be done after KG tables are complete from known genes build
 #	process.
 #
 # Cluster together various alt-splicing isoforms.
 #	Creates the knownIsoforms and knownCanonical tables
 ssh hgwdev
 mkdir /cluster/data/mm8/bed/geneSorter.2006-03-08
 # remove old symbolic link
 rm /cluster/data/mm8/bed/geneSorter
 ln -s /cluster/data/mm8/bed/geneSorter.2006-03-08 /cluster/data/mm8/bed/geneSorter
 cd /cluster/data/mm8/bed/geneSorter
 hgClusterGenes mm8 knownGene knownIsoforms knownCanonical
 
 # Extract peptides from knownGenes into fasta file
 # and create a blast database out of them.
 mkdir /cluster/data/mm8/bed/geneSorter/blastp
 cd /cluster/data/mm8/bed/geneSorter/blastp
 pepPredToFa mm8 knownGenePep known.faa
 #	You may need to build this binary in src/hg/near/pepPredToFa
 /scratch/blast/formatdb -i known.faa -t known -n known
 #	This command is in /projects/compbio/bin/$MACH/formatdb
 
 # Copy over database to bluearc
 rm -fr /cluster/bluearc/mm8/blastp
 mkdir -p /cluster/bluearc/mm8/blastp
 cp -p /cluster/data/mm8/bed/geneSorter/blastp/known.* /cluster/bluearc/mm8/blastp
 
 # Split up fasta file into bite sized chunks for cluster
 cd /cluster/data/mm8/bed/geneSorter/blastp
 mkdir split
 faSplit sequence known.faa 8000 split/kg
 
 # Make parasol run directory
 ssh pk
 mkdir /cluster/data/mm8/bed/geneSorter/blastp/self
 cd /cluster/data/mm8/bed/geneSorter/blastp/self
 mkdir run
 cd run
 mkdir out
 
 # Make blast script
 cat  << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data /cluster/bluearc/blast229/blastall \
 -p blastp -d /cluster/bluearc/mm8/blastp/known -i $1 -o $2 \
 -e 0.01 -m 8 -b 1000
 '_EOF_'
     # << keep emacs happy
 chmod +x blastSome
 
 # Make gensub2 file
 cat  << '_EOF_' > gsub
 #LOOP
 blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
 #ENDLOOP
 '_EOF_'
     # << keep emacs happy
 
 # Create parasol batch
 #	'ls ../../split/*.fa' is too much, hence the echo
 echo ../../split/*.fa | wordLine stdin > split.lst
 gensub2 split.lst single gsub jobList
 para create jobList
 para try
 para push
 para check
 # Completed: 7730 of 7730 jobs
 # CPU time in finished jobs:      35194s     586.56m     9.78h    0.41d  0.001 y
 # IO & Wait Time:                 29033s     483.89m     8.06h    0.34d  0.001 y
 # Average job time:                   8s       0.14m     0.00h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:              43s       0.72m     0.01h    0.00d
 # Submission to last job:           206s       3.43m     0.06h    0.00d
 
 # Load into database.  This takes about 20 minutes
 ssh hgwdev
 cd /cluster/data/mm8/bed/geneSorter/blastp/self/run/out
 bash
 time hgLoadBlastTab mm8 knownBlastTab *.tab
 # Scanning through 7730 files
 # Loading database with 5270545 rows
 # real    13m30.534s
 
 cd /cluster/data/mm8/bed/geneSorter
 # Create table that maps between known genes and RefSeq
 hgMapToGene mm8 refGene knownGene knownToRefSeq
 
 # Create table that maps between known genes and LocusLink
 hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm8 > refToLl.txt
 hgMapToGene mm8 refGene knownGene knownToLocusLink -lookup=refToLl.txt
 hgsql -e "select count(*) from knownToLocusLink;" mm8
 # 27636
 
 # Create table that maps between known genes and Pfam domains
 hgMapViaSwissProt mm8 knownGene name proteinID Pfam knownToPfam
 hgsql -e "select count(*) from knownToPfam;" mm8
 # 29479
 
 ############################################################################
 
 ### MAKE THE affyU74 TRACK - needed for the Gene Sorter (DONE
 #                              
 # MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
 # target sequences. Recalculate alignments and load data
 # ----------------------------------
 # Load up semi-local disk with target sequences for Affy mouse U74 chips.
 # ssh kkr1u00
 # mkdir -p /iscratch/i/affy
 #	This /projects filesystem is not available on kkr1u00
 #	but it is on kk
 # ssh kk
 # cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy
 
 ssh kkr1u00
 iSync
 
 # Run cluster job to do alignments
 ssh kk
 # create the dated work directory that the following cd (and the later
 # pslSort / load steps) use; the original mkdir path was truncated
 mkdir /cluster/data/mm8/bed/affyU74.2006-03-08
 cd /cluster/data/mm8/bed/affyU74.2006-03-08
 mkdir run
 cd run
 mkdir psl
 #echo /scratch/mus/mm8/maskedContigs/*.fa | wordLine stdin > genome.lst
 echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
 ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
 cat << '_EOF_' > gsub
 #LOOP
 /cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc $(path1) {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
 # << this line makes emacs coloring happy
 
 gensub2 genome.lst affy.lst gsub jobList
 para create jobList
 para try
 para check
 para push
 # Completed: 102 of 102 jobs
 # CPU time in finished jobs:       5846s      97.43m     1.62h    0.07d  0.000 y
 # IO & Wait Time:                   367s       6.12m     0.10h    0.00d  0.000 y
 # Average job time:                  61s       1.02m     0.02h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             189s       3.15m     0.05h    0.00d
 # Submission to last job:           200s       3.33m     0.06h    0.00d
 
 # Do sort, best in genome filter, and convert to chromosome coordinates
 # to create affyU74.psl.
 ssh kk
 cd /cluster/data/mm8/bed/affyU74.2006-03-08/run
 pslSort dirs raw.psl tmp psl
 
 # change filter parameters for these sequences. only use alignments that
 # cover 30% of sequence and have at least minAli = 0.95.
 # minAli = 0.97 too high. low minCover as a lot of n's in these sequences
 #pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
 pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl ../all_affyU74.psl /dev/null
 
 # Sort by chromosome and load into database.
 
 ssh hgwdev
 cd /cluster/data/mm8/bed/affyU74.2006-03-08
 pslSortAcc nohead chrom temp all_affyU74.psl
 cat chrom/*.psl > affyU74.psl
 
 # shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 # and reload data into table
 
 mv affyU74.psl affyU74.psl.orig
 
 cut -f 1-9 affyU74.psl.orig >j1.tmp
 cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
 cut -f 11-21 affyU74.psl.orig >j3.tmp
 paste j1.tmp j2.tmp j3.tmp >affyU74.psl
 
 hgLoadPsl mm8 affyU74.psl
 rm -rf chrom temp run
 
 ##   MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
 # Make bed files and load consensus sequences for Affy U74 chip set.
 
 #This needs to be done after affyU74 is already made.
 ssh hgwdev
 mkdir -p /cluster/data/mm8/bed/affyGnf.2006-03-08
 cd /cluster/data/mm8/bed/affyGnf.2006-03-08
 #	may need to build this command in src/hg/affyGnf
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
 	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
 	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
 ~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74.2006-03-08/affyU74.psl \
 	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
 	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
 
 # edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
 mkdir sav
 cp *.bed sav -p
 cat sav/affyGnfU74A.bed|sed -e "s/U74Av2://" >affyGnfU74A.bed
 cat sav/affyGnfU74B.bed|sed -e "s/U74Bv2://" >affyGnfU74B.bed
 cat sav/affyGnfU74C.bed|sed -e "s/U74Cv2://" >affyGnfU74C.bed
 
 # and reload data into table
 hgLoadBed -strict mm8 affyGnfU74A affyGnfU74A.bed
 hgLoadBed -strict mm8 affyGnfU74B affyGnfU74B.bed
 hgLoadBed -strict mm8 affyGnfU74C affyGnfU74C.bed
 
 # Add in sequence data for U74 tracks.
 # Copy consensus sequence to /gbdb if it isn't already
 # [THE SYM LINKS WERE ALREADY DONE.]
 #    mkdir -p /gbdb/hgFixed/affyProbes
     cd /gbdb/hgFixed/affyProbes
     # fix broken symlinks after directory structure changed
     # /projects/compbiodata ----> /projects/compbio/data
     rm U74*
     # make correct symlinks (hartera, 2005-05-03)
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
     ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
 
     # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
     # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
     # reload sequences with prefix removed so acc matches name used in
     # other dependent tables
                                                     
     hgLoadSeq -abbr=U74Av2: mm8 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
     hgLoadSeq -abbr=U74Bv2: mm8 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
     hgLoadSeq -abbr=U74Cv2: mm8 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
 
 ### GNF ATLAS 2  (DONE 3/9/06, Fan)
     # Align probes from GNF1M chip.
     ssh kk
     cd /cluster/data/mm8/bed
     mkdir -p geneAtlas2/run/psl
     cd geneAtlas2/run
 
     echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
 
     ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > mrna.lst
     echo '#LOOP\nblat -fine -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
     gensub2 genome.lst mrna.lst gsub spec
     para create spec
     para try
     para check
     para push
     para time
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:      53165s     886.08m    14.77h    0.62d  0.002 y
 # IO & Wait Time:                   241s       4.02m     0.07h    0.00d  0.000 y
 # Average job time:                1571s      26.18m     0.44h    0.02d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            3929s      65.48m     1.09h    0.05d
 # Submission to last job:          3929s      65.48m     1.09h    0.05d
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyGnf1m.psl.
     pslSort dirs raw.psl tmp psl
     pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl ../affyGnf1m.psl /dev/null
 
     #rm -r contig.psl raw.psl psl
 
     # Load probes and alignments from GNF1M into database.
     ssh hgwdev
     cd /cluster/data/mm8/bed/geneAtlas2
 #    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
     hgLoadPsl mm8 affyGnf1m.psl
     hgLoadSeq mm8 /gbdb/hgFixed/affyProbes/gnf1m.fa
 
     # Load up track
     hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
     	affyGnf1m.psl
     # Note that the unmapped 5000 records are from all-N sequences.
     hgLoadBed -strict mm8 gnfAtlas2 gnfAtlas2.bed
 
 # MOUSE AFFYMETRIX MOE430 TRACK (DONE  Fan  2006-03-09)
 #    mkdir -p /projects/compbio/data/microarray/affyMouse
     # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
     # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
 #    unzip MOE430*_consensus.zip
 
     # check for duplicate probes: there are none, all have unique names
     # check for duplicate probes: 100 from 136745_at to 1367551_a_at
     # remove "consensus:" and ";" from FASTA headers to shorten probeset
     # names for database
 
 #    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
 #    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
  
 #    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
 #       /cluster/bluearc/affy/
 
     # (THE ABOVE WAS ALREADY DONE)
 
     # Set up cluster job to align MOE430 consensus sequences to mm8
     ssh kkr1u00
     cd /cluster/data/mm8/bed
     mkdir -p affyMOE430
     cd affyMOE430
 #    mkdir -p /iscratch/i/affy
 #    cp /cluster/bluearc/affy/MOE430_all.fa /iscratch/i/affy
 #    iSync
 
     ssh kk
     cd /cluster/data/mm8/bed/affyMOE430
     ls -1 /iscratch/i/affy/MOE430_all.fa > affy.lst
     echo /scratch/hg/mm8/nib/*.nib | wordLine stdin > genome.lst
 
     echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/mouse11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > template.sub
     gensub2 genome.lst affy.lst template.sub para.spec
     mkdir psl
     para create para.spec
     # Do the job with usual para try/check/push/time etc.
 # Completed: 34 of 34 jobs
 # CPU time in finished jobs:       9196s     153.26m     2.55h    0.11d  0.000 y
 # IO & Wait Time:                   362s       6.04m     0.10h    0.00d  0.000 y
 # Average job time:                 281s       4.69m     0.08h    0.00d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:             639s      10.65m     0.18h    0.01d
 # Submission to last job:           639s      10.65m     0.18h    0.01d
 
     # Do sort, best in genome filter, and convert to chromosome coordinates
     # to create affyMOE430.psl
     pslSort dirs raw.psl tmp psl
 
     # only use alignments that cover 30% of sequence and have at least
     # 95% identity in aligned region. 
     # low minCover as a lot of n's in these sequences
     pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl affyMOE430.psl /dev/null
 
     # Load alignments and sequences into database
     ssh hgwdev
     cd /cluster/data/mm8/bed/affyMOE430
     # shorten names in psl file
     sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
     mv affyMOE430.psl.bak affyMOE430.psl
 
     # load track into database
 
     hgLoadPsl mm8 affyMOE430.psl
  
     # Add consensus sequences for MOE430
     # Copy sequences to gbdb if they are not there already
 #    mkdir -p /gbdb/hgFixed/affyProbes
 #    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
 #       /gbdb/hgFixed/affyProbes
 
     hgLoadSeq -abbr=MOE430 mm8 /gbdb/hgFixed/affyProbes/MOE430_all.fa
     
     # Clean up
 #    rm batch.bak contig.psl raw.psl 
     
     # BELOW TWO THINGS WERE DONE BY RACHEL ALREADY FOR MM4
     # add entry to trackDb.ra in ~kent/src/hg/makeDb/trackDb/mouse/
     # add affyMOE430.html file and then do make alpha to add to trackDb table
 
 # Create known gene mapping table and expression distance tables
 # for GNF Atlas 2.  (The hgExpDistance takes an hour.)
 
 hgMapToGene mm8 affyGnf1m knownGene knownToGnf1m
 hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
 	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
 Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
 Got 22937 unique elements in hgFixed.gnfMouseAtlas2MedianRatio
 
 # Create table that maps between known genes and RefSeq
 hgMapToGene mm8 refGene knownGene knownToRefSeq
 #	may need to build this command in src/hg/near/hgMapToGene
 
 # Create a table that maps between known genes and 
 # the nice affy expression data.
 hgMapToGene mm8 affyU74  knownGene knownToU74
 hgMapToGene mm8 affyMOE430 knownGene knownToMOE430
 hgMapToGene mm8 affyMOE430 -prefix=A: knownGene knownToMOE430A
 
 # Format and load Rinn et al sex expression data
 mkdir /cluster/data/mm8/bed/rinnSex
 cd /cluster/data/mm8/bed/rinnSex
 hgMapMicroarray rinnSex.bed hgFixed.mouseRinnSexMedianRatio \
 ../affyMOE430/affyMOE430.psl
 hgLoadBed mm8 rinnSex rinnSex.bed
 
 # Format and load the GNF data
 mkdir /cluster/data/mm8/bed/affyGnf95
 cd /cluster/data/mm8/bed/affyGnf95
 ~/src/hg/affyGnf/affyPslAndAtlasToBed -newType ../affyU95.psl \
 /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
 affyGnfU95.tab affyGnfU95Exps.tab -shortOut
 
 #	this .sql load was in preceding instructions, but this .sql file
 #	appears to not exist and it doesn't seem to be needed anyway.
 #	Everything below this seems to create tables OK.
 #  hgsql mm8 < ~/kent/src/hg/affyGnf/affyGnfU95.sql
 
 # Create table that gives distance in expression space between 
 # GNF genes.  These commands take about 15 minutes each
 #	The affyGnfU74?Exps arguments appear to be unused in 
 # hgExpDistance
 cd /cluster/data/mm8/bed/geneSorter
 hgExpDistance mm8 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
 hgExpDistance mm8 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
 hgExpDistance mm8 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74
 
 # Create table to map between known genes and GNF Atlas2
 # expression data.
     hgMapToGene mm8 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
 #	hgsql -e "select count(*) from knownToGnfAtlas2;" mm8
 #	row count changed to 22978
 
 # Create expression distance table - takes about an hour
     hgExpDistance mm8 hgFixed.gnfMouseAtlas2MedianRatio \
     	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance \
 	-lookup=knownToGnfAtlas2 &
 #	hgsql -e "select count(*) from gnfAtlas2Distance;" mm8
 #	row count changed to 22937000 
 
 # HGNEAR PROTEIN BLAST TABLES (DONE 3/14/06 Fan)
 
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/hgNearBlastp
     cd /cluster/data/mm8/bed/hgNearBlastp
     cat << _EOF_ > config.ra
 # Latest mouse vs. other Gene Sorter orgs:
 # human, rat, zebrafish, worm, yeast, fly
 
 targetGenesetPrefix mouse
 targetDb mm8
 queryDbs hg18 rn4 danRer3 ce2 sacCer1 dm2
 
 mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa 
 hg18Fa /cluster/data/hg18/bed/geneSorter/blastp/known.faa
 rn4Fa /cluster/data/rn4/bed/blastp/known.faa
 danRer3Fa /cluster/data/danRer3/bed/blastp/ensembl.faa
 ce2Fa /cluster/data/ce2/bed/blastp/wormPep154.faa
 sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
 dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa
 
 buildDir /cluster/data/mm8/bed/hgNearBlastp
 scratchDir /san/sanvol1/scratch/mm8HgNearBlastp
 _EOF_
 
     doHgNearBlastp.pl config.ra >do.log 
 
 # output was like this:
 ...
 Scanning through 671 files^M
 Loading database with 14470 rows^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/mm8.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/hg18.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/rn4.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/danRer3.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/ce2.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/sacCer1.formatdb^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.split^M
 # ssh -x pk rm -rf /san/sanvol1/scratch/mm8HgNearBlastp/dm2.formatdb^M
 # ssh -x pk rmdir /san/sanvol1/scratch/mm8HgNearBlastp^M
 ^M
  *** All done!^M
  *** Check these tables in mm8:^M
  *** mouseBlastTab hgBlastTab rnBlastTab drBlastTab ceBlastTab scBlastTab dmBlastTab ^M
  *** and mmBlastTab in these databases:^M
  *** hg18 rn4 danRer3 ce2 sacCer1 dm2 ^M
 
 # MAKE ORGANISM-SPECIFIC HGNEARDATA FILES 
     cd ~/kent/src/hg/near/hgNear/hgNearData
     mkdir -p Mouse/mm8
     cd Mouse/mm8
     cp ../mm7/otherOrgs.ra .
 # Edit otherOrgs.ra to reflect the latest genomes used in blastp jobs
     vi otherOrgs.ra
 # then check it into CVS.
 
 # ENABLE HGNEAR FOR mm8 IN HGCENTRALTEST
     echo "update dbDb set hgNearOk = 1 where name = 'mm8';" \
       | hgsql -h genome-testdb hgcentraltest
 
 # END OF HGNEAR STUFF
 
 #########################################################################
 # BLASTZ panTro2 after chr9 re-masked (DONE - 2006-03-30 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzPanTro2.2006-03-28
     cd /cluster/data/mm8/bed
     rm blastz.panTro2
     ln -s blastzPanTro2.2006-03-28 blastz.panTro2
     cd blastz.panTro2
 
     cat << '_EOF_' > DEF
 # mouse vs chimp
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chimp PanTro2
 SEQ2_DIR=/scratch/hg/panTro2/nib
 SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
 SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-28
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	broken during blastz run due to Panasas failure
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cat `pwd`/DEF > cat.out 2>&1 &
     
     #	Do not have this measurement for the first time around, tables
     #	got loaded again before I thought of that.
     time nice -n +19 featureBits mm8 chainPanTro2Link \
 	> fb.mm8.chainPanTro2Link
     #	963977790 bases of 2567283971 (37.549%) in intersection
 
     #	For panTro1 this was:
     time nice -n +19 featureBits mm8 chainPanTro1Link \
 	> fb.mm8.chainPanTro1Link
     #	901276629 bases of 2567283971 (35.106%) in intersection
 
     ssh pk
     mv /cluster/data/panTro2/bed/blastz.mm8.swap \
 	/cluster/data/panTro2/bed/blastz.mm8.swap.2006-03-21
     mkdir /cluster/data/panTro2/bed/blastz.mm8.swap
     cd /cluster/data/panTro2/bed/blastz.mm8.swap
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	/cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
 	> blastz.out 2>&1 &
     #	completed the downloads manually since they failed due to the
     #	existing downloads.  Then cleanup:
     ssh hgwbeta
     cd /cluster/data/panTro2/bed/blastz.mm8.swap
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=cleanup /cluster/data/mm8/bed/blastzPanTro2.2006-03-28/DEF \
 	> cleanup.out 2>&1 &
 
     time nice -n +19 featureBits panTro2 chainMm8Link \
 	> fb.panTro2.chainMm8Link 2>&1 &
     #	978002566 bases of 2909512873 (33.614%) in intersection
     # first time before the chr9 fix was:
     #	986978326 bases of 2909512873 (33.922%) in intersection
 
 #########################################################################
 # BLASTZ panTro2 (DONE - 2006-03-15 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.panTro2.2006-02-23
     cd /cluster/data/mm8/bed
     ln -s blastz.panTro2.2006-02-23 blastz.panTro2
     cd blastz.panTro2
 
     cat << '_EOF_' > DEF
 # mouse vs chimp
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_SMSK=/cluster/bluearc/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Chimp PanTro2
 SEQ2_DIR=/scratch/hg/panTro2/nib
 SEQ2_LEN=/scratch/hg/panTro2/chrom.sizes
 SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzPanTro2.2006-03-15
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	broken during chain step due to missing files on the Iservers
     #	completed chain run manually, then continuing
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-continue=chainMerge `pwd`/DEF > chainMerge.out 2>&1 &
     #	broken during loadUp due to script bug, ran loadUp.csh manually
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap `pwd`/DEF > swap.out 2>&1 &
     
     #	mistakenly did PanTro1 here ...  should have been PanTro2
     time nice -n +19 featureBits mm8 chainPanTro1Link
     #	901276629 bases of 2567283971 (35.106%) in intersection
     time nice -n +19 featureBits panTro2 chainMm8Link \
 	> fb.panTro2.chainMm8Link 2>&1
     #	986978326 bases of 2909512873 (33.922%) in intersection
 
 
 #############################################################################
 # UPDATED mm8.knownToVisiGene (2006-03-15 galt)
 ssh hgwdev
 knownToVisiGene mm8
 
 #############################################################################
 # BLASTZ SELF (DONE - 2006-03-20 - 2006-03-22 - Hiram)
 #	using chain min score of 10,000 to cut down on volume of data
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzSelf.2006-03-20
     cd /cluster/data/mm8/bed
     ln -s blastzSelf.2006-03-20 blastz.mm8
     cd blastzSelf.2006-03-20
 
     cat << '_EOF_' > DEF
 # mouse vs mouse
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_H=2000
 BLASTZ_M=200
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse Mm8
 SEQ2_DIR=/scratch/hg/mm8/nib
 SEQ2_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzSelf.2006-03-20
 TMPDIR=/scratch/tmp
 '_EOF_'
     #	<< happy emacs
 
     cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
     time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
 	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	broke during the load step due to doBlastz script changes,
     #	finished the load manually, then:
     time /cluster/home/hiram/kent/src/utils/doBlastzChainNet.pl -verbose=2 \
 	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=pk \
 	-continue=download `pwd`/DEF > download.out 2>&1 &
 
     ssh kolossus
     cd /cluster/data/mm8/bed/blastzSelf.2006-03-20
     time HGDB_CONF=~/.hg.conf.read-only featureBits mm8 \
 	chainSelfLink >fb.mm8.chainSelfLink 2>&1
     cat fb.mm8.chainSelfLink
     #	362483673 bases of 2567283971 (14.119%) in intersection
 
 #############################################################################
 # UPDATED mm8.knownToVisiGene (2006-04-05 galt)
 ssh hgwdev
 knownToVisiGene mm8
 
 
 ############################################################################
 # LIFTOVER (DROPUNDER) CHAINS TO MM7 (2006-04-06 kate)
 
     # Split (using makeLoChain-split) of mm7 is doc'ed in makeMm7.doc
     # Do what makeLoChain-split says to do next (start blat alignment)
     ssh kk
     cd /cluster/data/mm8/bed/liftOver
     makeLoChain-align mm8 /scratch/hg/mm8/nib mm7 \
         /iscratch/i/mm7/split10k \
         /cluster/bluearc/mm7/11.ooc >&! align.log &
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/run
     para shove
     para time >&! run.time
 #CPU time in finished jobs:     906023s   15100.39m   251.67h   10.49d  0.029 y
 #IO & Wait Time:                 22074s     367.90m     6.13h    0.26d  0.001 y
 #Average job time:                 343s       5.72m     0.10h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:            4260s      71.00m     1.18h    0.05d
 #Submission to last job:          4965s      82.75m     1.38h    0.06d
 
     # lift alignments
     ssh kkr1u00
     cd /cluster/data/mm8/bed/liftOver
     makeLoChain-lift mm8 mm7 >&! lift.log &
 
     # chain alignments
     ssh kki
     cd /cluster/data/mm8/bed/liftOver
     makeLoChain-chain mm8 /scratch/hg/mm8/nib \
                 mm7 /scratch/hg/mm7/nib >&! chain.log &
     # Do what its output says to do next (start cluster job)
     cd /cluster/data/mm8/bed/blat.mm7.2006-04-06/chainRun
     para shove
     para time >&! run.time
 #CPU time in finished jobs:       3884s      64.73m     1.08h    0.04d  0.000 y
 #IO & Wait Time:                   594s       9.91m     0.17h    0.01d  0.000 y
 #Average job time:                  86s       1.44m     0.02h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:             245s       4.08m     0.07h    0.00d
 #Submission to last job:           401s       6.68m     0.11h    0.00d
 
     # net alignment chains
     ssh kkstore03
     cd /cluster/data/mm8/bed/liftOver
     makeLoChain-net mm8 mm7 >&! net.log &
 
     # load reference to over.chain into database table, 
     # and create symlinks in /gbdb and the download area
     ssh hgwdev
     cd /cluster/data/mm8/bed/liftOver
     makeLoChain-load mm8 mm7 >&! load.log &
 
     # test by converting a region using the "convert" link on
     # the browser, and comparing to blat of the same region
 
 #############################################################################
 # Create Allen Brain Atlas mapping. (DONE 2006-04-12 galt)
 
 # compile allenCollectSeq
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq
     make
 
 # Set up directory
     ssh kk
     cd /cluster/data/mm8/bed
     mkdir allenBrain
     cd allenBrain
 
 # In /san/sanvol1/visiGene/offline/allenBrain/probesAndData/
 #  allen20051021.tab (converted from spreadsheet mailed by Susan Sunkin <SusanS@alleninstitute.org>)
 #  probeSeq.20051027.fasta (also from Susan).
 
 # Create a list of probe sequences, filling in ones missing from
 # probeSeq.20051027.fasta with some NCBI and TIGR files, and some
 # downloaded one at a time.
      allenCollectSeq /san/sanvol1/visiGene/offline/allenBrain/probesAndData/allen20051021.tab /san/sanvol1/visiGene/offline/allenBrain/probesAndData/probeSeq.20051027.fasta /cluster/data/mm7/bed/ncbiXm/ncbiNm.fa /cluster/data/mm7/bed/ncbiXm/ncbiXm.fa /cluster/data/mm6/bed/tigrMgiTc/tigrMgiTc.fa ~/kent/src/hg/makeDb/allenBrain/allenCollectSeq/extra.fa allProbes.fa allProbes.tab missing.tab allenBrainUrl.tab
     
 # Set up a blat run to align the probes.
     mkdir split
     faSplit sequence allProbes.fa 200 split/rp
     mkdir run
     cd run
     ls -1 ../split/*.fa > mrna.lst
     ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
     mkdir psl
     cat << '_EOF_' > gsub
 #LOOP
 blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     gensub2 genome.lst mrna.lst gsub spec
     para create spec
 # Then do the usual para try/push/time/check until the run is finished
 
 # Then do sorting and near-best-in-genome step on file server
     ssh kkstore02
     cd /cluster/data/mm8/bed/allenBrain/run
     pslSort dirs raw.psl tmp psl
     pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
     sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl
 
 # Clean up big files no longer needed
    rm raw.psl
    rm -r psl
    rm -r ../split
 
 # Load up database
    ssh hgwdev
    cd /cluster/data/mm8/bed/allenBrain
 
 # Make a new table that contains the URLs for the allen brain genes
 # Make this one first since all.joiner considers it the master table.
    hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
    hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'
 
 # Make probe alignment table, and load sequence.
    hgLoadPsl mm8 allenBrainAli.psl
    mkdir /gbdb/mm8/allenBrain
    ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
    hgLoadSeq mm8 /gbdb/mm8/allenBrain/allProbes.fa
 
 # Make mapping between known genes and allenBrain	
    hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain 
 
 #########################################################################
 # BLASTZ HUMAN Hg17 (DONE - 2006-04-13 - 2006-04-19 - Hiram)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastzHg17.2006-04-13
     cd /cluster/data/mm8/bed
     ln -s blastzHg17.2006-04-13 blastz.hg17
     cd blastzHg17.2006-04-13
 
     cat << '_EOF_' > DEF
 # mouse vs human
 export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin
 
 BLASTZ=blastz.v7.x86_64
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_SMSK=/scratch/hg/mm8/linSpecRep/notInOthers
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Human Hg17 - single chunk big enough to run each chrom by itself
 SEQ2_DIR=/scratch/hg/hg17/bothMaskedNibs
 SEQ2_SMSK=/scratch/hg/hg17/linSpecRep.notInMouse
 SEQ2_LEN=/cluster/data/hg17/chrom.sizes
 SEQ2_CHUNK=300000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastzHg17.2006-04-13
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     #	establish a screen to control this job
     screen
     cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	real    656m20.633s
 
     #	Then to swap over to Hg17
     mkdir /cluster/data/hg17/bed/blastz.mm8.swap
     cd /cluster/data/hg17/bed
     ln -s blastz.mm8.swap blastz.mm8
     cd blastz.mm8.swap
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
 	-swap /cluster/data/mm8/bed/blastzHg17.2006-04-13/DEF \
 	> swap.out 2>&1 &
 
     ssh hgwdev
     time nice -n +19 featureBits mm8 chainHg17Link
     #	984380268 bases of 2567283971 (38.343%) in intersection
     time nice -n +19 featureBits hg17 chainMm8Link
     #	994530172 bases of 2881515245 (34.514%) in intersection
     cd /cluster/data/mm8/bed/blastzHg17.2006-04-13
     time nice -n +19 featureBits mm8 chainHg17Link > fb.mm8.chainHg17Link 2>&1
     #	990554882 bases of 2567283971 (38.584%) in intersection
     time nice -n +19 featureBits hg17 chainMm8Link > fb.hg17.chainMm8Link 2>&1
     #	997368618 bases of 2866216770 (34.797%) in intersection
 
 ########################################################################
 # BLASTZ/CHAIN/NET XENTRO2 (DONE - 2006-04-20 - Hiram)
     ssh kk
     mkdir /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
     cd /cluster/data/mm8/bed
     ln -s blastz.xenTro2.2006-04-20 blastz.xenTro2
     cd blastz.xenTro2.2006-04-20
 
     cat << '_EOF_' > DEF
 # mouse vs. frog
 BLASTZ=/cluster/bin/penn/blastz.v7
 
 # Use same params as used for mammal-xenTro1 (see makeXenTro1.doc)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/cluster/data/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Frog xenTro2 - single chunk big enough to run two of the
 #               largest scaffolds in one job
 SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
 SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LAP=0
 SEQ2_LIMIT=100
 
 BASE=/cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
 '_EOF_'
     # << emacs
 
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	`pwd`/DEF > blastz.out 2>&1 &
     #	 XXX running 2006-04-20
 
 
     #	Then to swap over to xenTro2
     mkdir /cluster/data/xenTro2/bed/blastz.mm8.swap
     cd /cluster/data/xenTro2/bed
     ln -s blastz.mm8.swap blastz.mm8
     cd blastz.mm8.swap
     time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
 	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20/DEF \
 	> swap.out 2>&1 &
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/blastz.xenTro2.2006-04-20
     time nice -n +19 featureBits mm8 chainXenTro2Link \
 	> fb.mm8.chainXenTro2Link 2>&1 &
     #	68050843 bases of 2567283971 (2.651%) in intersection
     cd /cluster/data/xenTro2/bed/blastz.mm8.swap
     time nice -n +19 featureBits xenTro2 chainMm8Link \
 	> fb.xenTro2.chainMm8Link 2>&1
     #	72840135 bases of 1359412157 (5.358%) in intersection
 
 #######################################################################
 ## LIFTOVER To Mm7 (DONE - 2006-04-21 - 2006-04-24 - Hiram)
     ssh kkr1u00
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
       mm7 /cluster/data/mm7/nib
     # as it says, DO THIS NEXT:
     ssh kk
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
 	mm8 /scratch/hg/mm8/nib mm7 /iscratch/i/mm7/split10k \
 	/cluster/data/mm7/11.ooc
     # as it says, DO THIS NEXT:
     cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/run
     para try, check, push, check, ...
 # Completed: 1360 of 1360 jobs
 # CPU time in finished jobs:    3890058s   64834.31m  1080.57h   45.02d  0.123 y
 # IO & Wait Time:                 13326s     222.09m     3.70h    0.15d  0.000 y
 # Average job time:                2870s      47.84m     0.80h    0.03d
 # Longest finished job:           27224s     453.73m     7.56h    0.32d
 # Submission to last job:         80553s    1342.55m    22.38h    0.93d
 
     # as it says, DO THIS NEXT:
     #	this does the liftUp and makes the psl files
     ssh kkr1u00
     cd /cluster/data/mm8/bed
     ln -s blat.mm7.2006-04-21 blat.mm7
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh mm8 mm7
     #	real    16m5.091s
     # as it says, DO THIS NEXT:
     #	this prepares the batch to run for the chaining
     ssh kki
     time $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
 	mm8 /cluster/data/mm8/nib mm7 /cluster/data/mm7/nib
     # as it says, DO THIS NEXT:
     #	running the chain batch
     cd /cluster/data/mm8/bed/blat.mm7.2006-04-21/chainRun
     para try, check, push, check, ...
 # Completed: 40 of 40 jobs
 # CPU time in finished jobs:       5381s      89.68m     1.49h    0.06d  0.000 y
 # IO & Wait Time:                  2119s      35.32m     0.59h    0.02d  0.000 y
 # Average job time:                 188s       3.12m     0.05h    0.00d
 # Longest finished job:             652s      10.87m     0.18h    0.01d
 # Submission to last job:           685s      11.42m     0.19h    0.01d
     ssh kkstore04
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh mm8 mm7
     #	Created /cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz
     # as it says, DO THIS NEXT:
     ssh hgwdev
     $HOME/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh mm8 mm7
     #	It says this:
     # 	Now, add link for
     #	/usr/local/apache/htdocs/goldenPath/mm8/liftOver/mm8ToMm7.over.chain
     #	to hgLiftOver
     #	But I believe that link was already done:
     cd /gbdb/mm8/liftOver
     ls -og mm8ToMm7*
     #	lrwxrwxrwx    1       53 Apr 24 12:32 mm8ToMm7.over.chain.gz -> \
     #		/cluster/data/mm8/bed/liftOver/mm8ToMm7.over.chain.gz
 
 ########################################################################
 ##  CYTOBAND - ideogram track (DONE - 2006-04-28 - Hiram)
     ssh hgwdev
     cd /cluster/data/mm8/pre_release
     #	The .wgetrc is the anonymous user
     WGETRC=`pwd`/.wgetrc
     export WGETRC
     wget --timestamping \
 	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release/ideogram
 
     mkdir /cluster/data/mm8/cytoBand
     cd /cluster/data/mm8/cytoBand
 
     # Create bed file
     $HOME/kent/src/utils/createNcbiCytoBand.pl \
 	/cluster/data/mm8/pre_release/ideogram
     # Load the bed file
     hgLoadBed -strict -noBin \
 	-sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql mm8 \
 	cytoBand cytoBand.bed
     # Make cytoBandIdeo track for ideogram gif on hgTracks page.
     # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
     hgsql -e "drop table cytoBandIdeo;" mm8
     hgsql mm8 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"
 
 
 #########################################################################
 # GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
     ssh kkstore04
     #	Create a 2bit file with the full chrom sequences and the
     #	random contigs, all hard masked
     cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
 	| maskOutFa stdin hard stdout \
 	    | faToTwoBit stdin mm8Chroms_RandomContigs.hard.2bit
     #	make sure it still has all the unmasked sequence in it:
     twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
 	| faSize stdin
     #	2661205088 bases (1183272085 N's 1477933003 real 1477933003
     #	upper 0 lower) in 99 sequences in 1 files
     twoBitToFa mm8.2bit stdout | faSize stdin
     #	2664455088 bases (97171400 N's 2567283688 real 1477933003 upper
     #	1089350685 lower) in 34 sequences in 1 files
     #	note the 'real' bases are the same, the lowers have become N's
     #	1089350685 + 97171400 = 1186522085 
     #	1186522085 - 1183272085  = 3250000 == N's in gaps between contigs
 
     #	And, make sure there aren't any sequences in this lot that have
     #	become all N's with no sequence left in them:
     twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
 	| faCount stdin > chroms_randoms.faCount
     #	the lowest three are:
     egrep -v "^#|^total" chroms_randoms.faCount \
 	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -3
     #	MmUn_162590_36 1631
     #	Mm1_163269_36 1581
     #	MmUn_102813_36 1479
 
     #	creating 4,000,000 sized chunks, the chroms stay together as
     #	single pieces.  The contigs get grouped together into 4,000,000
     #	sized fasta files.  You don't want to break these things up
     #	because genscan will be doing its own internal 2.4 million
     #	window on these pieces, and the gene names are going to be
     #	constructed from the sequence name in these fasta files.  The
     #	gene names are much better when they are this simple chrN.M
     #	numbering scheme, or in the case of a contig: contig_name.M
     #	where the M is a sequence number that genscan will assign to
     #	each gene it discovers.
     mkdir hardChunks
     twoBitToFa mm8Chroms_RandomContigs.hard.2bit stdout \
 	| faSplit about stdin 4000000 hardChunks/c_
     rsync -a --progress hardChunks/ /cluster/bluearc/mm8/hardChunks/
 
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/genscan
     cd /cluster/data/mm8/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
 
     # Run on small cluster (more mem than big cluster).
     ssh kki
     cd /cluster/data/mm8/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     #	Since we split on gaps, we have no chunks like that.  You can
     #	verify with faCount on the chunks.
     ls -1S /cluster/bluearc/mm8/hardChunks/c_*.fa > genome.list
 
     # Create template file, template, for gensub2.  For example (3-line file):
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << emacs
     gensub2 genome.list single template jobList
     para create jobList
     para try, check, push, check, ...
 # Completed: 673 of 673 jobs
 # CPU time in finished jobs:      76339s    1272.32m    21.21h    0.88d  0.002 y
 # IO & Wait Time:                  2327s      38.78m     0.65h    0.03d  0.000 y
 # Average job time:                 117s       1.95m     0.03h    0.00d
 # Longest finished job:            1993s      33.22m     0.55h    0.02d
 # Submission to last job:          7526s     125.43m     2.09h    0.09d
 
     #	There was a failed job, going to kolossus and running with a
     #	reduced window size:
     ssh kolossus
     cd /cluster/data/mm8/bed/genscan
     time /cluster/bin/x86_64/gsBig /cluster/bluearc/mm8/hardChunks/c_01.fa \
         gtf/c_01.gtf -trans=pep/c_01.pep -subopt=subopt/c_01.bed \
         -exe=hg3rdParty/genscanlinux/genscan \
         -par=hg3rdParty/genscanlinux/HumanIso.smat \
 	-tmp=/scratch/tmp -window=2000000
     #	real    258m34.800s
 
     # cat and lift the results into single files
     ssh kkstore04
     cd /cluster/data/mm8/bed/genscan
     cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
 	../../jkStuff/liftAll.lft carry stdin
     cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
 	../../jkStuff/liftAll.lft carry stdin
     cat pep/c_*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/mm8/bed/genscan
     ldHgGene mm8 -gtf genscan genscan.gtf
     #	Read 44899 transcripts in 323099 lines in 1 files
     #	44899 groups 34 seqs 1 sources 1 feature types
     #	44899 gene predictions
 
     hgPepPred mm8 generic genscanPep genscan.pep
     hgLoadBed -strict mm8 genscanSubopt genscanSubopt.bed
     #	Loaded 530201 elements of size 6
 
     #	check the numbers
     time nice -n +19 featureBits mm8 genscan
     #	54455852 bases of 2567283971 (2.121%) in intersection
     time nice -n +19 featureBits mm8 knownGene:cds
     #	28459053 bases of 2567283971 (1.109%) in intersection
     featureBits mm7 genscan
     #	54864694 bases of 2583394090 (2.124%) in intersection
     time nice -n +19 featureBits mm7 knownGene:cds
     #	27531524 bases of 2583394090 (1.066%) in intersection
     featureBits mm6 genscan
     #	54894283 bases of 2597150411 (2.114%) in intersection
     featureBits mm5 genscan
     #	55024722 bases of 2615483787 (2.104%) in intersection
     featureBits mm4 genscan
     #	56164126 bases of 2627444668 (2.138%) in intersection
     featureBits mm3 genscan
     #	51697165 bases of 2505900260 (2.063%) in intersection
 
     featureBits mm8 genscanSubopt
     #	57048581 bases of 2567283971 (2.222%) in intersection
     featureBits mm7 genscanSubopt
     #	57512333 bases of 2583394090 (2.226%) in intersection
     featureBits mm6 genscanSubopt
     #	57856316 bases of 2597150411 (2.228%) in intersection
     featureBits mm5 genscanSubopt
     #	58474899 bases of 2615483787 (2.236%) in intersection
     featureBits mm4 genscanSubopt
     #	59601009 bases of 2627444668 (2.268%) in intersection
     featureBits mm3 genscanSubopt
     #	56085184 bases of 2505900260 (2.238%) in intersection
 
 ##########################################################################
 # BUILD NIBB IMAGE PROBES (in progress 2007-05-05 Jim)
 
 # Make directory on san for cluster job and copy in sequence
     ssh pk
     mkdir /san/sanvol1/scratch/mm8/nibbPics
     cd /san/sanvol1/scratch/mm8/nibbPics
     cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
 
 # Make parasol job dir and sequence list files
     mkdir run
     cd run
     mkdir psl
     ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
     echo ../nibbImageProbes.fa > mrna.lst
 
 # Create parasol gensub file
 cat << '_EOF_' > gsub
 #LOOP
 blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
 #ENDLOOP
 '_EOF_'
 
 # Create parasol batch
     gensub2 genome.lst mrna.lst gsub spec
     para create spec
 
 # Do para try/push/time etc.
 #Completed: 49 of 49 jobs
 #CPU time in finished jobs:      12585s     209.74m     3.50h    0.15d  0.000 y
 #IO & Wait Time:                   411s       6.86m     0.11h    0.00d  0.000 y
 #Average job time:                 265s       4.42m     0.07h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:            1145s      19.08m     0.32h    0.01d
 #Submission to last job:          1195s      19.92m     0.33h    0.01d
 
 
 # Sort the raw psl output, filter it with pslReps, re-sort, then strip the
 # nib directory prefix and the ".nib" suffix so only chrN names remain.
 # The dot in the last sed is escaped so that only a literal ".nib" is
 # removed (an unescaped "." would match any character before "nib").
     catDir psl | sort -k 10 \
         | pslReps stdin stdout /dev/null -nohead -minAli=0.60 -nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
 	| sort -k 14,14 -k 16,16n \
 	| sed 's#/scratch/hg/mm8/nib/chr#chr#' \
 	| sed 's/\.nib//' > ../nibbImageProbes.psl
 
 # Make bed file and copy in stuff
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir nibbPics
     cd nibbPics
     cp /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
     cp /san/sanvol1/scratch/mm8/nibbPics/nibbImageProbes.psl .
 
 # Load into database
     ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa /gbdb/mm8/nibbImageProbes.fa
     hgLoadSeq mm8 /gbdb/mm8/nibbImageProbes.fa
     hgLoadPsl mm8 nibbImageProbes.psl
 
 
 #############################################################################
 #  miRNA track (DONE - 2006-05-22 - Fan)
     #   data from: Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir miRNA-2006-05-22
     # enter the directory just created (name must match the mkdir above)
     cd miRNA-2006-05-22
     # save the mm8_miRNA_track_may2006.txt file from email
     # turn every space into a tab so hgLoadBed sees tab-separated columns
     cat mm8_miRNA_track_may2006.txt|sed -e 's/ /\t/g' >miRNA.tab
 
     hgLoadBed -strict mm8 miRNA miRNA.tab
 
 # check previous release track before update
     featureBits mm8 miRNA
     #   28630 bases of 2567283971 (0.001%) in intersection
     featureBits mm7 miRNA
     #   20620 bases of 2583394090 (0.001%) in intersection
     featureBits mm6 miRNA
     #   21167 bases of 2597150411 (0.001%) in intersection
     featureBits mm5 miRNA
     #   17957 bases of 2615483787 (0.001%) in intersection
 
 
 #########################################################################
 # BLASTZ CHICKEN galGal3 (DONE 5/24/06 angie)
     ssh pk
     mkdir /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
     cd /cluster/data/mm8/bed/blastz.galGal3.2006-05-23
     cat << '_EOF_' > DEF
 # mouse vs chicken
 BLASTZ=blastz.v7.x86_64
 
 # Specific settings for chicken (per Webb email to Brian Raney)
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=10000
 BLASTZ_K=2200
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 BLASTZ_ABRIDGE_REPEATS=1
 
 # TARGET: Mouse mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_SMSK=/san/sanvol1/scratch/mm8/linSpecRep/notInNonMammal
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
 SEQ2_DIR=/san/sanvol1/galGal3/nib
 SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
 SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
 SEQ2_CHUNK=200000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastz.galGal3.2006-05-23
 '_EOF_'
     # << emacs
     doBlastzChainNet.pl DEF -blastzOutRoot /san/sanvol1/scratch/gg3vsmm8 \
       -bigClusterHub=pk -smallClusterHub=pk \
       -chainMinScore=5000 -chainLinearGap=loose \
       >& do.log & tail -f do.log
     ln -s blastz.galGal3.2006-05-23 /cluster/data/mm8/bed/blastz.galGal3
 
 #########################################################################
 # ADD LINK TO GENENETWORK (DONE. 5/31/06 Fan).
 
 # Copy geneNetwork ID list from mm7
 
     ssh hgwdev
     mkdir -p /cluster/data/mm8/bed/geneNetwork
     cd /cluster/data/mm8/bed/geneNetwork
 
     hgsql mm7 -N -e 'select * from geneNetworkId' > geneNetworkId.tab
 
     hgsql mm8 -e 'drop table geneNetworkId'
     hgsql mm8 < ~/src/hg/lib/geneNetworkId.sql
     hgsql mm8 -e \
     'load data local infile "geneNetworkId.tab" into table geneNetworkId'
 
 ############################################################################
 # SGP GENES (DONE - 2006-06-12 - Hiram)
     ssh kkstore02
     cd  /cluster/data/mm8/bed
     ln -s /cluster/store8/mm8/bed/sgp .
     cd sgp
     #   They don't do chrM
     for C in `awk '{print $1}' /cluster/data/mm8/chrom.sizes | grep -v chrM`
     do
         wget --timestamping \
 "http://genome.imim.es/genepredictions/M.musculus/mmMar2006/SGP/humangp200603/${C}.gtf" \
         -O "${C}.gtf"
     done
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/sgp
     ldHgGene -gtf -genePredExt mm8 sgpGene chr*.gtf
 
     featureBits mm8 -enrichment refGene:CDS sgpGene
     #	refGene:CDS 1.063%, sgpGene 1.455%, both 0.918%, cover 86.32%,
     #	enrich 59.32x
 
 #########################################################################
 # BUILD KNOWN GENE LIST FOR GOOGLE. (DONE.  6/6/06 Fan).
 
     cd /cluster/data/mm8/bed
     rm -rf knownGeneList/mm8
 
 # Run hgKnownGeneList to generate the tree of HTML pages
 # under ./knownGeneList/mm8
 
     hgKnownGeneList mm8
 
     # copy over to /usr/local/apache/htdocs
 
     rm -rf /usr/local/apache/htdocs/knownGeneList/mm8
     mkdir -p /usr/local/apache/htdocs/knownGeneList/mm8
     cp -Rfp knownGeneList/mm8/* /usr/local/apache/htdocs/knownGeneList/mm8
 
 
 #########################################################################
 ### IGTC (Int'l GeneTrap Consortium) (DONE - 2006-06-12 - angie)
 ### fasta added 2006-06-21
 ### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab
 
 ### NOTE -- as of 2007-03-01 the igtc track will be automatically 
 ### updated on hgwdev by the scripts monthlyUpdateIgtc.csh and 
 ### updateIgtc.pl in kent/src/hg/utils/automation/ .
 
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/igtc
     cd /cluster/data/mm8/bed/igtc
     wget http://www.genetrap.org/blattrack/genetrap_mm8.psl
     grep -v ^track genetrap_mm8.psl \
     | hgLoadPsl mm8 -table=igtc stdin
     # Probe fasta is shared by all assemblies:
     wget http://www.genetrap.org/blattrack/genetrap.fasta
     mkdir /gbdb/mm8/igtc
     ln -s /cluster/data/mm8/bed/igtc/genetrap.fasta /gbdb/mm8/igtc/
     hgLoadSeq -replace mm8 /gbdb/mm8/igtc/genetrap.fasta
 
 
 #########################################################################
 # REGULATORY POTENTIAL (DONE - 2006-06-12 - Hiram)
     #	download data from "James Taylor" <james@bx.psu.edu>
     ssh kkstore04
     cd /cluster/data/mm8/bed
     mkdir /cluster/store8/mm8/bed/regPotential7X
     ln -s /cluster/store8/mm8/bed/regPotential7X .
     cd regPotential7X
     
     #	This is a lot of data
     time for C in 1 2 3 4 5 6 7 8 9 X 10 11 12 13 14 15 16 17 18 19
     do
     wget --timestamping \
 "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/chr${C}.scores.truncated.bz2"
     done
     #	real    79m32.840s
 
     wget --timestamping \
 "http://www.bx.psu.edu/~james/esperr_rp_7way_scores/genome_scores_mm8/trackDb.html" -O description.html
 
     time for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X
     do
 	bzcat chr${C}.scores.truncated.bz2
     done | wigEncode -noOverlap  stdin regPotential7X.wig regPotential7X.wib
     #	Converted stdin, upper limit 1.00, lower limit 0.00
     #	real    22m28.583s
 
     #	Loading the table on hgwdev
     ssh hgwdev
     cd /cluster/data/mm8/bed/regPotential7X
     ln -s /cluster/data/mm8/bed/regPotential7X/regPotential7X.wib \
 	/gbdb/mm8/wib/regPotential7X.wib
     #	using the tmpDir is faster since it is on local disk and it will
     #	clean up any temporary .tab file it creates there
     time hgLoadWiggle -tmpDir=/scratch/tmp \
 	mm8 regPotential7X regPotential7X.wig
     #	real    0m28.683s
 
     #	create a histogram
     ssh kolossus
     cd /cluster/data/mm8/bed/regPotential7X
     time hgWiggle -verbose=2 -doHistogram -hBinSize=0.01 -hBinCount=100 \
 	-hMinVal=0.0 -db=mm8 regPotential7X > histogram.data 2>&1
     #	real    18m29.167s
 
     #	create download gzip files from the bz2 files:
     ssh kkstore04
     cd /cluster/data/mm8/bed/regPotential7X
 
     for F in chr*.scores.truncated.bz2
     do
 	C=`echo $F | awk -F'.' '{print $1}'`
 	echo -n "${C}.regPotential7X.mm8.gz working ... "
 	bzcat ${F} | gzip > ${C}.regPotential7X.mm8.gz
 	echo
     done
 
 #############################################################################
 # SIB Transcriptome (DONE Aug 29, 2007 - JK)
 
    # Create working directory and download data from where Christian Iseli
    # (Christian.Iseli@licr.org) put it, and unpack.  The download takes about
    # ten minutes (161M file).
    cd /cluster/data/mm8/bed
    mkdir sibTranscriptome
    cd sibTranscriptome
    wget ftp://ftp.licr.org/pub/databases/trome/mouse/MTR.gtf.gz
    wget ftp://ftp.licr.org/pub/databases/trome/mouse/txg.tar.gz
    tar -zxvf txg.tar.gz
 
    # Load up sibGene table
    zcat MTR.gtf.gz | ldHgGene mm8 sibGene stdin
 
    # Do a little data cleanup and transformation and load splice graphs into database.
    sed 's/altGraphX/sibTxGraph/' ~/src/hg/lib/altGraphX.sql > sibTxGraph.sql
    cat txg/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm8 sibTxGraph stdin
 
    # Create sibAltEvents track for analysed alt-splices.
    cat txg/*.txg | txgAnalyze stdin /cluster/data/mm8/mm8.2bit sibAltEvents.bed
    awk '$2 >= 0' sibAltEvents.bed | sort | uniq > foo.bed
    hgLoadBed mm8 sibAltEvents foo.bed
 
 
 #########################################################################
 # MAP CONTIGS TRACK (DONE - 2005-10-04 - Hiram)
     ssh hgwdev
     mkdir -p /cluster/data/mm8/bed/ctgPos
     cd /cluster/data/mm8/bed/ctgPos
     # hgCtgPos uses the lift files... but mouse lift files are for the
     # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs
     # from the assembly.  (In the future, we should go with the NT's!)
     # So... just for this release, go straight from the seq_contig.md
     # to the table def'n: contig, size, chrom, chromStart, chromEnd
     #	This script is an improvement from before, this is now doing the
     #	randoms properly.
     cat << '_EOF_' > seqContigToCtgPos.pl
 #!/usr/bin/env perl
 #
 # Read seq_contig.md lines and print ctgPos rows:
 #	contig, size, chrom, chromStart, chromEnd
 # Lines whose second field contains '|' are laid out end to end on
 # chr<N>_random; lines naming an NT_/NC_ contig keep their own
 # coordinates, with the start decremented by one.
 
 use warnings;
 use strict;
 
 # spacing added after every contig placed on a random chromosome
 my $randomGap = 50000;
 my $currentRandom = "";
 my $nextOffset = 0;
 
 while (my $record = <>)
 {
 chomp($record);
 my @field = split('\s+', $record);
 if ($field[1] =~ m/\|/)
     {
     my @chromParts = split('\|', $field[1]);
     # restart the layout whenever a different random chromosome begins
     if ($chromParts[0] ne $currentRandom)
         {
         $currentRandom = $chromParts[0];
         $nextOffset = 0;
         }
     my $ctgSize = $field[3] - $field[2] + 1;
     my $ctgStart = $nextOffset;
     my $ctgEnd = $ctgStart + $ctgSize;
     printf "%s\t%d\tchr%s_random\t%d\t%d\n",
         $field[5], $ctgSize, $chromParts[0], $ctgStart, $ctgEnd;
     # the same gap follows every random contig, "Un" included
     $nextOffset = $ctgEnd + $randomGap;
     }
 elsif ($field[5] =~ m/^N[TC]_\d+$/)
     {
     my $ctgEnd = $field[3];
     my $ctgStart = $field[2] - 1;
     my $ctgSize = $ctgEnd - $ctgStart;
     printf "%s\t%d\tchr%s\t%d\t%d\n",
         $field[5], $ctgSize, $field[1], $ctgStart, $ctgEnd;
     }
 }
 '_EOF_'
     #	<< emacs happy
     chmod +x seqContigToCtgPos.pl
 
     egrep "ref_strain|C57BL" ../../seq_contig.md \
 	| ./seqContigToCtgPos.pl > ctgPos.tab
 
     cat ../../seq_contig.md | ./seqContigToCtgPos.pl > ctgPos.tab
 
     hgsql mm8 -e "drop table ctgPos;"
     hgsql mm8 < ~/kent/src/hg/lib/ctgPos.sql
     hgsql mm8 -e 'load data local infile "ctgPos.tab" into table ctgPos;'
 
     featureBits -countGaps mm8 ctgPos
     #	2573322222 bases of 2664455088 (96.580%) in intersection
     featureBits -countGaps mm7 ctgPos
     #	2608810329 bases of 2847717329 (91.611%) in intersection
     featureBits -countGaps mm6 ctgPos
     #	2638893452 bases of 3079633452 (85.689%) in intersection
     featureBits -countGaps mm5 ctgPos
     #	2557081173 bases of 3164952073 (80.794%) in intersection
 
 #####################################################################
 #### LOAD ENSEMBL GENES (DONE - 2006-06-21 - Hiram)
 # ADDED PEPTIDE TABLE, ENSPEP (DONE, 2006-07-11, hartera)
 # ADDED STABLE URL TO TRACKDB BLOCK (V39, JUN 2006) (2008-01-10, rhead)
    # create the mm8 ensGene work directory and work inside it
    mkdir /cluster/data/mm8/bed/ensGene
    cd /cluster/data/mm8/bed/ensGene
 
         Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
         Choose Ensembl 39 and Mus musculus, click next
 	It displays status in a window on the right, indicating how many
 	entries are here, currently: 27,967
     	The next page is the "filter" step, we do not want any filters,
 	nothing is changed on this page, click next
 	Now we are on the "output" tab, the filter in the window on the right
 	indicates that 27,967 passed the filter.  (there is no filter)
 	Now, on this output page, change the pull-down menu item from
 	its default of "features" to read "structures"
 	All the check-boxes now change.
 	Mark the check box GTF under output format
 	Under Gene Ensemble Attributes,
 	Unselect Biotype
 	Select
 	Ensembl Gene ID
 	Ensembl Transcript ID
 	External Gene ID
 
 	gzip compression
 	and give it a filename: ensGeneMm8
 	it will add the .gff.gz suffix
 	press "export"
 
 #	The random coordinates are given in contig
 #	coordinates, need to lift them to chroms, create a lift file:
 #	(cat, not echo: echo does not read the here-document from stdin)
 	cat << '_EOF_' > mkRandomNTLift.sh
 #!/bin/sh
 # For every *_random chrom in chrom.sizes, print one line per ctgPos row
 # on that chrom: chromStart, contig, size, chrom, chrom size
 
 grep random /cluster/data/mm8/chrom.sizes | while read R
 do
         chr=`echo $R | awk '{print $1}'`
         size=`echo $R | awk '{print $2}'`
         hgsql -N -e "select * from ctgPos where chrom=\"$chr\";" mm8 | \
 awk '
 BEGIN {size="'$size'"}
 {
         printf "%s\t%s\t%s\t%s\t%s\n", $4, $1, $2, $3, size
 }
 '
 done
 '_EOF_'
     # << happy emacs
     chmod +x ./mkRandomNTLift.sh
     ./mkRandomNTLift.sh > randomNT.lft
 
     # Add "chr" to front of each line (that is a normal chrom number)
     #	 in the gene data gtf file to make 
     # it compatible with ldHgGene and convert the chrMT name, and lift
     # the random coordinates
 
     zcat ensGeneMm8.gff.gz \
 	| sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
 	| liftUp ensGene.gtf randomNT.lft carry stdin
     ldHgGene mm8 ensGene ensGene.gtf
 # Read 34831 transcripts in 597575 lines in 1 files
 #   34831 groups 34 seqs 1 sources 4 feature types
 # 34831 gene predictions
 
     featureBits mm8 ensGene
     #	56159487 bases of 2567283971 (2.188%) in intersection
     featureBits mm7 ensGene
     #	57484684 bases of 2583394090 (2.225%) in intersection
     featureBits mm6 ensGene
     #	54791625 bases of 2597150411 (2.110%) in intersection
 
     # Load ensGtp table.
     # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
     # hgKnownToSuper.  
 
     # Get the Ensembl BioMart at http://www.ensembl.org/Multi/martview
     # Choose Ensembl 39 and Mus musculus, click next
     # Follow this sequence through the pages:
     # 1) No filters in the filter section, click next go to Output
     # 2) Select "Structures".
     # 3) select Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID. 
     # 4) select "Text, tab separated" gzip and name the output file as "ensGtp"
     # 5) download the output file "ensGtp.tsv.gz"
     #       the tsv.gz is added automatically to the ensGtp
 
     #	Something is unusual in this download.  The lines are duplicated
     #	about 8 times more than necessary
     zcat ensGtp.tsv.gz | wc -l
     #	284554
     zcat ensGtp.tsv.gz | sort -u | wc -l
     #	34832
 
     hgsql mm8 < ~/kent/src/hg/lib/ensGtp.sql
     #	The 'tail -n +2' skips the first line which is just column
     #	heading labels.  The sort -u will eliminate the duplicate lines:
     zcat ensGtp.tsv.gz | tail -n +2 | sort -u \
     	| hgsql mm8 -e \
 	'load data local infile "/dev/stdin" into table ensGtp;'
     hgsql -e "select count(*) from ensGtp;" mm8
     #	34831
     #	properly, one less than the count above
     #	uncompress only now: the zcat commands above need the .gz file
     gunzip ensGtp.tsv.gz
 
     #	clean up
     gzip ensGene.gtf
     rm genePred.tab
 
     #	Now, an experiment to determine if the Ensembl peptide sequences
     #	are the same thing we get here upon translation of the CDS coding
     #	sequence from the genome
     mkdir /cluster/data/mm8/bed/ensGene/testPeptides
     cd /cluster/data/mm8/bed/ensGene/testPeptides
     getRnaPred -cdsOnly mm8 ensGene all stdout | gzip > all.cdsOnly.gz
     #	Obtaining protein sequence from EnsMart
     #	Select "sequences" from the pull-down on the output page
     #	check Peptide in the "Sequences" selection area
     #	and "Ensembl Transcript ID (versioned)" in the Transcript
     #	Attributes area
     #	Text,Fasta output, gzip, file name: ensPepMm8
     #	becomes ensPepMm8.fasta.gz
 
     #	A special faToTab.pl script to allow an exclude list, first need
     #	to obtain the exclude list from the ensembl set:
     zcat ensPepMm8.fasta.gz \
 	| ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
 	    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
     #	extract the exclude list from that
     grep "Sequence unavailable" ensPepMm8.fa.tab \
 	| awk '{print $1}' > excludeList.txt
     #	now filter via that exclude list, remove the final '*' character
     #	from their protein sequence and sort by name
     zcat ensPepMm8.fasta.gz \
 	| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
 	    | sed -e "/^$/d; s/\*$//" | sort > ensPepMm8.fa.tab
     #	and then our peptides, same filter, remove the final 'Z' character
     #	from this protein sequence (the stop codon):
     zcat all.cdsOnly.gz | faTrans stdin stdout \
 	| ~/kent/src/utils/faToTab/faToTab.pl excludeList.txt /dev/stdin \
 	    | sed -e "/^$/d; s/Z$//" | sort > all.fa.tab
     #	do we have the same lists:
     awk '{print $1}' ensPepMm8.fa.tab > ensList		
     awk '{print $1}' all.fa.tab > ucscList
     diff ensList ucscList
     #	no differences in the name list, numbering:
     wc -l ensList ucscList
     #	31302 ensList
     #	31302 ucscList
     #	How many proteins different:
     diff ensPepMm8.fa.tab all.fa.tab | grep "^>" | awk '{print $2}' | wc -l
     #	37
     #	Taking a look at that difference, it is difficult to see the
     #	individual differences, some are single amino acid
     #	differences, others are more radically different:
     diff ensPepMm8.fa.tab all.fa.tab | less
 
     #	Conclusion, the 37 differences out of 31,302 are not worth the
     #	trouble to load up the entire Ensembl peptide table 
     # Add Ensembl peptide table - requested by a user (hartera, 2006-07-11)
     ssh hgwdev
     cd /cluster/data/mm8/bed/ensGene
 cat << EOF > ensPep.sql
 CREATE TABLE ensPep (
 name varchar(255) not null,     # Name of gene - same as in genePred
 seq longblob not null,     # Peptide sequence
 #Indices
 PRIMARY KEY(name(64))
 );
 EOF
     cp ./testPeptides/ensPepMm8.fa.tab.gz .
     gunzip ensPepMm8.fa.tab.gz
     hgLoadSqlTab mm8 ensPep ensPep.sql ensPepMm8.fa.tab -warn
 
 ###########################################################################
 ## MAKE SUPERFAMILY TRACK (DONE, 6/22/06, Fan)
 
 # If mm8.superfamily already exists, drop it.
    cd /cluster/data/mm8/bed
    mkdir /cluster/data/mm8/bed/sf.20060622
    ln -s sf.20060622 sf
    cd sf
    hgSuperfam mm8 superfam060619 > sf.log
 
 # It is normal that many proteins do not have corresponding Superfamily entries.
 
 # If mm8.sfDescription exists, drop it.
 
    hgsql mm8 < ~/src/hg/lib/sfDescription.sql
    hgsql mm8 -e 'LOAD DATA local INFILE "sfDescription.tab"  into table mm8.sfDescription;'
 
 # Finally, load the superfamily table.
 
    hgLoadBed mm8 superfamily superfamily.tab -tab
 
 # Create knownToEnsembl table
    hgMapToGene mm8 ensGene knownGene knownToEnsembl
 
 # Create knownToSuperfamily table
 # Note hs is changed into ht for this Superfamily release.
 
    cat /cluster/data/superfamily/060619/ass_18-Jun-2006.tab \
    | hgKnownToSuper mm8 mm stdin
 # 26547 records output
 
 
 ###########################################################################
 # dbSNP BUILD 126 (Heather, August 2006)
 
 # Set up directory structure
 ssh kkstore02
 cd /cluster/data/dbSNP/126/mouse
 mkdir mm8
 cd mm8
 mkdir data
 mkdir schema
 mkdir rs_fasta
 
 # Get data from NCBI (anonymous FTP)
 cd /cluster/data/dbSNP/126/mouse/mm8/data
 ftp ftp.ncbi.nih.gov
 cd snp/organisms/mouse_10090/database/organism_data
 # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
 get b126_SNPContigLoc_36_1.bcp.gz
 # ContigLocusId has function
 get b126_SNPContigLocusId_36_1.bcp.gz
 get b126_ContigInfo_36_1.bcp.gz
 # MapInfo has alignment weights
 get b126_SNPMapInfo_36_1.bcp.gz
 # SNP has univar_id, validation status and heterozygosity
 get SNP.bcp.gz
 
 # Get schema from NCBI
 cd /cluster/data/dbSNP/126/mouse/mm8/schema
 ftp ftp.ncbi.nih.gov
 cd snp/organisms/mouse_10090/database/organism_schema
 get mouse_10090_table.sql.gz
 
 # Get fasta files from NCBI
 # using headers of fasta files for molType
 # (download into the rs_fasta directory created under mm8 above)
 cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
 ftp ftp.ncbi.nih.gov
 cd snp/organisms/mouse_10090/rs_fasta
 prompt
 mget *.gz
 
 # add rs_fasta to seq/extFile
 # 2 edits first: strip header to just rsId, and remove duplicates
 # work on /cluster/store12 (kkstore05) which has more disk space
 cp rs_ch*.fas.gz /cluster/store12/snp/126/mouse/rs_fasta
 ssh kkstore05
 cd /cluster/store12/snp/126/mouse/rs_fasta
 # concat into rsAll.fas
 # (the glob must include the .gz suffix to match the files copied above)
 cat << '_EOF_' > concat.csh
 #!/bin/csh -ef
 rm -f rsAll.fas
 foreach file (rs_ch*.fas.gz)
     echo $file
     zcat $file >> rsAll.fas
 end
 '_EOF_'
 # run it to produce rsAll.fas for the next step
 chmod +x concat.csh
 ./concat.csh
 # snpCleanSeq strips the header and skips duplicates
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCleanSeq rsAll.fas snp.fa
 rm rsAll.fas
 # load on hgwdev
 ssh hgwdev
 mkdir /gbdb/mm8/snp
 ln -s /cluster/store12/snp/126/mouse/rs_fasta/snp.fa /gbdb/mm8/snp/snp.fa
 cd /cluster/store12/snp/126/mouse/rs_fasta
 hgLoadSeq mm8 /gbdb/mm8/snp/snp.fa
 
 # look up id in extFile
 # move into separate table
 hgsql mm8 < snpSeq.sql
 hgsql -e 'insert into snpSeq select acc, file_offset from seq where extFile = 9642470' mm8
 hgsql -e 'delete from seq where extFile = 9642470' mm8
 hgsql -e 'alter table snpSeq add index acc (acc)' mm8
 
 # clean up after hgLoadSeq
 rm seq.tab
 
 # Simplify names of data files
 cd /cluster/data/dbSNP/126/mouse/mm8/data
 mv b126_ContigInfo_36_1.bcp.gz ContigInfo.gz
 mv b126_SNPContigLoc_36_1.bcp.gz ContigLoc.gz
 mv b126_SNPContigLocusId_36_1.bcp.gz ContigLocusId.gz
 mv b126_SNPMapInfo_36_1.bcp.gz MapInfo.gz
 mv SNP.bcp.gz SNP.gz
 ls -1 *.gz > filelist
 
 # edit table descriptions
 cd /cluster/data/dbSNP/126/mouse/mm8/schema
 # get CREATE statements from mouse_10090_table.sql for our 5 tables
 # store in table.tmp
 # convert and rename tables
 sed -f 'mssqlToMysql.sed' table.tmp > table2.tmp
 rm table.tmp
 sed -f 'tableRename.sed' table2.tmp > table.sql
 rm table2.tmp
 
 # Get updated UniVariation table
 cd /cluster/data/dbSNP/126/shared
 ftp ftp.ncbi.nih.gov
 cd snp/database/shared_data
 get UniVariation.bcp.gz
 cd ../shared_schema
 get dbSNP_main_table.sql.gz
 # get UniVariation CREATE statement from dbSNP_main_table.sql
 # use mssqlToMysql.sed to convert 
 
 # get header lines from rs_fasta
 cd /cluster/data/dbSNP/126/mouse/mm8/rs_fasta
 /bin/csh gnl.csh
 
 # load on kkr5u00
 ssh kkr5u00
 # -e takes the SQL statement as its argument; the database name goes last
 hgsql -e 'create database mm8snp126' mysql
 cd /cluster/data/dbSNP/126/mouse/mm8/schema
 hgsql mm8snp126 < table.sql
 cd ../data
 /bin/csh load.csh
 
 # note rowcount
 # ContigLoc     23811983 
 # SNP           10837184 
 # MapInfo       23570302 
 # ContigLocusId 10317095 
 
 cd /cluster/data/dbSNP/126/shared
 hgsql mm8snp126 < UniVariation.sql
 zcat UniVariation.bcp.gz | hgsql -e 'load data local infile "/dev/stdin" into table UniVariation' mm8snp126
 
 # create working /scratch dir
 cd /scratch/snp/126
 mkdir mouse
 cd mouse
 
 # get mm8 ctgPos, load into mm8snp126, compare contig list between ctgPos and ContigInfo
 # No issues in non-random
 # No PAR issues 
 
 # get gnl files
 cp /cluster/data/dbSNP/126/mouse/mm8/rs_fasta/*.gnl .
 
 # examine ContigInfo for group_term and edit pipeline.csh
 # use "ref_strain" 
 
 # filter ContigLoc into ContigLocFilter
 # this lifts from contig coords to chrom coords
 # phys_pos_from is used to check coords for non-random chroms
 # errors reported to stdout
 # this gets rid of alternate assemblies (using ContigInfo)
 # this also gets rid of poor quality alignments (weight == 10 || weight == 0 in MapInfo)
 # assumes all contigs are positively oriented; will abort if not true
 # Note for mouse we also screen on assembly = "C57BL/6J" in MapInfo
 
 mysql> desc ContigLocFilter;
 #  +---------------+-------------+------+-----+---------+-------+
 #  | Field         | Type        | Null | Key | Default | Extra |
 #  +---------------+-------------+------+-----+---------+-------+
 #  | snp_id        | int(11)     | NO   |     |         |       |
 #  | ctg_id        | int(11)     | NO   |     |         |       |
 #  | chromName     | varchar(32) | NO   |     |         |       |
 #  | loc_type      | tinyint(4)  | NO   |     |         |       |
 #  | start         | int(11)     | NO   |     |         |       |
 #  | end           | int(11)     | YES  |     | NULL    |       |
 #  | orientation   | tinyint(4)  | NO   |     |         |       |
 #  | allele        | blob        | YES  |     | NULL    |       |
 #  +---------------+-------------+------+-----+---------+-------+
  
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocFilter mm8snp126 ref_strain C57BL/6J
 # note rowcount
 # ContigLocFilter  7923033
 # how many are positive strand? hopefully 90%
 mysql> select count(*) from ContigLocFilter where orientation = 0;
 # 7779413
 # note count by loc_type
 mysql> select count(*), loc_type from ContigLocFilter group by loc_type;
 # +----------+----------+
 # | count(*) | loc_type |
 # +----------+----------+
 # |     2144 |        1 |
 # |  7903966 |        2 |
 # |    13105 |        3 |
 # |     1052 |        4 |
 # |      523 |        5 |
 # |     2243 |        6 |
 # +----------+----------+
 
 
 # filter ContigLocusId into ContigLocusIdFilter
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdFilter mm8snp126 ref_strain
 # note rowcount 
 # ContigLocusIdFilter  3484757
 
 # condense ContigLocusIdFilter into ContigLocusIdCondense (one SNP can have multiple functions)
 # assumes SNPs are in numerical order; will errAbort if not true
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpContigLocusIdCondense mm8snp126 
 # note rowcount; expect about 50% (ascertainment bias for SNPs within genes)
 # ContigLocusIdCondense 2789998 
 # could delete ContigLocusIdFilter table here
 
 # create chrN_snpFasta tables from *.gnl files
 # we are just using molType, but also storing class and observed
 # need chromInfo for this
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoadFasta mm8snp126 
 
 # (could start using pipeline.csh here)
 # (pipeline.csh takes about 35 minutes to run)
 
 # split ContigLocFilter by chrom 
 # create the first chrN_snpTmp
 # we will reuse this table name, adding/changing columns as we go
 # at this point chrN_snpTmp will have the same description as ContigLocFilter
 # this opens a file handle for every chrom, so will not scale to scaffold-based assemblies
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSplitByChrom mm8snp126 ref_strain
 
 # adjust coords using loc_type
 # possible errors logged to snpLocType.error:
 # Unknown locType 
 # Between with end != start + 1 
 # Between with allele != '-' 
 # Exact with end != start 
 # Range with end < start 
 
 # possible exceptions logged to snpLocType.exceptions:
 # RefAlleleWrongSize
 
 # This run no errors, no exceptions
 
 # morph chrN_snpTmp 
 
 mysql> desc chr1_snpTmp;
 
 #  +---------------+-------------+------+-----+---------+-------+
 #  | Field         | Type        | Null | Key | Default | Extra |
 #  +---------------+-------------+------+-----+---------+-------+
 #  | snp_id        | int(11)     | NO   |     |         |       |
 #  | ctg_id        | int(11)     | NO   |     |         |       |
 #  | chromStart    | int(11)     | NO   |     |         |       |
 #  | chromEnd      | int(11)     | NO   |     |         |       |
 #  | loc_type      | tinyint(4)  | NO   |     |         |       |
 #  | orientation   | tinyint(4)  | NO   |     |         |       |
 #  | allele        | blob        | YES  |     | NULL    |       |
 #  +---------------+-------------+------+-----+---------+-------+
 
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpLoctype mm8snp126 ref_strain
 
 # expand allele as necessary
 # report syntax errors to snpExpandAllele.errors
 # possible exceptions logged to snpExpandAllele.exceptions:
 # RefAlleleWrongSize
 # This run no errors, no exceptions
 # 200? alleles expanded
 
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpExpandAllele mm8snp126 ref_strain
 
 # the next few steps prepare for working in UCSC space
 
 # sort by position
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSort mm8snp126 ref_strain
 
 # rename MT --> M (pipeline.csh takes care of this)
 hgsql -e "rename table chrMT_snpTmp to chrM_snpTmp" mm8snp126
 
 # get mm8 nib files
 # get mm8 chromInfo, load into mm8snp126 with edited path
 # lookup reference allele in nibs
 # keep reverse complement to use in error checking (snpCheckAlleles)
 # check here for SNPs larger than 1024
 # errAbort if detected
 # check for coords that are too large, log to snpRefUCSC.error and skip
 # This run no errors
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpRefUCSC mm8snp126
 
 # morph chrN_snpTmp 
 
 mysql> desc chr1_snpTmp;
 
 #  +--------------------+-------------+------+-----+---------+-------+
 #  | Field              | Type        | Null | Key | Default | Extra |
 #  +--------------------+-------------+------+-----+---------+-------+
 #  | snp_id             | int(11)     | NO   |     |         |       |
 #  | ctg_id             | int(11)     | NO   |     |         |       |
 #  | chromStart         | int(11)     | NO   |     |         |       |
 #  | chromEnd           | int(11)     | NO   |     |         |       |
 #  | loc_type           | tinyint(4)  | NO   |     |         |       |
 #  | orientation        | tinyint(4)  | NO   |     |         |       |
 #  | allele             | blob        | YES  |     | NULL    |       |
 #  | refUCSC            | blob        | YES  |     | NULL    |       |
 #  | refUCSCReverseComp | blob        | YES  |     | NULL    |       |
 #  +--------------------+-------------+------+-----+---------+-------+
 
 # compare allele from dbSNP to refUCSC
 # locType between is excluded from this check
 # log exceptions to snpCheckAllele.exceptions
 # if SNP is positive strand, expect allele == refUCSC
 # log RefAlleleMismatch if not
 # if SNP is negative strand, if not allele == refUCSC, then check for allele == refUCSCReverseComp
 # If allele == refUCSCRevComp, log RefAlleleNotRevComp
 # If allele doesn't match either of refUCSC or refUCSCReverseComp, log RefAlleleMismatch
 # This run we got:
 # 0 RefAlleleMismatch
 # 9621   RefAlleleNotRevComp
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckAlleles mm8snp126
 
 # add class and observed using univar_id from SNP table
 # to get class (subsnp_class) and observed (var_str) from UniVariation
 # log errors to snpClassAndObserved.errors
 # errors detected: 
 # class = 0 in UniVariation
 # class > 8 in UniVariation
 # univar_id = 0 in SNP
 # no row in SNP for snp_id in chrN_snpTmp
 # This run we got:
 # 3 class = 0 in UniVariation
 # 0 class > 8 in UniVariation
 # 2890606 univar_id = 0 in SNP (strange, but okay)
 # 0 no row in SNP for snp_id in chrN_snpTmp 
 # dbSNP has class = 'in-del'
 # we promote this to 'deletion' for locType 1&2 and to 'insertion' for locType 3
 
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpClassAndObserved mm8snp126
 
 # morph chrN_snpTmp
 #  +--------------------+---------------+------+-----+---------+-------+
 #  | Field              | Type          | Null | Key | Default | Extra |
 #  +--------------------+---------------+------+-----+---------+-------+
 #  | snp_id             | int(11)       | NO   |     |         |       |
 #  | chromStart         | int(11)       | NO   |     |         |       |
 #  | chromEnd           | int(11)       | NO   |     |         |       |
 #  | loc_type           | tinyint(4)    | NO   |     |         |       |
 #  | class              | varchar(255)  | NO   |     |         |       |
 #  | orientation        | tinyint(4)    | NO   |     |         |       |
 #  | allele             | blob          | YES  |     | NULL    |       |
 #  | refUCSC            | blob          | YES  |     | NULL    |       |
 #  | refUCSCReverseComp | blob          | YES  |     | NULL    |       |
 #  | observed           | blob          | YES  |     | NULL    |       |
 #  +--------------------+---------------+------+-----+---------+-------+
 
 # generate exceptions for class and observed
 
 # SingleClassBetweenLocType
 # SingleClassRangeLocType
 # NamedClassWrongLocType
 
 # ObservedWrongFormat
 # ObservedWrongSize 
 # ObservedMismatch 
 
 # RangeSubstitutionLocTypeExactMatch
 
 # SingleClassTriAllelic
 # SingleClassQuadAllelic
 
 # This will also detect IUPAC symbols in allele
 
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpCheckClassAndObserved mm8snp126
 
 # add function
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFunction mm8snp126
 
 # add validation status and heterozygosity
 # log error if validation status > 31 or missing
 # no errors this run
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpSNP mm8snp126
 
 # add molType
 # errors detected: missing or duplicate molType
 # 57709 duplicates
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMoltype mm8snp126
 
 # generate chrN_snp126 and snp126Exceptions tables
 cp snpCheckAlleles.exceptions snpCheckAlleles.tab
 cp snpCheckClassAndObserved.exceptions snpCheckClassAndObserved.tab
 cp snpExpandAllele.exceptions snpExpandAllele.tab
 cp snpLocType.exceptions snpLocType.tab
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpFinalTable mm8snp126 126
 
 # concat into snp126.tab
 # cat chr*_snp126.tab >> snp126.tab
 /bin/sh concat.sh
 
 # check for multiple alignments
 /cluster/home/heather/kent/src/hg/snp/snpLoad/snpMultiple mm8snp126
 mysql> load data local infile 'snpMultiple.tab' into table snp126Exceptions;
 
 # load on hgwdev
 cp snp126.tab /cluster/home/heather/transfer/snp
 hgsql mm8snp126 -e 'select * from snp126Exceptions' > /cluster/home/heather/transfer/snp/snp126Exceptions.tab
 ssh hgwdev
 mysql> load data local infile 'snp126.tab' into table snp126; 
 mysql> load data local infile 'snp126Exceptions.tab' into table snp126Exceptions; 
 
 # create indexes
 mysql> alter table snp126 add index name (name);
 mysql> alter table snp126 add index chrom (chrom, bin);
 mysql> alter table snp126Exceptions add index name(name);
 
 # create snp126ExceptionDesc table
 cd /cluster/data/dbSNP
 hgsql mm8 < snp126ExceptionDesc.sql
 # add counts to exception.mouse.126, can start with exception.template
 hgsql -e 'select count(*), exception from snp126Exceptions group by exception' mm8
 mysql> load data local infile 'exception.mouse.126' into table snp126ExceptionDesc;
 
 mysql> select count(*), exception from snp126Exceptions group by exception;
 +----------+---------------------------+
 | count(*) | exception                 |
 +----------+---------------------------+
 |    97271 | MultipleAlignments        |
 |     1600 | ObservedMismatch          |
 |       27 | ObservedWrongFormat       |
 |      272 | ObservedWrongSize         |
 |     9621 | RefAlleleNotRevComp       |
 |    11169 | SingleClassBetweenLocType |
 |      346 | SingleClassQuadAllelic    |
 |     5023 | SingleClassRangeLocType   |
 |     3905 | SingleClassTriAllelic     |
 +----------+---------------------------+
 
 ####################################################################
 ##  redoing STS markers track to get them more correct
 ##		(DONE - 2006-09-15 - Hiram)
     #	Went into the updateBed.pl script, reworked it, made it safer,
     #	debugged a lot of things and placed it into the source tree.
 
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/STSmarkers.2006-08-29
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
     # with that fixed script, create a new stsInfoMouse.bed file:
     #	Update the m m 7 directory name here to m m 8
     #	for the next build of m m 9,  ...etc... and so forth
     time ~/kent/src/hg/stsMarkers/updateBed.pl \
         /cluster/data/mm7/bed/STSmarkers/stsInfoMouse.bed \
         ../STSmarkers/downloads/MRK_Dump2.rpt \
 	../STSmarkers/downloads/PRB_PrimerSeq.rpt \
         ../STSmarkers/downloads/MRK_Sequence.rpt \
 	../STSmarkers/downloads/UniSTS_mouse.alias \
         ../STSmarkers/downloads/UniSTS_mouse.sts \
         -g ../STSmarkers/downloads/10090.WI-Genetic.txt \
         -r ../STSmarkers/downloads/10090.WI_MRC_RH.txt \
         -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile
 
     ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
         | sed -e "s/\t*$//" > mm8.stsInfoMouse.bed
 
     # copy the stsInfoMouse.bed file from working dir to the marker
     #	info storage folder.  added 2 new steps by Yontao
     #	be wary of the archive name here, check the directory and get
     #	the name right here.
     mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
     cp -p mm8.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed
 
     # comparing to previous, numbers increase slightly each time
     wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
 	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
     #	60631 /cluster/store5/mouseMarker/stsInfoMouse.bed
     #	60440 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.firstTime
     #	59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
     #	58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
     #	58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
 
     # and from that, create new primer fa, epcr, etc:
     time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
 	mm8.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
     # the mouseC.fa file will be empty; the other counts should be more than last time
     wc -l mouse?.* ../STSmarkers/mouse?.*
     #	     0 mouseC.fa
     #	308384 mouseP.fa
     #	 34666 mouseP.info
     #	     0 ../STSmarkers/mouseC.fa
     #	305991 ../STSmarkers/mouseP.fa
     #	 34475 ../STSmarkers/mouseP.info
 
     #	the equivalent Mm7 files:
     #      0       0       0 mouseC.fa
     # 300968  300914 6798466 mouseP.fa
     #  33838  169275 2153113 mouseP.info
     # 334806  470189 8951579 total
     #	the equivalent Mm6 files:
     #	     0       0       0 mouseC.fa
     #	293305  293251 6624638 mouseP.fa
     #	 32890  164528 2087271 mouseP.info
     #	326195  457779 8711909 total
     #	the equivalent Mm5 files:
     #	     0       0       0 mouseC.fa
     #	286740  286686 6474893 mouseP.fa
     #	 32232  161234 2044810 mouseP.info
     #	318972  447920 8519703 total
 
     #	copy the primers over to some filesystem close to the klusters
     #	and split them up to have a small number of sequences in one file
     
 
     mkdir /cluster/bluearc/mm8/stsMarkers.2006-08-29
     cp -p mouseP.fa /cluster/bluearc/mm8/stsMarkers.2006-08-29
     cd /cluster/bluearc/mm8/stsMarkers.2006-08-29
     cp -p /cluster/data/mm8/11.ooc .
     mkdir split
     #	356 files for 34,666 sequences, == about 97 sequences per file
     faSplit sequence mouseP.fa 400 split/mm_
 
     # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
     #	This process could convert to a modern version of blat with the
     #	filters as described, for example, in the STS markers build in Hg18
 
     #  CLUSTER RUN FOR THE STS PRIMERS
     ssh kk
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
     mkdir primer
     mkdir ePCR
     cd primer
     mkdir out
 
     #	interestingly, this blat2.2 binary did not function correctly
     #	when given nib files.  It has only about 1/4th of the number of
     #	alignments as it gets when it used fa files for the target
     #	sequence.
 
     ls -1S /cluster/bluearc/mm8/stsMarkers.2006-08-29/split > primers.list
     #	re-using chrom sequences from first time
     ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
 
     cat << '_EOF_' > runBlat2.csh
 #!/bin/csh -fe
 set primer = /cluster/bluearc/mm8/stsMarkers.2006-08-29/split/$1
 set fa = /cluster/bluearc/mm8/stsMarkers/chroms/$2
 set ooc = /cluster/bluearc/mm8/stsMarkers.2006-08-29/11.ooc
 set root2 = $2:r
 mkdir -p out/${root2}
 set out = $3
 
 /cluster/bin/i386/blat.2 ${fa} ${primer} -ooc=${ooc} \
         -minMatch=1 -minScore=0 -minIdentity=80 -oneOff ${out}
 '_EOF_'
     #	<< happy emacs
     chmod +x runBlat2.csh
 
     cat << '_EOF_' > template
 #LOOP
 ./runBlat2.csh $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
 #ENDLOOP
 '_EOF_'
     #	<< happy emacs
 
     gensub2 primers.list chr.list template jobList
     para create jobList
     para try ... check ... push ... etc ...
 # Completed: 12104 of 12104 jobs
 # CPU time in finished jobs:    1078733s   17978.89m   299.65h   12.49d  0.034 y
 # IO & Wait Time:              13537140s  225618.99m  3760.32h  156.68d  0.429 y
 # Average job time:                1208s      20.13m     0.34h    0.01d
 # Longest finished job:           11831s     197.18m     3.29h    0.14d
 # Submission to last job:         20458s     340.97m     5.68h    0.24d
 
     # on the file server
     ssh kkstore04
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer
     time pslSort dirs primers.raw.psl temp out/chr*
     #	real    3m30.758s
     #	-rw-rw-r--   1 588001891 Sep 15 10:02 primers.raw.psl
 
     #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
     #	should not be more than 100 bases different.
     #	This filters out about 948,260 alignments, or
     #	17.4% = 100.0 * 948260 / 5462936
     time pslSort dirs stdout temp out/chr* | awk -F"\t" '
 { if (((($13 - $12) - ($17 - $16)) > -100) &&
 	((($13 - $12) - ($17 - $16)) < 100)) {print}
 }
 ' > primers.psl.100
 
     rmdir temp
 
     wc -l *.100 *.psl
     #	5462936 primers.raw.psl
     #	4514676 primers.psl.100
     #    948260 difference
 
     # a rough comparison with previous results:
     wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100
     #	4500528 /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.100
 
     # another kluster run for the ePCR
     ssh pk
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
     ls -1S /cluster/bluearc/mm8/stsMarkers/chroms > chr.list
 
     #	pick up e-PCR source from
     #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
     #	version 2.3.1 11 Feb 2005
     #	Had to add the following to both re-PCR_main.cpp and
     #	e-PCR_main.cpp to get them to compile on kolossus:
 // max and min Copied from /usr/include/mysql/my_global.h
 #define max(a, b)       ((a) >? (b))
 #define min(a, b)       ((a) <? (b))
 
     mkdir out
     cat << '_EOF_' > runPCR
 #!/bin/csh -fe
 /cluster/bin/x86_64/e-PCR \
 	/cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.info \
 	/cluster/bluearc/mm8/stsMarkers/chroms/$1 N=1 M=50 W=5 > $2
 '_EOF_'
     # << happy emacs
     chmod +x runPCR
 
     cat << '_EOF_' > template
 #LOOP
 ./runPCR $(path1) {check out line+ out/$(num1).epcr}
 #ENDLOOP
 '_EOF_'
     # << the mouseP.info was created above
     gensub2 chr.list single template jobList
     para create jobList
     para try
     para check
     para push
     ... etc ...
     #	There is a single job that produces no output:
     ./runPCR chrX_random.fa out/30.epcr
     #	WARNING: 96 STSs have primer shorter than W
     #	WARNING: 21 STSs have ambiguities within W of 3' end
     #	Not sure what's up with that
 # Completed: 33 of 34 jobs
 # Crashed: 1 jobs
 # CPU time in finished jobs:      64904s    1081.73m    18.03h    0.75d  0.002 y
 # IO & Wait Time:                  1860s      31.00m     0.52h    0.02d  0.000 y
 # Average job time:                2023s      33.72m     0.56h    0.02d
 # Longest finished job:            4861s      81.02m     1.35h    0.06d
 # Submission to last job:          4862s      81.03m     1.35h    0.06d
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR
     # all those results become all.epcr
     cat out/*.epcr > all.epcr
 
     # comparing to previous results:
     wc -l all.epcr
     #	58162 all.epcr
     wc -l /cluster/data/mm8/bed/STSmarkers/ePCR/all.epcr
     #	58088 all.epcr
 
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer
 
     ~/kent/src/hg/stsMarkers/filterSTSPrimers \
     -mouse ../mm8.stsInfoMouse.bed primers.psl.100 \
         ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat
 
     #  The output should show an increasing count:
     #	Reading name info
     #	Reading primer info
     #	Processing file
     #	100000
     #	200000
     #	300000
     #	...
     #	4500000
     #	Determining ePCR not found from ePCR results
     #	Out of 25749 ePCR alignments examined, not found: 520
     #
     wc -l primers.psl.filter.blat
     #	34043 primers.psl.filter.blat
     wc -l /cluster/data/mm8/bed/STSmarkers/primer/primers.psl.filter.blat
     #	34026 primers.psl.filter.blat
 
     # create file accession_info.rdb
     touch empty_sequence.inf
     ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
 	/cluster/data/mm8 empty_sequence.inf
     #	20502 processed
     mv accession_info.rdb accession_info.rdb.tmp
     ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
 	< accession_info.rdb.tmp > accession_info.rdb
     #	The -x prints the debug statement:
     #	sort arg:  -t"  " +0 -1 +1 -2g +2 -3g
     rm accession_info.rdb.tmp
 
     # comparing results to previous
     #	Continuing the trend that began with Mm7, the numbers in
     #	accession_info.rdb continue to decrease.  Even Mm8 has far fewer
     #	fragments than did mm7:
     #	e.g.:
     [hiram@kkstore04 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
     #	21910 total
     [hiram@kkstore04 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
     #	70125 total
     [hiram@kkstore04 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
     #	170812 total
 
     wc -l accession_info.rdb
     #	20385 accession_info.rdb
     wc -l ../../STSmarkers/primer/accession_info.rdb
     #	20385 ../../STSmarkers/primer/accession_info.rdb
 
     # creates epcr.not.found.nomatch and epcr.not.found.psl
     ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
 	epcr.not.found ../mouseP.info \
 	accession_info.rdb /cluster/data/mm8 2> dbg.epcrToPsl
     #	the dbg.epcrToPsl has a number of lines complaining about bad
     #	primers in ../mouseP.info - and indeed they are bad primers,
     #	they do not have a second primer.
 
     # Comparing results to previous:
     wc -l epcr*
     #	520 epcr.not.found
     #	  0 epcr.not.found.nomatch
     #	520 epcr.not.found.psl
     wc -l ../../STSmarkers/primer/epcr*
     #	501 ../../STSmarkers/primer/epcr.not.found
     #	  0 ../../STSmarkers/primer/epcr.not.found.nomatch
     #	501 ../../STSmarkers/primer/epcr.not.found.psl
 
     # Mm7 wc epcr*
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
     #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
     #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
     #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
     #	1106 total
 
     # Mm6 wc epcr*
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/epcr*
     #	 472 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found
     #	  63 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.nomatch
     #	 404 /cluster/data/mm6/bed/STSmarkers/primer/epcr.not.found.psl
     #	 158 /cluster/data/mm6/bed/STSmarkers/primer/epcrToPsl
     #	1097 total
 
     cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
     wc -l primers.psl.filter
     #	34563 primers.psl.filter
     wc -l ../../STSmarkers/primer/primers.psl.filter
     #	34527 primers.psl.filter
 
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
     #	34460 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter
 
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
     #	33532 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter
 
     wc -l /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
     # 33691 /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted
 
     # create primers.psl.filter.lifted.initial
     #	The PATH setting allows extractPslInfo to find other programs that it
     #	is going to use.
     PATH=~/kent/src/hg/stsMarkers:$PATH \
 	~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter
 
     wc -l *.initial
     #	34545 primers.psl.filter.initial
     wc -l ../../STSmarkers/primer/*.initial
     #	34513 ../../STSmarkers/primer/primers.psl.filter.initial
     wc -l /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
     #	34443 /cluster/data/mm7/bed/STSmarkers/primer/primers.psl.filter.initial
     wc -l /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
     #	33514 /cluster/data/mm6/bed/STSmarkers/primer/primers.psl.filter.initial
     wc -l \
        /cluster/data/mm5/bed/STSmarkers/primer/primers.psl.filter.lifted.initial
     # 33689 
 
     # create primers.psl.filter.lifted.initial.acc
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/findAccession -agp \
 	-mouse primers.psl.filter.initial /cluster/data/mm8
     wc -l primers.psl.filter.initial.acc
     #	34545 primers.psl.filter.initial.acc
     wc -l ../../STSmarkers/primer/primers.psl.filter.initial.acc
     #	34513 primers.psl.filter.initial.acc
 
 
     # this needs to be -rat as that specifies how to scan the
     # stsInfoMouse.bed file and it does not work if you use -mouse
     # it is not clear what -mouse would mean to this script, some other file
     # format perhaps from the stsInfoMouse.bed format.
     ~/kent/src/hg/stsMarkers/getStsId -rat \
 	../mm8.stsInfoMouse.bed  primers.psl.filter.initial.acc \
 	| sort -k4,4n > primers.final
     wc -l primers.final
     #	34545 primers.final
     wc -l ../../STSmarkers/primer/primers.final
     #	34513 primers.final
 
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
     # stsMarkers.final is empty for mouse
     touch stsMarkers.final dummy
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
 	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
     wc -l stsMarkers_pos.rdb
     #	33048 stsMarkers_pos.rdb
     wc -l ../STSmarkers/stsMarkers_pos.rdb
     #	33075 stsMarkers_pos.rdb
 
     PATH=~/kent/src/hg/stsMarkers:$PATH \
     ~/kent/src/hg/stsMarkers/createStsBed \
 	mm8.stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
 	| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
     #	The sed removes unneeded blanks
     #	verify score profile remains similar
     awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
     #	  546 500
     #	 1650 750
     #	27705 1000
     awk -F'\t' '{print $5}' ../STSmarkers/stsMapMouse.bed | sort -n | uniq -c
     #	  546 500
     #	 1648 750
     #	27692 1000
 
     wc -l stsMapMouse.bed
     #	29901  stsMapMouse.bed
     wc -l ../STSmarkers/stsMapMouse.bed
     #	29888  stsMapMouse.bed
 
     #  loading STS markers tables
     ssh hgwdev
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
     ~/kent/src/hg/stsMarkers/ucscAlias.pl \
 	mm8.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
     #	this does leave messages in ucscStsAlias.warnings but they seem
     #	to be very similar to Mm6 with just a few new ones
      
     wc -l ucscStsAlias.tab
     #	146767  ucscStsAlias.tab
     wc -l ../STSmarkers/ucscStsAlias.tab
     #	146064  ucscStsAlias.tab
 
     #	After extensive comparison with the currently existing STS markers, it
     #	appears that this new set only has a couple of new ones, and a couple
     #	of old ones have been dropped.  It seems that the primary correction has
     #	been to the marker positions.
 
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/STSmarkers.2006-08-29
     #	Saving the existing tables for archival purposes
     hgsql -e "alter table stsInfoMouseNew rename as stsInfoMouseNewFeb2006;" mm8
     hgsql -e "alter table stsAlias rename as stsAliasFeb2006;" mm8
     hgsql -e "alter table all_sts_primer rename as all_sts_primerFeb2006;" mm8
     hgsql -e "alter table stsMapMouseNew rename as stsMapMouseNewFeb2006;" mm8
 
     hgsql mm8 < ~/kent/src/hg/lib/stsAlias.sql
     hgsql -e \
 	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm8
     hgsql mm8 < ~/kent/src/hg/lib/stsMapMouseNew.sql
     hgsql -e \
 'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm8
     hgsql mm8 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
     hgsql -e \
      'load data local infile "mm8.stsInfoMouse.bed" into table stsInfoMouseNew;' mm8
 
     hgLoadPsl -nobin -table=all_sts_primer mm8 primer/primers.psl.filter
     # load of all_sts_primer did not go as planned: 34563 record(s), 0
     # row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
     #	After warnings, checkTableCoords to find problems:
     checkTableCoords -verboseBlocks mm8 all_sts_primer
 # mm8.all_sts_primer item 61999 chr10:62418012-62418048: blocks 0 and 1 overlap.
 # mm8.all_sts_primer has 1 records with overlapping blocks.
     #	Strip the offending item from the load:
     #	Verify the grep takes out only one item:
     wc -l primer/primers.psl.filter
     #	34563 primer/primers.psl.filter
     grep -P "\t61999\t" primer/primers.psl.filter | wc -l
     #	1
     #	and thus leaves the rest
     grep -v -P "\t61999\t" primer/primers.psl.filter | wc -l
     #	34562
     grep -v -P "\t61999\t" primer/primers.psl.filter > fixed.primers.psl.filter
     hgLoadPsl -nobin -table=all_sts_primer mm8 fixed.primers.psl.filter
 
     # load primer sequences	
     rm /gbdb/mm8/stsMarker/mouseP.fa
     ln -s /cluster/data/mm8/bed/STSmarkers.2006-08-29/mouseP.fa \
 	/gbdb/mm8/stsMarker/mouseP.fa
     # PLEASE NOTE: If you are going to reload this business, use the
     #	-replace option on this hgLoadSeq
     #	hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
     # otherwise there will be a problem that the seq and extFile tables 
     # will be out of sync. 
     hgLoadSeq -replace mm8 /gbdb/mm8/stsMarker/mouseP.fa
     #  Adding /gbdb/mm8/stsMarker/mouseP.fa
     #  34666 sequences
     #	Warning: load of seq did not go as planned: 34666 record(s),
     #	0 row(s) skipped, 1 warning(s) loading ./seq.tab
 
 
     featureBits mm8 all_sts_primer
     #	3700897 bases of 2567283971 (0.144%) in intersection
     featureBits mm8 all_sts_primerFeb2006
     #	3746196 bases of 2567283971 (0.146%) in intersection
     featureBits mm7 all_sts_primer
     #	3757119 bases of 2583394090 (0.145%) in intersection
     featureBits mm6 all_sts_primer
     #	3677372 bases of 2597150411 (0.142%) in intersection
     featureBits mm8 stsMapMouseNew
     #	4812616 bases of 2567283971 (0.187%) in intersection
     featureBits mm8 stsMapMouseNewFeb2006
     #	4801964 bases of 2567283971 (0.187%) in intersection
     featureBits mm7 stsMapMouseNew
     #	4805958 bases of 2583394090 (0.186%) in intersection
     featureBits mm6 stsMapMouseNew
     #	4638338 bases of 2597150411 (0.179%) in intersection
 
     hgsql -N mm8 -e "select count(*) from stsAlias;"
     #	146767
     hgsql -N mm8 -e "select count(*) from stsAliasFeb2006;"
     #	141981
     hgsql -N mm7 -e "select count(*) from stsAlias;"
     #	140649
     hgsql -N mm6 -e "select count(*) from stsAlias;"
     #	137738
     hgsql -N mm5 -e "select count(*) from stsAlias;"
     #	122944
     hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
     #	60440
     hgsql -N mm7 -e "select count(*) from stsInfoMouseNew;"
     #	59843
     hgsql -N mm6 -e "select count(*) from stsInfoMouseNew;"
     #	58980
     hgsql -N mm5 -e "select count(*) from stsInfoMouseNew;"
     #	58493
 
     #	compare old and new name lists, not much difference:
     awk '{print $4}' stsMapMouse.bed | sort -u > mm8.nameList
     #	in common with previous version
     comm -12 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
     #	28687
     #	unique to previous version
     comm -23 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
     #	11
     #	unique to this new set
     comm -13 ../STSmarkers/mm8.nameList mm8.nameList | wc -l
     #	20
 
 ##########################################################################
 # N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)
     cd /cluster/data/mm8/bed/nscan/
 
     # obtained NSCAN predictions from michael brent's group
     # at WUSTL
     mv ardor.wustl.edu/jeltje/mm8/chr_ptx .
     rm -rf ardor.wustl.edu
     rm chr_*/index.html*
     gzip chr_*/*
     chmod a-w chr_*/*.gz
 
     # load tracks.  Note that these have *utr features, rather than
     # exon features.  currently ldHgGene creates separate genePred exons
     # for these.
     ldHgGene -bin -gtf -genePredExt mm8 nscanGene chr_gtf/chr*.gtf.gz
 
     # load protein, add .1 suffix to match transcript id
     hgPepPred -suffix=.1 mm8 generic nscanPep chr_ptx/chr*.fa.gz
     rm *.tab
 
     # update trackDb; need a mm8-specific page to describe informants
     mouse/mm8/nscanGene.html   (copy from hg18 and edit)
     mouse/mm8/trackDb.ra
     # changed search regex to
         termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]
 
 
 #####################################################################
 # SEGMENTAL DUPLICATIONS (DONE 9/18/06 angie)
     # File emailed from Ginger Cheng <ginger2@u.washington.edu>
     mkdir /cluster/data/mm8/bed/genomicSuperDups
     cd /cluster/data/mm8/bed/genomicSuperDups
     awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm8_WGAC.tab \
     | hgLoadBed mm8 genomicSuperDups stdin \
       -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
     # 8/29/07 Gak!  Kayla found that the strand values were "+" and "_" -- fix:
     hgsql mm8 -e 'update genomicSuperDups set strand = "-" where strand = "_";'
 
 
 #####################################################################
 # CELERA COVERAGE (WSSD -- DEPTH OF COVERAGE) (DONE 10/16/06 angie)
     # File emailed from Ginger Cheng <ginger2@u.washington.edu>
     mkdir /cluster/data/mm8/bed/wssd
     cd /cluster/data/mm8/bed/wssd
     tail +2 mm8_WSSD_DOC.tab \
     | hgLoadBed mm8 wssdCoverage stdin
 
 
 #####################################################################
 ## NIA Mouse Gene Index - (DONE, Fan, 10/6/06)
 #       requested by: Dudekula, Dawood (NIH/NIA/IRP) DudekulaDB@grc.nia.nih.gov
     ssh hgwdev 
     mkdir -p /cluster/data/mm8/bed/NIAGene061003
     cd /cluster/data/mm8/bed
     ln -s NIAGene061003 NIAGene
     cd NIAGene
     wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-fasta.fa.gz
     wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm8/download/T-psl.txt.gz
     gzip -d *.gz
     
     cut -f 1-21 T-psl.txt >NIAGene.tab
     hgLoadPsl mm8 NIAGene.tab
 
     mkdir /gbdb/mm8/NIAGene
     ln -s /cluster/data/mm8/bed/NIAGene/T-fasta.fa /gbdb/mm8/NIAGene/T-fasta.fa
     
     hgLoadSeq mm8 /gbdb/mm8/NIAGene/T-fasta.fa
 
 # Create/edit/check in NIAGene.html and trackDb.ra under
     
         kent/src/hg/makeDb/trackDb/mouse/mm8
 
 #####################################################################
 # LOAD GENEID GENES (DONE - 2006-10-09 - Fan)
     ssh hgwdev
     mkdir -p /cluster/data/mm8/bed/geneid/download
     cd /cluster/data/mm8/bed/geneid/download
 
     bash
     awk '{print $1}' ../../../chrom.sizes | while read C
     do
       echo $C
       wget --timestamping \
       http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.gtf
       wget --timestamping \
       http://genome.imim.es/genepredictions/M.musculus/mmMar2006/geneid_v1.2/$C.prot
     done
     exit
 
     # Add missing .1 to protein id's
 
     foreach f (*.prot)
       perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
     end
     cd ..
     ldHgGene -genePredExt -gtf mm8 geneid download/*.gtf
 
 #Read 35954 transcripts in 284585 lines in 34 files
 # 35954 groups 34 seqs 1 sources 3 feature types
 # 35954 gene predictions
 
     hgPepPred mm8 generic geneidPep download/*-fixed.prot
     featureBits mm8 -enrichment refGene geneid
 # refGene 1.842%, geneid 1.592%, both 0.883%, cover 47.95%, enrich 30.13x
     featureBits mm7 -enrichment refGene geneid
 # refGene 1.835%, geneid 1.579%, both 0.866%, cover 47.18%, enrich 29.88x
 
 
 #####################################################################
 # RN4 RECIPROCAL BEST CHAINS/NETS (DONE - 2006-10-10 - Angie)
     doRecipBest.pl mm8 rn4 \
       >& /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log &
     tail -f /cluster/data/mm8/bed/blastz.rn4/axtChain/recipBest.log
 
 
 ##############################################################################
 
 ############################################################################
 # Load CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2006-10-10 markd)
 
     cd /cluster/data/genbank/data/ccds/
     ftp ftp-private.ncbi.nih.gov (user ccds, needs password)
     get CCDS.20061010.tar.gz
     mkdir /scratch/tmp/ccds
     cd /scratch/tmp/ccds
     tar -zxf /cluster/data/genbank/data/ccds/CCDS.20061010.tar.gz
 
     # import ccds database tables
     /cluster/data/genbank/bin/x86_64/ccdsImport ccds data/*.txt
 
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
     checkTableCoords mm8 -verbose=2 ccdsGene
     joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/kent/src/hg/makeDb/schema/all.joiner
     rm -rf /scratch/tmp/ccds
 
     # build initial version of ccdsMgcMap table
     # (full path to the genbank binary: the working directory
     # /scratch/tmp/ccds was removed in the previous step)
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=mm8 ccdsGene mgcGenes ccdsMgcMap
 
     
     # load trackDb
     cd kent/src/hg/makeDb/trackDb
     make alpha
 
     # request push of 
         ccdsGene
         ccdsInfo
         ccdsKgMap
 
     # << emacs
 
 
 ############################################################################
 # JAX TRACKS (DONE 10/20/06 angie - UPDATED 7/18/07, 9/27/07)
+# Table jaxQTL renamed to jaxQtl on 1/7/10 (see NOTE FOR NEXT TIME below)
     ssh kkstore04
     mkdir /cluster/data/mm8/bed/jax/2007_09
     cd /cluster/data/mm8/bed/jax/2007_09
     wget ftp://ftp.informatics.jax.org/pub/gbrowse/\*
     wget ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
 
     # Jax Rep Transcript track
     # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
     # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
     # -- aliases ~ MGI:\d+
     # Use simple perl script to uniquify transcript names and make alias.tab.
     # Inspired by the mm6 version, but format has changed.
     ../2007_07/parseRepTranscript.pl SEQ_RepTransGenomic_rpt.gff \
       > jaxRepTranscript.gff
 
     # Jax Allele track
     # AL_*.gff --> jaxAllele{,Info}
     # -- bed12Source -- add type from filename
     # -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
     # -- Info: name, mgiID, source {"Gene trapped", ...}
     cp ../2007_07/parseAllele.pl .
     # Edit to accommodate latest format tweaks.
     rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
     foreach f (AL*.gff)
       set type = `echo $f:t:r \
         | sed -e 's/AL_//; s/GTRAP/GeneTrapped/; s/IND/Induced/; \
             s/OTHER/Other/; s/SPON/Spontaneous/; s/TARG/Targeted/; \
             s/TRANS/Transgenic/;'`
       parseAllele.pl $f \
       | ldHgGene mm8 placeholder stdin -nobin -out=stdout \
       | /cluster/bin/scripts/genePredToBed \
       | sed -e 's/$/'"\t$type"'/' \
       >> jaxAllele.bed
     end
     # This round's formatting inconsistencies:
 #source not given for NM_015770_a<jIs(17_In2)1Gso>
 #source not given for NM_029931_Mllt3<T(4Mllt3_9Mll)1Thr>
 #source not given for NM_009521_Wnt3<In(11Trp53_11Wnt3)8Brd>
 #source not given for NM_011640_Trp53<In(11Trp53_11Wnt3)8Brd>
 #source not given for NM_001081049_Mll1<T(4Mllt3_9Mll)1Thr>
 #Missing > for mRNA name NM_001081193_Lemd3<Gt(XST167)Byg
 
     # Jax Phenotype track
     # MP_*.gff --> jaxPhenotype{,Alias}
     # -- bed12Source -- add type from filename
     # -- names like NM_001001488_Atp8b1
     rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
     foreach f (MP_*.gff)
       set type = `echo $f:t:r \
         | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
                     s@AdiposeTissue@Adipose@; \
                     s@BehaviorNeurological@Behavior@; \
                     s@CardiovascularSystem@Cardiovascular@; \
                     s@DigestiveAlimentary@Digestive@; \
                     s@EndocrineExocrineGland@Gland@; \
                     s@GrowthSize@Growth Size@; \
                     s@HearingEar@Hearing/Ear@; \
                     s@HematopoieticSystem@Hematopoietic@; \
                     s@HomeostasisMetabolism@Homeostasis@; \
                     s@ImmuneSystem@Immune@; \
                     s@LethalityEmbryonicPerinatal@Embryonic Lethal@; \
                     s@LethalityPostnatal@Postnatal Lethal@; \
                     s@LifeSpanPostWeaningAging@Life Span@; \
                     s@LimbsDigitsTail@Limbs and Tail@; \
                     s@LiverBiliarySystem@Liver and Bile@; \
                     s@NervousSystem@Nervous System@; \
                     s@RenalUrinarySystem@Renal/Urinary@; \
                     s@ReproductiveSystem@Reproductive@; \
                     s@RespiratorySystem@Respiratory@; \
                     s@SkinCoatNails@Skin/Coat/Nails@; \
                     s@TasteOlfaction@Taste/Smell@; \
                     s@TouchVibrissae@Touch@; \
                     s@Tumorigenesis@Tumorigenesis@; \
                     s@VisionEye@Vision/Eye@;'`
       echo $type
       ../2006_10/parsePhenotype.pl $f \
       | ldHgGene mm8 placeholder stdin -nobin -out=stdout \
       | /cluster/bin/scripts/genePredToBed \
       | sed -e 's@$@'"\t$type"'@' \
       >> jaxPhenotype.bed
     end
     sort -u jaxPhenotypeAlias.tab > tmp
     mv tmp jaxPhenotypeAlias.tab
 
     # Jax QTL track
     # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
     # and CM distance for 2, or those plus flanking markers for 3...
     perl -wpe 'chomp; s/\s*$//; \
       ($chr, undef, undef, $start, $end, undef, $strand, undef, $info) = \
         split("\t"); \
       if ($info =~ /QTL (\w+);  Dbxref "(MGI:\d+)"; Alias .*;  Note "([^"]+)"/) { \
         ($name, $mgiID, $desc) = ($1, $2, $3); \
       } else { die "parse\n$info"; } \
       $start--; \
       s/^.*$/$chr\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
       QTL_build36_03_alias.gff > jaxQtl.bed
 
     # Extract phenotype-allele relationships:
     # Make a file for the one code not already in a filename:
     cp /dev/null MP_0003012_no_phenotypic_analysis
     # Wrote a script to extract the phenotype-allele relationships --
     # it uses the filenames to map MP:* codes to our phenotype names.
     ../2007_07/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt \
       > jaxAllelePheno.tab
     # The file "err" has messages about missing data (no gene name in 
     # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).
 
     # Load tables
     ssh hgwdev
     cd /cluster/data/mm8/bed/jax/2007_09
     # jaxRepTranscript
     ldHgGene mm8 jaxRepTranscript jaxRepTranscript.gff
     hgsql mm8 < fixJaxRepTranscript.sql
     sed -e 's/genericAlias/jaxRepTranscriptAlias/g' \
       ~/kent/src/hg/lib/genericAlias.sql > jaxRepTranscriptAlias.sql 
     hgLoadSqlTab mm8 jaxRepTranscriptAlias \
       jaxRepTranscriptAlias.sql jaxRepTranscriptAlias.tab
     # jaxAllele
     sed -e 's/bed12Source/jaxAllele/g' \
       $HOME/kent/src/hg/lib/bed12Source.sql > jaxAllele.sql
     hgLoadBed -sqlTable=jaxAllele.sql mm8 jaxAllele jaxAllele.bed
     hgsql mm8 < fixJaxAllele.sql
     hgLoadSqlTab mm8 jaxAlleleInfo \
       ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
     # jaxPhenotype
     sed -e 's/bed12Source/jaxPhenotype/g' \
       $HOME/kent/src/hg/lib/bed12Source.sql > jaxPhenotype.sql
     hgLoadBed -tab -sqlTable=jaxPhenotype.sql mm8 jaxPhenotype jaxPhenotype.bed
     hgsql mm8 < fixJaxPhenotype.sql
     sed -e 's/genericAlias/jaxPhenotypeAlias/' \
       ~/kent/src/hg/lib/genericAlias.sql > jaxPhenotypeAlias.sql
     hgLoadSqlTab mm8 jaxPhenotypeAlias \
       jaxPhenotypeAlias.sql jaxPhenotypeAlias.tab
 ### NOTE FOR NEXT TIME ###
 ### Call the table jaxQtl instead of jaxQTL -- QA doesn't like jaxQTL.
+### (brooke) In fact, QA renamed the table to jaxQtl on 1/7/10 on hgwdev and
+### mysqlbeta with this command:  mysql> alter table jaxQTL rename to jaxQtl;
+### (to make trackDb load with a single trackDb.ra entry for mm8 and mm9)
 ### Use -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql .
     # jaxQTL
     hgLoadBed -tab -notItemRgb -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/jaxQTL.sql \
       mm8 jaxQTL jaxQtl.bed
     checkTableCoords -verbose=2 mm8 jaxQTL
 #mm8.jaxQTL item Scpro11 chr18:131504376-131504512: chromEnd > chromSize 90736837
 #mm8.jaxQTL item Tswt chr18:134822025-134822132: chromEnd > chromSize 90736837
 #mm8.jaxQTL item Ath13 chr14:164794113-164794369: chromEnd > chromSize 123978870
 #mm8.jaxQTL item Dob7 chr11:131434708-131434798: chromEnd > chromSize 121798632
     # Fix coords > chromSize:
     perl -wpe 's/^(\w+)\t(\d+)$/ \
       delete from jaxQTL where chrom="$1" and chromStart >= $2; \
       update jaxQTL set chromEnd = $2 where chrom="$1" and chromEnd > $2;/' \
       ../../../chrom.sizes \
     | hgsql mm8
     checkTableCoords -verbose=2 mm8 jaxQTL
     # phenotype-allele relationships
     hgLoadSqlTab mm8 jaxAllelePheno \
       ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab
 
     # Check joiner:
     runJoiner.csh mm8 jaxAllele
     runJoiner.csh mm8 jaxPhenotype
 
 
 ##########################################################################
 # SWAP/CHAIN/NET GASACU1 (DONE 10/23/06 angie)
     ssh kkstore04
     mkdir /cluster/data/mm8/bed/blastz.gasAcu1.swap
     cd /cluster/data/mm8/bed/blastz.gasAcu1.swap
     doBlastzChainNet.pl -swap /cluster/data/gasAcu1/bed/blastz.mm8/DEF \
       -chainMinScore=2000 -chainLinearGap=loose >& do.log & tail -f do.log
     ln -s blastz.gasAcu1.swap /cluster/data/mm8/bed/blastz.gasAcu1
     nice featureBits mm8 chainGasAcu1Link
 #52781141 bases of 2567283971 (2.056%) in intersection
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET FELCAT3 (Done Nov 15 2006 heather)
 # working in /cluster/data/felCat3 because /cluster/data/mm8 is 94% full
     mkdir /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
     ln -s /cluster/data/felCat3/bed/blastz.mm8.2006-11-14 /cluster/data/mm8/bed/blastz.felCat3
     cd /cluster/data/felCat3/bed/blastz.mm8.2006-11-14
     cat << '_EOF_' > DEF
 
 BLASTZ_M=50
 
 # TARGET: Mouse mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Cat felCat3 
 SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
 SEQ2_LEN=/san/sanvol1/scratch/felCat3/chrom.sizes
 # Maximum number of scaffolds that can be lumped together
 SEQ2_LIMIT=500
 SEQ2_CHUNK=30000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/felCat3/bed/blastz.mm8.2006-11-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
 
     # Every option line needs a trailing backslash so the whole
     # invocation is a single command.
     doBlastzChainNet.pl DEF \
       -bigClusterHub pk \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/felCat3/blastz.mm8 >& do.log &
     tail -f do.log
 
     nice featureBits -chrom=chr1 mm8 chainFelCat3Link
     # 36333124 bases of 191450312 (18.978%) in intersection
 
 
 #########################################################################
 # BLASTZ/CHAIN/NET BOSTAU3 (Done March 2007 heather)
     mkdir /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
     ln -s /cluster/data/mm8/bed/blastz.bosTau3.2007-03-14 /cluster/data/mm8/bed/blastz.bosTau3
     cd /cluster/data/mm8/bed/blastz.bosTau3
     cat << '_EOF_' > DEF
 
 BLASTZ_M=50
 
 # TARGET: Mouse mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=10000000
 SEQ1_LAP=10000
 
 # QUERY: Cow bosTau3
 SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
 SEQ2_LEN=/san/sanvol1/scratch/bosTau3/chrom.sizes
 SEQ2_LIMIT=500
 SEQ2_CHUNK=50000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastz.bosTau3.2007-03-14
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -bigClusterHub pk \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/bosTau3/blastz.mm8 >& do.log &
     tail -f do.log
 
     nice featureBits -chrom=chr1 mm8 chainBosTau3Link
     # 49896121 bases of 191450312 (26.062%) in intersection
 
 
 #############################################################################
 #  REBUILD miRNA TRACK (DONE - 2006-12-01 - Fan)
     #   updated data from: Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir miRNA-2006-12-01
     cd miRNA-2006-12-01
     # save the mmu8_miRNA.txt file from email
 
     # add the following line in mmu8_miRNA.txt per email from Michel.
 
     chrM      16114   16209   mmu-mir-805     480     -
     
     hgLoadBed -strict mm8 miRNA  mmu8_miRNA.txt 
 
 # check previous release track before update
     featureBits mm8 miRNA
     # 33033 bases of 2567283971 (0.001%) in intersection
     featureBits mm7 miRNA
     # 20620 bases of 2583394090 (0.001%) in intersection
 
 #############################################################################
 # Create Allen Brain Atlas mapping. (Done 2007-02-08 Galt)
 # We are creating several things: a psl probe-track for the RR on mouse,
 # a link out from kg to the probe to the ABA website, 
 # and a set of gene/probe info which visiGene will use.
 # (This needs to be done after have created sequences in
 # ncbiXm and tigrMgiTc as above.)
 
 # metadata.log and SRGEsequence.log was provided by 
 #  Susan Sunkin <SusanS@alleninstitute.org>
 # this is an update to the visiGene with 6000 new images.
 
 # See mm6.txt for steps not needing to be repeated.
 
 # copy in the data files (directory already exists from previous build)
     ssh hgwdev
     cd /cluster/data/mm8/bed/allenBrain
     mkdir old
     mv * old/
     cp /cluster/data/mm6/bed/allenBrain/allen20061204.tab .
     cp /cluster/data/mm6/bed/allenBrain/probeSeq.20061204.fasta .
     cp /cluster/data/mm6/bed/allenBrain/allProbes.fa .
     cp /cluster/data/mm6/bed/allenBrain/allProbes.tab .
     cp /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .
 
 
 # Set up a blat run to align the probes.
     ssh pk
     cd /cluster/data/mm8/bed/allenBrain
     mkdir split
     faSplit sequence allProbes.fa 200 split/rp
     mkdir run
     cd run
     ls -1 ../split/*.fa > mrna.lst
     ls -1 /scratch/hg/mm8/nib/*.nib > genome.lst
     mkdir psl
     cat << '_EOF_' > gsub
 #LOOP
 blat -ooc=/scratch/hg/h/mouse11.ooc $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}
 #ENDLOOP
 '_EOF_'
     # << happy emacs
 
     gensub2 genome.lst mrna.lst gsub spec
     para create spec
 # Then do the usual para try/push/time/check until the run is finished
 #Completed: 6596 of 6596 jobs
 #CPU time in finished jobs:      27258s     454.30m     7.57h    0.32d  0.001 y
 #IO & Wait Time:                 19700s     328.33m     5.47h    0.23d  0.001 y
 #Average job time:                   7s       0.12m     0.00h    0.00d
 #Longest running job:                0s       0.00m     0.00h    0.00d
 #Longest finished job:              39s       0.65m     0.01h    0.00d
 #Submission to last job:           549s       9.15m     0.15h    0.01d
 
 # Then do sorting and near-best-in-genome step on file server
     ssh kkstore
     cd /cluster/data/mm8/bed/allenBrain/run
     pslSort dirs raw.psl tmp psl
     pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 -nearTop=0.001 /dev/null
     sort -k 14,14 -k 16,16n ../best.psl > ../allenBrainAli.psl
 
 # Clean up big files no longer needed
     rm raw.psl
     rm -r psl
     rm -r ../split
 
 # Load up database
     ssh hgwdev
     cd /cluster/data/mm8/bed/allenBrain
 
 # Make a new table that contains the URLs for the allen brain genes
 # Make this one first since all.joiner considers it the master table.
 
     hgsql mm8 -e 'drop table allenBrainUrl'
     hgsql mm8 < ~/kent/src/hg/lib/allenBrainUrl.sql
     hgsql mm8 -e 'load data local infile "allenBrainUrl.tab" into table allenBrainUrl'
 
 # Make probe alignment table, and load sequence.
     hgLoadPsl mm8 allenBrainAli.psl
     rm /gbdb/mm8/allenBrain/allProbes.fa
     ln -s /cluster/data/mm8/bed/allenBrain/allProbes.fa /gbdb/mm8/allenBrain/allProbes.fa
     hgLoadSeq -replace mm8 /gbdb/mm8/allenBrain/allProbes.fa
 
 # Make mapping between known genes and allenBrain
     hgMapToGene mm8 allenBrainAli -type=psl knownGene knownToAllenBrain 
 
 
 
 ##########################################################################
 #  xxBlastTab - Help filter out unwanted paralogs  (Galt 2007-01-11)
 #
 # We are starting with xxBlastTab tables already built in the usual way with
 # blastall/blastp, probably with doHgNearBlastp.pl script.
 #
 # we want to update mm8 for human and rat, 
 # so check ./hgGeneData/Mouse/mm8/otherOrgs.ra for current settings
 
 ssh hgwdev
 
 synBlastp.csh mm8 hg18 
 #mm8.hgBlastTab
 #new number of unique query values:
 #25178
 #new number of unique target values
 #15328
 #old number of unique query values:
 #28286
 #old number of unique target values
 #15901
 
 
 synBlastp.csh mm8 rn4
 #mm8.rnBlastTab:
 #new number of unique query values:
 #11163
 #new number of unique target values
 #6573
 #old number of unique query values:
 #23183
 #old number of unique target values
 #6890
 
 
 
 ##########################################################################
 # GenBank gbMiscDiff table (markd 2007-01-10)
 # Supports `NCBI Clone Validation' section of mgcGenes details page
 
    # genbank release 157.0 now contains misc_diff fields for MGC clones
    # reloading mRNAs results in gbMiscDiff table being created.
    ./bin/gbDbLoadStep -reload -srcDb=genbank -type=mrna mm8
 
 ##########################################################################
 ## WindowMasker (DONE - 2007-01-30 - Hiram)
     ssh kolossus
     mkdir /cluster/data/mm8/bed/WindowMasker.2007-01-29
     cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
     # copy *.csh scripts from
     # /cluster/data/danRer4/bed/WindowMasker.2006-12-04
     #	and fixup the db name and work directory in those scripts, then:
     time nice -n +19 ./doCount.csh > doCount.out 2>&1
     #	real    67m32.178s
     time nice -n +19 ./doSdust.csh >doSdust.out 2>&1
     #	real    477m24.667s
     ssh kkstore04
     cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
     gzip windowmasker.sdust.bed
     time nice -n +19 ./applyMask.csh > applyMask.out 2>&1
     time nice -n +19 ./addTrf.csh > addTrf.out 2>&1
     twoBitToFa mm8.sdTrf.2bit stdout | faSize stdin
     #	2664455088 bases (97171400 N's 2567283688 real 1644888505 upper
     #	922395183 lower) in 34 sequences in 1 files
     ssh hgwdev
     cd /cluster/data/mm8/bed/WindowMasker.2007-01-29
     
 
 ##########################################################################
 ## AUGUSTUS ab initio predictions (DONE, 2007-01-30 - Mario)
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/augustus
     cd /cluster/data/mm8/bed/augustus
 
     # get the program AUGUSTUS, e.g. from the web
     wget http://augustus.gobics.de/binaries/augustus.2.0.1.src.tar.gz
     # unpack
     tar xzf augustus.2.0.1.src.tar.gz
 
     # compile the binary if necessary
     cd augustus/src
     make augustus
 
     # create output directory
     cd /cluster/data/mm8/bed/augustus
     mkdir out err
 
     # create file with sequences and their sizes by modifying chrom.sizes
     cat ../../chrom.sizes | perl -e 'while(<>){s/chr([0-9a-zA-Z]+)(_random|)/\/cluster\/data\/mm8\/$1\/chr$1$2.fa.masked/; print;}' > seq.lst
 
     # create the job list
     augustus/scripts/createAugustusJoblist.pl --sequences seq.lst --chunksize 5300000 --overlap 300000 --command "/cluster/data/panTro2/bed/augustus/augustus/src/augustus --AUGUSTUS_CONFIG_PATH=/cluster/data/panTro2/bed/augustus/augustus/config --species=human --sample=100 --/augustus/verbosity=0" --outputdir /cluster/data/mm8/bed/augustus/out/ --errordir /cluster/data/mm8/bed/augustus/err/ --joblist job.lst
 
     # register the job list written by createAugustusJoblist.pl as a
     # parasol batch before running it
     para create job.lst
     para try
     para check
     para push
 
 # CPU time in finished jobs:    2984823s   49747.06m   829.12h   34.55d  0.095 y
 # IO & Wait Time:                 19258s     320.96m     5.35h    0.22d  0.001 y
 # Average job time:                5403s      90.05m     1.50h    0.06d
 # Longest running job:                0s       0.00m     0.00h    0.00d
 # Longest finished job:            7896s     131.60m     2.19h    0.09d
 # Submission to last job:         15716s     261.93m     4.37h    0.18d
 
     # check the error files, should be no errors
     cat err/*.err
 
     cat out/*.gff | augustus/scripts/join_aug_pred.pl > augustus.pep.gff
     augustus/scripts/getAnnoFasta.pl augustus.pep.gff
     cat augustus.pep.gff | egrep "CDS|codon"> augustus.gff
 
     # load into database
 
     ssh hgwdev
     # augustus.gff and augustus.pep.aa were built in the mm8 augustus
     # directory; both tables belong in the mm8 database
     cd /cluster/data/mm8/bed/augustus/
     ldHgGene -bin mm8 augustus augustus.gff
     # 32377 gene predictions
 
     hgPepPred mm8 generic augustusPep augustus.pep.aa
 
     featureBits mm8 augustus
     # 35380585 bases of 2567283971 (1.378%) in intersection
 
 #########################################################################
 ## BLASTZ ANOCAR1 - Lizard - (DONE - 2007-02-19 - 2007-02-20 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
     cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
 
     cat << '_EOF_' > DEF
 # Mouse vs lizard
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Mouse Mm8
 SEQ1_DIR=/san/sanvol1/scratch/mm8/mm8.sdTrf.2bit
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=50000000
 SEQ1_LAP=10000
 
 # QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
 SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
 SEQ2_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl DEF -chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-verbose=2 -bigClusterHub=pk \
 	-blastzOutRoot /cluster/bluearc/mm8AnoCar1 > do.log 2>&1 &
     #	real    544m52.722s
 
     #	appears to have successfully finished
     ssh hgwdev
     cd /cluster/data/mm8/bed/blastz.anoCar1.2007-02-19
     time nice -n +19 featureBits mm8 chainAnoCar1Link \
 	> fb.mm8.chainAnoCar1Link.txt 2>&1
     #	real    1m37.380s
     #	96286498 bases of 2567283971 (3.751%) in intersection
 
     #	running the swap to anoCar1 - instructions in anoCar1.txt
     cd /cluster/data/anoCar1/bed/blastz.mm8.swap
     time nice -n +19 featureBits anoCar1 chainMm8Link \
 	> fb.anoCar1.chainMm8Link.txt 2>&1
     #	real    2m1.527s
     #	82784787 bases of 1741478929 (4.754%) in intersection
 
 #############################################################################
 # UPDATED mm8.knownToVisiGene (DONE galt 2007-02-15)
 
 
 #########################################################################
 # BLASTZ ORNANA1 (PLATYPUS) - (DONE 2007-03-02 angie)
     ssh kkstore04
     mkdir /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
     cd /cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
 
     cat << '_EOF_' > DEF
 # mouse vs. platypus
 
 # Use same params as used for hg18-danRer4
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=6000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: mm8
 SEQ1_DIR=/scratch/hg/mm8/nib
 SEQ1_LEN=/scratch/hg/mm8/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 
 # QUERY: ornAna1
 SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
 SEQ2_LEN=/iscratch/i/ornAna1/chrom.sizes
 SEQ2_CHUNK=20000000
 SEQ2_LIMIT=300
 SEQ2_LAP=0
 
 BASE=/cluster/data/mm8/bed/blastz.ornAna1.2007-02-27
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << emacs
 
     doBlastzChainNet.pl DEF \
       -workhorse kkr6u00 \
       -blastzOutRoot /cluster/bluearc/mm8.ornAna1 \
       >& do.log & tail -f do.log
 
 
 ############################################################################
 # Reload CCDS (ccdsInfo, ccdsGene, ccdsKgMap) (2007-03-02 markd)
 
     # see hg17.txt for build temporary ccds database for CCDS.20070228
 
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds mm8 ccdsInfo ccdsGene
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm8 -loadDb ccdsGene knownGene ccdsKgMap
     checkTableCoords mm8 -verbose=2 ccdsGene
     # update all.joiner to include mm8 in ccdsDb
     joinerCheck -database=mm8 -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
 
     # build initial version of ccdsMgcMap table, updated by nightly genbank update
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -loadDb -db=mm8 ccdsGene mgcGenes ccdsMgcMap
     
     # load trackDb
     cd kent/src/hg/makeDb/trackDb
     make alpha
     # check in browser
 
     # request push of 
         ccdsGene
         ccdsInfo
         ccdsKgMap
         ccdsMgcMap
     # << emacs
 
 
 ############################################################################
 # CGAP SAGE (DONE Andy 2007-03-01)
 ssh hgwdev
 cd san/andy/mouseSage/
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_forward_v36.1.tar.gz
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/SAGE_mm_long_reverse_v36.1.tar.gz
 tar xvfz SAGE_mm_long_forward_v36.1.tar.gz 
 tar xvfz SAGE_mm_long_reverse_v36.1.tar.gz 
 rm *.tar.gz
 chmod a+r -R mm_* 
 chmod +x mm_*
 cd mm_forward/
 cat * | awk 'BEGIN{OFS="\t"}{print $1, $3, $4, $2, 1000, "+"}' > ../unlifted.bed
 cd ../mm_reverse/
 cat * | awk 'BEGIN{OFS="\t"}{print $1, $4, $3, $2, 1000, "-"}' >> ../unlifted.bed
 ctgPosToLft mm8 mm8.lft
 liftUp lifted.bed mm8.lft warn unlifted.bed 
 # NOTE: keep the awk program on ONE line -- a newline inside it splits
 # "thickStart = start - 4" into two statements, and csh does not allow
 # an unescaped newline inside single quotes.
 awk 'BEGIN{OFS="\t"}{strand = $6; start = $2; end = $3; if (strand == "-") { thickStart = end; } else { start = start - 1; thickStart = start - 4; } thickEnd = thickStart + 4; print $1, start, end, $4, $5, strand, thickStart, thickEnd; }' lifted.bed > mapping.bed
 gunzip *.gz
 rm -rf mm_forward/ mm_reverse/ unlifted.bed lifted.bed mm8.lft 
 # NOTE: keep the awk program on ONE line -- a newline inside the
 # "male and female" string literal breaks the comparison.
 # Column 13 (sex) is rewritten as a comma-terminated list; tail drops
 # the header line.
 awk 'BEGIN{FS="\t"}{sex = $13; for (i=1; i<=12; i++) { printf("%s\t", $i); } if (sex == "unknown") { sex = ""; } else if (sex == "male and female") { sex = "male,female,"} else if (sex == "male") { sex = "male,"} else {sex = "female,"}; printf("%s\t", sex); for (i=14; i<=20; i++) { printf("%s\t", $i); } print $21}' Mm.libraries | tail -n +2 > massaged.Mm.libraries
 cgapSageBedAddFreqs -noEmpty mapping.bed Mm_long.frequencies massaged.Mm.libraries cgapSage.bed
 ln -s ~/hg/lib/cgapSage/cgapSageLib.sql 
 ln -s ~/hg/lib/cgapSage/cgapSage.sql 
 hgLoadBed -sqlTable=cgapSage.sql mm8 cgapSage cgapSage.bed
 hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql massaged.Mm.libraries 
 
 ############################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-04-02)
     ssh kkstore04
     bash 
 
     mkdir /cluster/data/mm8/blastDb
     cd /cluster/data/mm8
     ls noMask/*.fa | grep -v random > temp.lst
     ls randomContigs/*.fa >> temp.lst
     cat `cat temp.lst` > temp.fa
     faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
     rm temp.fa
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
 
     mkdir -p /san/sanvol1/scratch/mm8/blastDb
     cd /cluster/data/mm8/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/mm8/blastDb
     done
 
     mkdir -p /cluster/data/mm8/bed/tblastn.hg18KG
     cd /cluster/data/mm8/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/mm8/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 2733 query.lst
 
    # we want around 150000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(150000/`wc query.lst | awk "{print \\\$1}"`\)
 # 36727/(150000/2733) = 669.165940
 
    mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa
    split -l 670 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/mm8/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm8/blastDb.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
 
         if pslCheck -prot $3.tmp
         then
             mv $3.tmp $3
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0 
     fi
 fi 
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     exit # back to bash
     
     ssh pk
     cd /cluster/data/mm8/bed/tblastn.hg18KG
     para create blastSpec
 
     para time
 # Completed: 150315 of 150315 jobs
 # CPU time in finished jobs:   24349624s  405827.07m  6763.78h  281.82d  0.772 y
 # IO & Wait Time:               1825515s   30425.24m   507.09h   21.13d  0.058 y
 # Average job time:                 174s       2.90m     0.05h    0.00d
 # Longest finished job:             673s      11.22m     0.19h    0.01d
 # Submission to last job:         79743s    1329.05m    22.15h    0.92d
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/tblastn.hg18KG
     for i in blastOut/*
     do  
 	echo "cd $i; cat *.psl | pslSortAcc nohead chrom /tmp/ stdin ; cd ../.."
     done > sort.jobs
 
     sh -x sort.jobs
 
     tcsh
     mkdir chainRun
     cd chainRun
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     # chainOne: chain one per-chrom psl ($1) into c.<name>.psl next to it.
     # The input and output paths must be on the same command line.
     cat << '_EOF_' > chainOne
 /cluster/home/braney/bin/x86_64/simpleChain -prot -outPsl -maxGap=150000 $1 `dirname $1`/c.`basename $1`.psl
 '_EOF_'
     chmod +x chainOne
     ls  ../blastOut/*/chrom/*.psl > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh pk
     cd /cluster/data/mm8/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para maxNode 30
     para try, check, push, check etc.
 
 #two batches 
 
 # Completed: 2574 of 2574 jobs
 # CPU time in finished jobs:    3338223s   55637.04m   927.28h   38.64d  0.106 y
 # IO & Wait Time:                 21934s     365.57m     6.09h    0.25d  0.001 y
 # Average job time:                1305s      21.76m     0.36h    0.02d
 # Longest finished job:           88204s    1470.07m    24.50h    1.02d
 # Submission to last job:         92614s    1543.57m    25.73h    1.07d
 
 # Completed: 2871 of 2871 jobs
 # CPU time in finished jobs:    2495054s   41584.24m   693.07h   28.88d  0.079 y
 # IO & Wait Time:                 47207s     786.78m    13.11h    0.55d  0.001 y
 # Average job time:                 885s      14.76m     0.25h    0.01d
 # Longest finished job:           59971s     999.52m    16.66h    0.69d
 # Submission to last job:         78852s    1314.20m    21.90h    0.91d
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
     bash 
     for i in kg??
     do
        cat $i/chrom/c.*.psl|awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/mm8/bed/tblastn.hg18KG/preLift.psl
     cd /cluster/data/mm8/bed/tblastn.hg18KG 
     liftUp -type=.psl -nohead stdout ../../jkStuff/liftAll.lft carry preLift.psl | sort -k 14,14 -k 16,16n -k 17,17n > blastHg18KG.psl
     pslCheck blastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/mm8/bed/tblastn.hg18KG
     hgLoadPsl mm8 blastHg18KG.psl
 
     # check coverage
     nice featureBits mm8 blastHg18KG 
 # 40445290 bases of 2567283971 (1.575%) in intersection
 # In comparison to cat and dog:
     nice featureBits felCat3  blastHg18KG
 # 15218612 bases of 1642698377 (0.926%) in intersection
     nice featureBits canFam2 blastHg18KG
 # 32565727 bases of 2384996543 (1.365%) in intersection
 
     featureBits mm8 refGene:cds blastHg18KG  -enrichment
 # refGene:cds 1.157%, blastHg18KG 1.575%, both 0.927%, cover 80.15%, enrich
 # 50.88x
 
     ssh kkstore04
     rm -rf /cluster/data/mm8/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/mm8/bed/tblastn.hg18KG/blastOut
 #end tblastn
 
 # EXONIPHY MM8, lifted from hg18 (DONE acs 2007-04-08)
 
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir exoniphy
     cd exoniphy
     hgLoadGenePred -genePredExt mm8 exoniphy exoniphyMm8.gp
 
     # exoniphyMm8.gp was prepared at Cornell as follows
     hgsql hg18 -e "select * from exoniphy" --skip-column-names > exoniphyHg18.gp
     liftOver -genePred exoniphyHg18.gp /usr/data/hg18/dbDerived/netSynteny/hg18.mm8.syn.chain exoniphyMm8.gp unmapped
     (where hg18.mm8.syn.chain represents the human/mouse syntenic net)
 
 #########################################################################
 # BLASTZ/CHAIN/NET HORSE (DONE 2/21/07 Fan)
     ssh kkstore05
     mkdir /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
     cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
     cat << '_EOF_' > DEF
 # Horse vs. Mouse
 
 BLASTZ_M=50
 
 # TARGET: Horse equCab1
 SEQ1_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
 SEQ1_LEN=/san/sanvol1/scratch/equCab1/chrom.sizes       
 # Maximum number of scaffolds that can be lumped together
 SEQ1_LIMIT=500     
 SEQ1_CHUNK=30000000
 SEQ1_LAP=10000
 
 # QUERY: Mouse mm8
 SEQ2_DIR=/scratch/hg/mm8/mm8.2bit
 SEQ2_LEN=/cluster/data/mm8/chrom.sizes 
 SEQ2_CHUNK=10000000
 SEQ2_LAP=0
 
 BASE=/cluster/data/equCab1/bed/blastz.mm8.2007-02-17
 TMPDIR=/scratch/tmp
 '_EOF_'
 # Fix script coloring _EOF_ 
     # << this line keeps emacs coloring happy
     doBlastzChainNet.pl DEF \
       -bigClusterHub pk \
       -chainMinScore=3000 -chainLinearGap=medium \
       -blastzOutRoot /cluster/bluearc/equCab1/blastz.mm8 >& do.log &
     tail -f do.log
 
     ssh hgwdev
     cd /cluster/data/equCab1/bed/blastz.mm8.2007-02-17
     ln -s blastz.mm8.2007-02-17 /cluster/data/equCab1/bed/blastz.mm8
     nice featureBits equCab1 -chrom=chr1 chainMm8Link
 # 70800969 bases of 177498097 (39.888%) in intersection
 
     bash
     time nice -n 19 featureBits equCab1 chainMm8Link \
 	> fb.equCab1.chainMm8Link.txt 2>&1
 # 903993981 bases of 2421923695 (37.325%) in intersection
 
     ssh kkstore05
     mkdir /cluster/data/mm8/bed/blastz.equCab1.swap
     cd /cluster/data/mm8/bed/blastz.equCab1.swap
     bash
     time doBlastzChainNet.pl \
 	/cluster/data/equCab1/bed/blastz.mm8.2007-02-17/DEF \
 	-chainMinScore=3000 -chainLinearGap=medium \
 	-verbose=2 -swap -bigClusterHub=pk > swap.log 2>&1 &
     tail -f swap.log
 # real    76m34.873s
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/blastz.equCab1.swap
     bash
     time nice -n 19 featureBits mm8 chainEquCab1Link \
 	> fb.mm8.chainEquCab1Link.txt 2>&1
     # 906568751 bases of 2567283971 (35.312%) in intersection
 
 #########################################################################
 # CGAP SAGE (Done 2007-05-04)
 
 ssh hgwdev
 cd /san/sanVol1/scratch/andy
 mkdir cgapSage.mm8
 cd cgapSage.mm8
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm.libraries.gz
 wget ftp://ftp1.nci.nih.gov/pub/SAGE/MOUSE/Mm_long.frequencies.gz
 hgsql -e 'select * from snp126 where class="single" and locType="exact"' mm8 \
     | tail +2 | cut -f2- > snps.txt
 hgsql -e 'select name from snp126Exceptions where exception="ObservedWrongSize" 
     or exception="SingleClassBetweenLocType" or exception="SingleClassRangeLocType" 
     or exception="MultipleAlignment"' mm8 \
     | tail +2 > exceptions
 tabGrep -v exceptions 4 snps.txt > tmp
 mv tmp snps.txt
 rm exceptions
 hgsql -e 'select chrom,chromStart,chromEnd,name from simpleRepeat' mm8 | tail +2 > trf.bed
 cut -f1-4 snps.txt > snps.bed
 overlapSelect -nonOverlapping trf.bed snps.bed /dev/stdout | cut -f4 > goodSnps.txt
 tabGrep goodSnps.txt 4 snps.txt > tmp
 mv tmp snps.txt
 rm trf.bed goodSnps.txt snps.bed
 ln -s /cluster/data/mm8/mm8.2bit
 ln -s /cluster/data/mm8/chrom.sizes
 ln -s ~/kent/src/hg/lib/cgapSage/cgapSageLib.sql
 tail +2 Mm.libraries | awk -f cleanLibs.awk > libs.txt
 hgLoadSqlTab mm8 cgapSageLib cgapSageLib.sql libs.txt
 partitionSequence.pl -lstDir small 5000000 30 mm8.2bit chrom.sizes 0 > sequence.lst
 grep -v small sequence.lst > seq.lst
 cat small/* >> seq.lst
 mv seq.lst sequence.lst
 rm -rf small/
 for part in `cat sequence.lst`; do ./doJobList.sh $part >> jobList; done
 ssh pk
 cd /san/sanVol1/scratch/andy/cgapSage.mm8
 para create jobList
 para try
 para push
 # takes like 5-10 min
 exit # back to hgwdev
 find output/ -name '*.bed' -exec cat '{}' >> output.bed \;
 cgapSageDupeRemove output.bed tmp.bed
 cgapSageDupeRemove -unique tmp.bed final.bed
 ln -s ~/kent/src/hg/lib/cgapSage/cgapSage.sql
 hgLoadBed -sqlTable=cgapSage.sql -tab mm8 cgapSage final.bed 
 
 #############################################################################
 #  REBUILD miRNA TRACK (DONE - 2007-05-31 - Fan)
     #   updated data from: Michel.Weber@ibcg.biotoul.fr
     #   notify them when done.
     ssh hgwdev
     cd /cluster/data/mm8/bed
     mkdir miRNA-2007-05-31
     cd miRNA-2007-05-31
     # save the mouse_miRNA_track_may2007.txt file from email
 
     cat mouse_miRNA_track_may2007.txt|sed -e 's/ /\t/g' > miRNA.tab
 
     hgLoadBed mm8 miRNA miRNA.tab
 
 # check previous release track before update
     featureBits mm8 miRNA
     #33398 bases of 2567283971 (0.001%) in intersection
 
     featureBits mm7 miRNA
     # 20620 bases of 2583394090 (0.001%) in intersection
 
 
 #############################################################################
 # LIFTOVER TO MM9 (DONE 7/25/07 angie)
     ssh kkstore04
     # -debug run to create run dir, preview scripts...
     doSameSpeciesLiftOver.pl -debug mm8 mm9 \
       -ooc /san/sanvol1/scratch/mm8/11.ooc
     # Real run:
     cd /cluster/data/mm8/bed/blat.mm9.2007-07-24
     doSameSpeciesLiftOver.pl mm8 mm9 \
       -ooc /san/sanvol1/scratch/mm8/11.ooc \
       >& do.log & tail -f do.log
 
 
 #############################################################################
 # CONTRAST GENES (2007-10-02 markd)
 # received predictions from Sam Gross <ssgross@stanford.edu>
 
     cd /cluster/data/mm8/bed/contrastGene/
     wget http://www.stanford.edu/~ssgross/contrast.mm8.bed
     # this is a custom track, not a pure BED
     tail +2 contrast.mm8.bed | hgLoadBed -tab mm8 contrastGene stdin
 
     # verify 
     # load track db (ra and contrastGene.html are global)
     # request push of contrastGene
 
 ###########################################################################
 #  loading affy mouse Exon probes and transcripts (DONE - 2007-10-04 - Hiram)
     # data was supplied from Venu Valmeekam Venu_Valmeekam@affymetrix.com
     #	dropped via FTP to genome-test
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/affyMoEx1
     cd /cluster/data/mm8/bed/affyMoEx1
     # the files received:
 # -rw-r--r--  1  8909954 Oct  3 10:48 transcript_cluster_mm.bed.gz
 # -rw-r--r--  1 48178714 Oct  4 13:35 probe_mm_score.bed.gz
     # loading:
     hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Probe probe_mm_score.bed.gz
     #	Loaded 4549897 elements of size 6
     hgLoadBed -tmpDir=/scratch/tmp mm8 affyMoEx1Transcript \
 	transcript_cluster_mm.bed.gz
     # Loaded 270140 elements of size 12
     # working on description pages for these with Venu.
 
     #	I manually set the scores in the affyMoEx1Transcript track to
     #	1000 so it would work OK (not color) with the useScore 1 so that
     #	the affyMoEx1Probe would color itself on the score
 
 ###########################################################################
 # LIFT RM ALIGN FILES, MAKE PER-CHROM DOWNLOADS (DONE 12/7/07 angie)
 # Lifting of .align files is now automated by doRepeatMasker.pl, but we
 # got a user request for .align files from this pre-automation db.
     ssh kkstore04
     cd /cluster/data/mm8
     mkdir downloads/RMalign
     foreach c (?{,?})
       echo linking/lifting to contigs of $c:t
       foreach ctgdir ($c/chr$c{,_random}_?{,?})
         set ctg = $ctgdir:t
         if (! -f $ctgdir/$ctg.fa.align) then
           pushd $ctgdir
           liftRMAlign.pl $ctg.lft > $ctg.fa.align
           popd
         endif
         ln -s $ctg/$ctg.fa.align $c/
       end
       set chr = chr$c:t
       if (-e $c/lift/ordered.lft && ! -z $c/lift/ordered.lft) then
         echo lifting contigs to chr$c
         liftRMAlign.pl $c/lift/ordered.lft \
         | gzip -c > downloads/RMalign/$chr.fa.align.gz
       endif
       if (-e $c/lift/random.lft && ! -z $c/lift/random.lft) then
         echo lifting contigs to chr${c}_random
         liftRMAlign.pl $c/lift/random.lft \
         | gzip -c > downloads/RMalign/${chr}_random.fa.align.gz
       endif
     end
     # Got some messages like these for chunks that fall entirely
     # within gaps (e.g. centromere, huge unbridged...)
 #FYI Couldn't open chr1_1_00.fa.align: No such file or directory
 #...
 #FYI Couldn't open chr1_1_05.fa.align: No such file or directory
 #FYI Couldn't open chr1_17_02.fa.align: No such file or directory
 #...
 
     md5sum downloads/RMalign/*.gz > downloads/RMalign/md5sum.txt
     ssh hgwdev ln -s /cluster/data/mm8/downloads/RMalign \
       /usr/local/apache/htdocs/goldenPath/mm8/
 
 
 ############################################################################
 # Reload CCDS (2007-12-12 markd)
     # import ccds database as described in ccds.txt
     set db=mm8
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     # build initial version of ccdsMgcMap table, updated by nightly genbank update
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap
 
     checkTableCoords ${db} -verbose=2 ccdsGene
     # update all.joiner to include ${db} in ccdsDb
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
     # request push of 
         ccdsGene
         ccdsInfo
         ccdsKgMap
         ccdsMgcMap
     # << emacs
 
 ############################################################################
 # Reload CCDS (2008-02-01 markd)
     # import ccds database as described in ccds.txt
     set db=mm8
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     checkTableCoords ${db} -verbose=2 ccdsGene
     # update all.joiner to include ${db} in ccdsDb
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
     # request push of 
         ccdsGene
         ccdsInfo
         ccdsKgMap
     # << emacs
 
 ############################################################################
 # Broad whole-genome ChIP-Seq in stem and progenitor cells
 # Mikkelsen et al., Nature Aug. 2, 2007
 # Requested by David Haussler
 # 21 data sets, ~4M sequences/dataset
 #     7 antibodies (histone meth & pol2), 
 #     4 cell sources (ES, NP, MEF, ES+)
 # alignments/ sequences and mappings for 27bp reads
 #   format: chrom, start, end, strand, read_id, mismatches, sequence 
 # densities/ indication of #reads near the base, 25bp fixed window, -1 if unalignable base
 # Allele-specific fragment counts
 #   format: chr start allele1  allele2  # # 
 # Enriched intervals by HMM
 #   BED3
 # Enriched intervals by fixed-size windows
 #   BED3
 # Also, gene expression data
 
 # Track organization:
 # Broad ChIP ES supertrack, with tracks:
 #       - Broad Stem ChIP Seq (read alignments)
 #       - Broad Stem ChIP Sig (density in 25bp windows)
 #       - Broad Stem ChIP Sites (regions from HMM, windowing)
 # Each track has subtracks for different cell types and antibodies
 
 # Also, a track for the expression data: Broad ES
 #      
 
     ssh kkstore04
     cd /cluster/data/mm8/bed
     mkdir -p broadStemChip
     cd broadStemChip/
     wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/
     mv pub/papers/chipseq .
     rm -fr pub
     # original data
     ln -s chipseq lab
     cd lab
 
     ###############
     # Sites track
 
     # HMM Sites -- BED3
     mkdir -p hmmSites
     cd hmmSites
     tar xvfz ../HMMIntervals.tar.gz
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/broadStemChip
 # Load each HMM interval file as its own BED table.  The perl substitution
 # turns a file name such as HMM_ES_K4.txt into the antibody label H3K4me3;
 # tail +2 drops the first line of each file and sed prefixes "chr" to
 # every remaining line before loading.
 # NOTE(review): the glob below is rooted at /hmmSites, but the tarball was
 # unpacked under lab/hmmSites above -- presumably lab/hmmSites was meant; verify.
 cat > hmmSites.csh << 'EOF'
     foreach f (/hmmSites/HMM_ES_*.txt)
         set b = $f:t
         set ab = `echo $b | perl -wpe 's/HMM_ES_(.+).txt/H3$1me3/'`
         echo $ab
         tail +2 $f | sed 's/^/chr/' | \
                 hgLoadBed mm8 broadStemChipHmmSites${ab}Es stdin
     end
 'EOF'
 # Fix script coloring EOF
  
     csh hmmSites.csh >&! hmmSites.log
     # Loaded 1788 - 19523 elements in 5 tracks
     # H3K{20,27,36,4,9}me3
 
     mkdir -p WindowSites
     cd WindowSites
     tar xvfz ../WindowIntervals.tar.gz
     cd ..
     awk '{print $4}' *K*.txt | sort -n | head -1
 
     # Sites from Window algorithm -- BED3 plus float score
     # min: 2.75, max: 275.50
     # distribution of data values:
      awk '{print $4}' *K*.txt | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
      0.000000 ************************************************************ 38346
      10.000000 ************************** 16385
      20.000000 ************** 9186
      30.000000 ********* 5705
      40.000000 ******* 4607
      50.000000 ****** 3686
      60.000000 **** 2243
      70.000000 ** 1094
      80.000000 * 382
      90.000000  112
      100.000000  31
      110.000000  10
      120.000000  3
      130.000000  2
      140.000000  0
      150.000000  0
      160.000000  0
      170.000000  0
      180.000000  0
      190.000000  1
      200.000000  0
      210.000000  0
      220.000000  2
      230.000000  0
      240.000000  0
      250.000000  0
      260.000000  0
      270.000000  1
     
      # To range score display from 300 to 1000, use:
      #  (x * 2) + 300
 
 mkdir windowSites
 cat > windowSites.csh << 'EOF'
     foreach f (chipseq/windowSites/*.K*.txt)
         set b = $f:t
         set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).txt/H3\u$1\L$2me3/'`
         set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
         tail +2 $f |  awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
         # using kate's version, testing -renameSqlTable option
         /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
             -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
                 broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
     end
 'EOF'
 # Fix script coloring EOF 
     csh windowSites.csh >&! windowSites.log
 
     ###############
     # Signal track
     # indication of #reads near the base, 25bp fixed window, -1 if unalignable base
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/lab/densities
     mkdir -p alignable
     cd alignable
     tar xvfz ../alignable.tar.gz
     cd ../..
 
     # Get a list of the datasets
     mkdir -p signal
     tar tfz chipseq/densities/chr1.tar.gz | \
         perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasets.txt
     # ignore control (whole-cell extract)
     grep -v WCE signal/datasets.txt > signal/subtracks.txt
     wc -l signal/subtracks.txt
         # 18
 
     # Extract datasets from by-chrom packaging
     # Weed out missing data which are represented as -1 values
 # Convert to wiggle
 
 cat > makeWig.csh << 'EOF'
     foreach s (`cat signal/subtracks.txt`)
         set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/H3\u$1\L$2/'`
         set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
         set table = broadStemChipSignal${ab}${cell}
         echo $table
         rm -f signal/$s.wigVar
         foreach f (chipseq/densities/chr*.tar.gz)
             set c = $f:t:r:r
             (echo "fixedStep chrom=$c start=1 step=25 span=25"; \
                 tar xfzO $f $c.$s.txt) | \
             nice fixStepToBedGraph.pl | \
             nice grep -v '\-1$' | \
             nice wigBedToStep stdin stdout >> signal/$s.wigVar
         end
         nice wigEncode signal/$s.wigVar signal/$s.wig signal/$s.wib
     end
 'EOF'
 # Fix script coloring EOF 
 
 # NEWER
 
 # For every dataset listed in signal/subtracks.txt, build one variableStep
 # wiggle file (span 25, one value per 25-base window starting at position 1)
 # from the per-chromosome density tarballs, dropping windows whose value is -1,
 # then encode it with wigEncode.
 # The per-table .wigVar is appended to chromosome by chromosome, so it must be
 # removed first; otherwise a rerun would duplicate every data line.
 cat > makeWig.csh << 'EOF'
     foreach s (`cat signal/subtracks.txt`)
         set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\L$2/'`
         set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
         set table = broadStemChipSignal${ab}${cell}
         echo $table
         rm -f signal/$s.wigVar signal/$table.wigVar
         foreach f (chipseq/densities/chr*.tar.gz)
             set c = $f:t:r:r
             echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
             tar xfzO $f $c.$s.txt | \
                 awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
                 grep -v '\-1$' >> signal/$table.wigVar
         end
         cd signal
         nice wigEncode $table.wigVar $table.wig $table.wib
         cd ..
     end
 'EOF'
     # Fix script coloring EOF 
     csh makeWig.csh >&! makeWig.log &
     # check output and cleanup
     cd signal
     gzip *.wigVar
     csh makeWig.csh >&! makeWig.log &
     # check output and cleanup
     cd signal
     gzip *.wigVar
 
     ######## Load wiggles?
     ssh hgwdev
     mkdir /gbdb/mm8/broadStemChip
 
     cd /cluster/data/mm8/bed/broadStemChip
 cat > loadWig.csh << \_EOF_
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/signal
     foreach f (*.wib)
         set wi = $f:t:r
         set wig = $wi.wig
         echo Start: $wig
         echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
         time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
         echo Finished: $wig
      end
 _EOF_
     chmod +x loadWig.csh
 
     time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &
     
     # Try it by hand.
     time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 broadStemChipSignalH3Es broadStemChipSignalH3Es.wig
     
     # Now Try it again.
 cat > loadWig.csh << \_EOF_
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/signal
     foreach f (*.wib)
         set wi = $f:t:r
         set wig = $wi.wig
         time hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
         echo Finished: $wig
      end
 _EOF_
     # Try it again.
     time nice -n +19 ./loadWig.csh >> loadWig.log 2>&1 &
 
 # | broadStemChipSignalH3Es              |
 # | broadStemChipSignalK20Es             |
 # | broadStemChipSignalK27Es             |
 # | broadStemChipSignalK27Mef            |
 # | broadStemChipSignalK27Np             |
 # | broadStemChipSignalK36Es             |
 # | broadStemChipSignalK36Eshyb          |
 # | broadStemChipSignalK36Mef            |
 # | broadStemChipSignalK36Np             |
 # | broadStemChipSignalK4Es              |
 # | broadStemChipSignalK4Eshyb           |
 # | broadStemChipSignalK4Mef             |
 # | broadStemChipSignalK4Np              |
 # | broadStemChipSignalK9Es              |
 # | broadStemChipSignalK9Eshyb           |
 # | broadStemChipSignalK9Mef             |
 # | broadStemChipSignalK9Np              |
 # | broadStemChipSignalRpolEs            |
 
     # Noticed tables badly named, renamed them and corresponding files
     hgsql mm8
     rename table broadStemChipSignalK4Es     to broadStemChipSignalH3K4Es    ;
     rename table broadStemChipSignalK4Eshyb  to broadStemChipSignalH3K4Eshyb ;
     rename table broadStemChipSignalK4Mef    to broadStemChipSignalH3K4Mef   ;
     rename table broadStemChipSignalK4Np     to broadStemChipSignalH3K4Np    ;
     rename table broadStemChipSignalK9Es     to broadStemChipSignalH3K9Es    ;
     rename table broadStemChipSignalK9Eshyb  to broadStemChipSignalH3K9Eshyb ;
     rename table broadStemChipSignalK9Mef    to broadStemChipSignalH3K9Mef   ;
     rename table broadStemChipSignalK9Np     to broadStemChipSignalH3K9Np    ;
     rename table broadStemChipSignalK20Es    to broadStemChipSignalH4K20Es   ;
     rename table broadStemChipSignalK27Es    to broadStemChipSignalH3K27Es   ;
     rename table broadStemChipSignalK27Mef   to broadStemChipSignalH3K27Mef  ;
     rename table broadStemChipSignalK27Np    to broadStemChipSignalH3K27Np   ;
     rename table broadStemChipSignalK36Es    to broadStemChipSignalH3K36Es   ;
     rename table broadStemChipSignalK36Eshyb to broadStemChipSignalH3K36Eshyb;
     rename table broadStemChipSignalK36Mef   to broadStemChipSignalH3K36Mef  ;
     rename table broadStemChipSignalK36Np    to broadStemChipSignalH3K36Np   ;
 
 # | broadStemChipSignalH3K4Es         |
 # | broadStemChipSignalH3K4Eshyb      |
 # | broadStemChipSignalH3K4Mef        |
 # | broadStemChipSignalH3K4Np         |
 
 # | broadStemChipSignalH3K9Es         |
 # | broadStemChipSignalH3K9Eshyb      |
 # | broadStemChipSignalH3K9Mef        |
 # | broadStemChipSignalH3K9Np         |
 
 # | broadStemChipSignalH4K20Es        |
 
 # | broadStemChipSignalH3K27Es        |
 # | broadStemChipSignalH3K27Mef       |
 # | broadStemChipSignalH3K27Np        |
 
 # | broadStemChipSignalH3K36Es        |
 # | broadStemChipSignalH3K36Eshyb     |
 # | broadStemChipSignalH3K36Mef       |
 # | broadStemChipSignalH3K36Np        |
 
 # | broadStemChipSignalH3Es           |
 # | broadStemChipSignalRpolEs         |
 
 ### ### ### Finished Signals 2008-05-08 
 
 
 ######### Alignments
 
     ### Sample from ES.H3.txt.gz
     # chr10   63848447        63848474        -       3084.4.1        0       GAGAGCCAATGGCTAGGCAGGGCATCA
     ### Convert to 
     #chr10  63848447  63848474  3084.4.1  0  -  63848447  63848474  0,255,0  0  GAGAGCCAATGGCTAGGCAGGGCATCA
     # convert to bed-9+   color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments
     mkdir bed
     cd bed
 cat << \_EOF_ > makeBed9PlusFromAlignments.csh
 #!/usr/bin/perl
 # Reformat Broad ChIP-seq alignment lines into BED 9+2 column order.
 # Input columns:  chrom, start, end, strand, read_id, mismatches, sequence
 # Output columns: chrom, start, end, name, score, strand, thickStart,
 #                 thickEnd, color placeholder, mismatches, sequence
 # The color column is written as "0,0,0" here and is meant to be set by a
 # later step in the pipeline.
 
 use warnings;
 use strict;
 
 while (<>) {
     # skip track lines and comment lines
     next if (/^track/ || /^\s*\#/);
     chomp;
     my @words = split("\t");
     # fall back to splitting on any whitespace when tabs yield too few fields
     if (scalar(@words) < 7) {
       @words = split(/\s+/);
       die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
         if (scalar(@words) < 7);
     }
     my @newWordOrder = ("","","","","","","","","","","");
     $newWordOrder[0] = $words[0];  # chrom
     $newWordOrder[1] = $words[1];  # chromStart
     $newWordOrder[2] = $words[2];  # chromEnd
     $newWordOrder[3] = $words[4];  # name (input column 5, the read id)
     #$newWordOrder[4] = "0";        # score
     $newWordOrder[4] = 1000 - ($words[5] * 100);        # score from mismatches: 0=1000 1=900 2=800
     $newWordOrder[5] = $words[3];  # strand
     $newWordOrder[6] = $words[1];  # thickStart, same as chromStart
     $newWordOrder[7] = $words[2];  # thickEnd, same as chromEnd
     $newWordOrder[8] = "0,0,0";    # color to be set later
     $newWordOrder[9] = $words[5];  # mismatch count
     $newWordOrder[10] = $words[6]; # read sequence
     
     print join("\t", @newWordOrder) . "\n";
 }
 _EOF_
 
 cat << \_EOF_ > makeColoredBedOnStrand.csh
 #!/usr/bin/perl
 # Set the color field (column 9) of a BED 9+ stream from the strand
 # (column 6) and the mismatch count (column 10):
 #   + strand -> blue shades, any other strand value -> green shades;
 #   0, 1 and 2-or-more mismatches select the 1st, 2nd and 3rd shade.
 # Track lines and comment lines are dropped from the output.
 
 use warnings;
 use strict;
 
 # Three shades per strand, indexed by mismatch count (0, 1, 2 or more).
 my @blues  = ("0,0,255","0,0,204","0,0,170");
 my @greens = ("0,255,0","0,187,0","0,136,0");
 
 while (<>) {
     next if (/^track/ || /^\s*\#/);
     chomp;
     my @words = split("\t");
     # The mismatch count is read from column 10, so 10 fields are required;
     # retry with a whitespace split before giving up.
     if (scalar(@words) < 10) {
       @words = split(/\s+/);
       die "Expecting at least 10 tab-sep fields but got fewer, line $.\n"
         if (scalar(@words) < 10);
     }
     die "More than 9 mismatches found line $.\n"
         if ($words[9] > 9);
     # Mismatch counts above 2 all share the last shade of the palette.
     my $shade = ($words[9] > 2) ? 2 : $words[9];
     my $palette = ($words[5] eq '+') ? \@blues : \@greens;
     $words[8] = $palette->[$shade];
     print join("\t", @words) . "\n";
 }
 _EOF_
 
 cat << \_EOF_ > convertToBed.csh
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     foreach f (../*.txt.gz)
         set root = `echo $f:t:r:r`
         zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
         echo $root.bed.gz done 
     end
 _EOF_
 
 
 chmod +x makeBed9PlusFromAlignments.csh
 chmod +x makeColoredBedOnStrand.csh
 chmod +x convertToBed.csh
 
     zcat ../ES.H3.txt.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh 
     ## How to make bash work ???
     #for f in ../*.txt.gz; do
     #  root=${f##*/}
     #  root=${root%.*}
     #  root=${root%.*}
     #  zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
     #  echo $root.bed.gz done 
     #done
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
     
     # failed because mismatches exceeded 2, so used following to determine max mismatches: 6 in ES.H3
     zcat ../ES.H3.txt.gz | head -100 | awk '{print $6}' | sort -n | uniq -c | wc -l 
     # real    55m8.275s
 
 # Two were not gzipped!
 cat << \_EOF_ > convertTxtToBed.csh
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     foreach f (../ES.*.txt)
         set root = `echo $f:t:r`
         ./makeBed9PlusFromAlignments.csh < $f | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
         echo $root.bed.gz done 
     end
 _EOF_
 chmod +x convertTxtToBed.csh
     time nice -n +19 ./convertTxtToBed.csh >> convert.log 2>&1 &
 
 #  Add comments:
 # Prepend a one-line "#" comment, built from matching lines of ../readme.txt,
 # to each converted BED file; the result is written as new.<root>.bed.gz.
 # NOTE(review): the loop only matches ES.*.bed.gz, yet new.MEF.*, new.NP.*
 # and new.ESHyb.* files are renamed further down -- presumably the glob was
 # widened for later runs; confirm before reusing.
 cat << \_EOF_ > commentBedFiles.csh 
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     set descr1 = `grep Primary ../readme.txt | tr -d "\r"`
     set descr2 = `grep pluripotent ../readme.txt | tr -d "\r"`
     foreach f (ES.*.bed.gz)
         set root = `echo $f:t:r:r`
         set comment = `grep $root ../readme.txt | tr -d "\r"`
         echo "# $comment - ${descr1} ${descr2}" > new.${root}.bed 
         zcat $f >> new.${root}.bed 
         gzip new.${root}.bed 
     end
 _EOF_
 # the script is run below as ./commentBedFiles.csh, so it must be executable
 chmod +x commentBedFiles.csh
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     time nice -n +19 ./commentBedFiles.csh > comment.log 2>&1 &
     
     # Rename to match other identifiers?
     # | broadStemChipHmmSitesH3K20me3Es  |
     # | broadStemChipHmmSitesH3K27me3Es  |
     # | broadStemChipHmmSitesH3K36me3Es  |
     # | broadStemChipHmmSitesH3K4me3Es   |
     # | broadStemChipHmmSitesH3K9me3Es   |
     # | broadStemChipWinSitesH3K27me3Es  |
     # | broadStemChipWinSitesH3K27me3Mef |
     # | broadStemChipWinSitesH3K27me3Np  |
     # | broadStemChipWinSitesH3K4me3Es   |
     # | broadStemChipWinSitesH3K4me3Mef  |
     # | broadStemChipWinSitesH3K4me3Np   |
     # | broadStemChipWinSitesH3K9me3Es   |
     # | broadStemChipWinSitesH3K9me3Mef  |
     # | broadStemChipWinSitesH3K9me3Np   |
 
      zcat new.ES.K9.bed.gz | head -1 | awk '{ print $5 }'
      head -1 new.*.bed | awk '{ print $5 }'
      for f in new.ES.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Es"}'; done
      for f in new.ES.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceEs"}'; done
      for f in new.ES.H3.*.gz; do zcat $f | head -1 | awk '{ print $2,"H3panEs"}'; done
      for f in new.ES.R*.gz; do zcat $f | head -1 | awk '{ print $2,"RPolEs"}'; done
      for f in new.ESHyb.*.gz; do zcat $f | head -1 | awk '{ print $2,"ES" $6 "EsHyb"}'; done
      for f in new.MEF.K*.gz; do zcat $f | head -1 | awk '{ print $2,$4 "Mef"}'; done
      for f in new.MEF.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceMef"}'; done
      for f in new.NP.K*.gz; do zcat $f | head -1 | awk '{ print $2,$5 "Np"}'; done
      for f in new.NP.WCE.*.gz; do zcat $f | head -1 | awk '{ print $2,"WceNp"}'; done
      
 mv new.ES.K20.bed.gz H4K20Me3Es.bed.gz
 mv new.ES.K27.bed.gz H3K27Me3Es.bed.gz
 mv new.ES.K36.bed.gz H3K36Me3Es.bed.gz
 mv new.ES.K4.bed.gz H3K4Me3Es.bed.gz
 mv new.ES.K9.bed.gz H3K9Me3Es.bed.gz
 mv new.ES.WCE.bed.gz WceEs.bed.gz
 mv new.ES.H3.bed.gz H3panEs.bed.gz
 mv new.ES.RPol.bed.gz RPolEs.bed.gz
 mv new.ESHyb.K36.bed.gz ESH3K36Me3EsHyb.bed.gz
 mv new.ESHyb.K4.bed.gz ESH3K4Me3EsHyb.bed.gz
 mv new.ESHyb.K9.bed.gz ESH3K9Me3EsHyb.bed.gz
 mv new.MEF.K27.bed.gz H3K27Me3Mef.bed.gz
 mv new.MEF.K36.bed.gz H3K36Me3Mef.bed.gz
 mv new.MEF.K4.bed.gz H3K4Me3Mef.bed.gz
 mv new.MEF.K9.bed.gz H3K9Me3Mef.bed.gz
 mv new.MEF.WCE.bed.gz WceMef.bed.gz
 mv new.NP.K27.bed.gz H3K27Me3Np.bed.gz
 mv new.NP.K36.bed.gz H3K36Me3Np.bed.gz
 mv new.NP.K4.bed.gz H3K4Me3Np.bed.gz
 mv new.NP.K9.bed.gz H3K9Me3Np.bed.gz
 mv new.NP.WCE.bed.gz WceNp.bed.gz
 
 
         #hgLoadBed mm8 broadStemChipAlign${root} ${f}
 
     time nice -n +19 hgLoadBed mm8 broadStemChipAlignmentsWceEs WceEs.bed.gz &
     
     ### Failed!  All that work to put a nice comment in the bed file, and hgLoadBed does not handle it!
     ### Fixed this in hgLoadBed.c
     
     
 cat << \_EOF_ > myBedTbl.sql
     CREATE TABLE myBedTbl (
       bin smallint unsigned not null,
       chrom varchar(255) not null,
       chromStart int unsigned not null,
       chromEnd int unsigned not null,
       name varchar(255) not null,
       score int unsigned not null,
       strand char(1) not null,
       thickStart int unsigned not null,
       thickEnd int unsigned not null,
       reserved int unsigned  not null,
       mismatchCount int unsigned not null,
       seq varchar(255) not null,
     #Indices
       INDEX(name(16)),
       INDEX(chrom(5),bin)
     )
 _EOF_
 
 cat << \_EOF_ > loadBedFiles.csh
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/alignments/bed
     foreach f (*.bed.gz)
         set root = `echo $f:t:r:r`
         ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 broadStemChipAlignments${root} ${f} 
         echo broadStemChipAlignments${root} ${f} done 
     end
 _EOF_
 chmod +x loadBedFiles.csh
     time nice -n +19 ./loadBedFiles.csh &
 # real    62m46.504s
 
     # Noticed 3 tables badly named, renamed them and corresponding files
     hgsql mm8
     rename table broadStemChipAlignmentsESH3K36Me3EsHyb to broadStemChipAlignmentsH3K36Me3EsHyb;
     rename table broadStemChipAlignmentsESH3K4Me3EsHyb to broadStemChipAlignmentsH3K4Me3EsHyb;
     rename table broadStemChipAlignmentsESH3K9Me3EsHyb to broadStemChipAlignmentsH3K9Me3EsHyb;
     
     # edited trackDb.broadStem.ra
     
 broadStemChipAlignmentsH3K4Me3Es
 broadStemChipAlignmentsH3K4Me3Mef
 broadStemChipAlignmentsH3K4Me3Np
 
 broadStemChipAlignmentsH3K9Me3Es
 broadStemChipAlignmentsH3K9Me3Mef
 broadStemChipAlignmentsH3K9Me3Np
 
 broadStemChipAlignmentsH4K20Me3Es
 
 broadStemChipAlignmentsH3K27Me3Es
 broadStemChipAlignmentsH3K27Me3Mef
 broadStemChipAlignmentsH3K27Me3Np
 
 broadStemChipAlignmentsH3K36Me3Es
 broadStemChipAlignmentsH3K36Me3Mef
 broadStemChipAlignmentsH3K36Me3Np
 
 broadStemChipAlignmentsH3K9Me3EsHyb
 broadStemChipAlignmentsH3K36Me3EsHyb
 broadStemChipAlignmentsH3K4Me3EsHyb
 
 broadStemChipAlignmentsWceEs
 broadStemChipAlignmentsWceMef
 broadStemChipAlignmentsWceNp
 
 broadStemChipAlignmentsRPolEs
 broadStemChipAlignmentsH3panEs
 
 ### ### ### Finished Alignments 2008-04-29 
 
### ### ### Edited mouse/mm8/trackDb.broadStem.ra to include new broadChromatinChIPSeq
### ### ### track with 53 subtracks covering sites (HMM, Windowing), signal & alignments
### ### ### for ES, MEF, NP, ES_hybrid cell lines
 ### ### ### and H3K4me3 H3K9me3 H4K20me3 H3K27me3 H3K36me3 antibodies
 ### ### ### and WCE, RPOL-II and pan-H3 controls  
 
 ############################################################################
 # Adding more tracks from Broad (Meissner2008)
 # (Start 2008-7-14 Tim  Done: 2008-07-18)
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/chipseq
     mkdir -p Meissner2008
     cd Meissner2008/
     wget -r ftp://ftp.broad.mit.edu/pub/papers/chipseq/Meissner2008/
     mv pub/papers/chipseq/Meissner2008 .
     rm -fr pub
     # original data
     ln -s chipseq lab
     cd lab
 
     ###############
     # Sites track
     mkdir windowSites/Meissner2008
     cd windowSites/Meissner2008
     tar xvfz ../../Meissner2008/WindowIntervals.tar.gz
     awk '{print $4}' *.sites | sort -n | head -1
 
     # Sites from Window algorithm -- BED3 plus float score
     # min: 2.50, max: 275.50
     # distribution of data values:
      awk '{print $4}' *.sites | sort | textHistogram -binSize=10 maxBinCount=30 -real stdin
     # 0.000000 ************************************************************ 155307
     # 10.000000 **************** 42020
     # 20.000000 ****** 14576
     # 30.000000 **** 10408
     # 40.000000 ** 5717
     # 50.000000 * 2299
     # 60.000000  718
     # 70.000000  232
     # 80.000000  60
     # 90.000000  15
     # 100.000000  3
     # 110.000000  6
     # 120.000000  1
     # 130.000000  1
     # 140.000000  1
     mv Brain.H3K27me3.sites ../Brain.K27me3.sites
     mv Brain.H3K4me2.sites  ../Brain.K4me2.sites
     mv Brain.H3K4me3.sites  ../Brain.K4me3.sites
     mv ES.H3K4me1.sites     ../ES.K4me1.sites
     mv ES.H3K4me2.sites     ../ES.K4me2.sites
     mv NP.H3K4me1.sites     ../NP.K4me1.sites
     mv NP.H3K4me2.sites     ../NP.K4me2.sites
     mv readme.txt ../readme.Meissner2008.txt
     cd ..
     rmdir Meissner2008/
     # Continue to distinguish by .sites
     # Brain.K27me3.sites  ES.K27.txt      ES.K4me2.sites  MEF.K4.txt  NP.K4.txt       NP.K9.txt
     # Brain.K4me2.sites   ES.K4.txt       ES.K9.txt       MEF.K9.txt  NP.K4me1.sites  readme.Meissner2008.txt
     # Brain.K4me3.sites   ES.K4me1.sites  MEF.K27.txt     NP.K27.txt  NP.K4me2.sites  readme.txt
     
      # To range score display from 300 to 1000, use THE SAME CONVERSION AS for the whole group:
      #  (x * 2) + 300
 
     cd /cluster/data/mm8/bed/broadStemChip
 mkdir windowSites
 cat > windowSites.Meissner2008.csh << \_EOF_
     foreach f (chipseq/windowSites/*.sites)
         set b = $f:t
         set ab = `echo $b | perl -wpe 's/\w+.(\w)(\w+).sites/H3$1\L$2/'`
         set cell = `echo $b | perl -wpe 's/(\w)(\w+).*/\u$1\L$2/'`
         echo $cell $ab $b
         tail +2 $f |  awk '{printf "%s\t%d\t%d\t \t%d\t%s\n", $1, $2, $3, ($4 * 2) + 300, $4}' > windowSites/$cell.$ab.tab
         # using kate's version, testing -renameSqlTable option
         /cluster/home/kate/bin/x86_64/hgLoadBed mm8 -tab -noNameIx -renameSqlTable \
             -sqlTable=/cluster/bin/sqlCreate/bed5FloatScore.sql \
                 broadStemChipWinSites${ab}${cell} windowSites/$cell.$ab.tab
     end
 _EOF_
 # Fix script coloring EOF 
     chmod +x windowSites.Meissner2008.csh 
     csh windowSites.Meissner2008.csh > windowSites.Meissner2008.log 2>&1
     
     ###############
     # Signal track
     # indication of #reads near the base, 25bp fixed window, -1 if unalignable base
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/densities
     mkdir -p alignable
     cd alignable
     foreach f (../*.tar.gz)
         tar xvfz $f
     end
 
     cd ../..
 
     # Get a list of the datasets
     #mkdir -p signal
     tar tfz chipseq/Meissner2008/densities/chr1.tar.gz | \
         perl -wpe 's/chr\w.(\w+.\w+).txt/$1/' > signal/datasetsMeissner2008.txt
     # ignore control (whole-cell extract)
     grep -v WCE signal/datasetsMeissner2008.txt > signal/subtracksMeissner2008.txt
     wc -l signal/subtracksMeissner2008.txt
         # 7
 
     # Extract datasets from by-chrom packaging
     # Weed out missing data which are represented as -1 values
 # Convert to wiggle
 
# Writes makeWigMeissner2008.csh: for every dataset listed in
# signal/subtracksMeissner2008.txt, build one variableStep wiggle from the
# per-chromosome density tarballs and encode it with wigEncode.
# The script uses relative paths (signal/, chipseq/), so it has to be run
# from the directory that contains both.
cat > makeWigMeissner2008.csh << \_EOF_
    foreach s (`cat signal/subtracksMeissner2008.txt`)
        # ab: the part after the first separator, first character upper-cased
        set ab = `echo $s | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
        # cell: the leading word, first letter upper and the rest lower
        set cell = `echo $s | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
        set table = broadStemChipSignal${ab}${cell}
        echo $table $s
        # start from an empty file because the inner loop appends
        rm -f signal/$table.wigVar
        foreach f (chipseq/Meissner2008/densities/chr*.tar.gz)
            # chromosome name is the tarball name minus .tar.gz
            set c = $f:t:r:r
            echo "variableStep chrom=$c span=25" >> signal/$table.wigVar
            # Stream the member for this chrom and dataset to stdout, number
            # the rows 1, 26, 51, ... and drop rows whose value is -1
            # (missing data).
            tar xfzO $f $c.$s.txt | \
                awk 'BEGIN {pos = 1} {print pos, $1; pos += 25}' | \
                grep -v '\-1$' >> signal/$table.wigVar
        end
        cd signal
        nice wigEncode $table.wigVar $table.wig $table.wib
        cd ..
    end
_EOF_
     # Fix script coloring EOF
     chmod +x makeWigMeissner2008.csh 
     csh makeWigMeissner2008.csh > makeWigMeissner2008.log 2>&1 &
     # check output and cleanup
     cd signal
     gzip *.wigVar
 
     ######## Load wiggles?
     ssh hgwdev
     #mkdir /gbdb/mm8/broadStemChip
 
     cd /cluster/data/mm8/bed/broadStemChip
 cat > loadWigMeissner2008.csh << \_EOF_
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/signal
     foreach f (*H3K*me*.wib)
         set wi = $f:t:r
         set wig = $wi.wig
         echo Start: $wig
         echo "ln -s `pwd`/$f /gbdb/mm8/broadStemChip/$wi.wib"
         hgLoadWiggle -pathPrefix=/gbdb/mm8/broadStemChip mm8 $wi $wig
         echo Finished: $wig
      end
 _EOF_
     chmod +x loadWigMeissner2008.csh
     ./loadWigMeissner2008.csh
     time nice -n +19 ./loadWigMeissner2008.csh >> loadWigMeissner2008.log 2>&1 &
     
     # Noticed tables badly named, renamed them and corresponding files
     #     hgsql mm8
     #     rename table broadStemChipSignalH3Es          to broadStemChipSignalH3panEs      
     #     rename table broadStemChipSignalH3K27Es       to broadStemChipSignalH3K27me3Es   
     #     rename table broadStemChipSignalH3K27Mef      to broadStemChipSignalH3K27me3Mef  
     #     rename table broadStemChipSignalH3K27Np       to broadStemChipSignalH3K27me3Np   
     #     rename table broadStemChipSignalH3K36Es       to broadStemChipSignalH3K36me3Es   
     #     rename table broadStemChipSignalH3K36EsHyb    to broadStemChipSignalH3K36Esme3Hyb
     #     rename table broadStemChipSignalH3K36Mef      to broadStemChipSignalH3K36me3Mef  
     #     rename table broadStemChipSignalH3K36Np       to broadStemChipSignalH3K36me3Np   
     #     rename table broadStemChipSignalH3K4Es        to broadStemChipSignalH3K4me3Es    
     #     rename table broadStemChipSignalH3K4EsHyb     to broadStemChipSignalH3K4Esme3Hyb 
     #     rename table broadStemChipSignalH3K4Mef       to broadStemChipSignalH3K4me3Mef   
     #     rename table broadStemChipSignalH3K4Np        to broadStemChipSignalH3K4me3Np    
     #     rename table broadStemChipSignalH3K9Es        to broadStemChipSignalH3K9me3Es    
     #     rename table broadStemChipSignalH3K9EsHyb     to broadStemChipSignalH3K9Esme3Hyb 
     #     rename table broadStemChipSignalH3K9Mef       to broadStemChipSignalH3K9me3Mef   
     #     rename table broadStemChipSignalH3K9Np        to broadStemChipSignalH3K9me3Np    
     #     rename table broadStemChipSignalH4K20Es       to broadStemChipSignalH4K20me3Es   
 
 ######### Alignments
 
     ### Sample from Brain.H3K27me3.aligned.gz
     #chr10   63848447        63848474        -       3084.4.1        0       GAGAGCCAATGGCTAGGCAGGGCATCA
     ### Convert to 
     #chr10  63848447  63848474  3084.4.1  0  -  63848447  63848474  0,255,0  0  GAGAGCCAATGGCTAGGCAGGGCATCA
     # convert to bed-9+   color at 9, mismatch at 10 and seq at 11; grabbed some example code from encodeHg17.txt PET
 
     ssh hgwdev
     cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments
     mkdir bed
     cd bed
     
     cp lab/alignments/bed/make* lab/Meissner2008/alignments/bed
     # cat << \_EOF_ > makeBed9PlusFromAlignments.csh
     # #!/usr/bin/perl
    # # reorder the 7 aligned-read fields into BED 9 plus mismatch count and sequence;
    # # score = 1000 - 100 * mismatches, color is a placeholder that is set later.
     # 
     # use warnings;
     # use strict;
     # 
     # while (<>) {
     #     next if (/^track/ || /^\s*\#/);
     #     chomp;
     #     my @words = split("\t");
     #     if (scalar(@words) < 7) {
     #       @words = split(/\s+/);
     #       die "Expecting at least 7 tab-sep fields but got fewer, line $.\n"
     #         if (scalar(@words) < 7);
     #     }
     #     my @newWordOrder = ("","","","","","","","","","","");
     #     $newWordOrder[0] = $words[0];  # chr
     #     $newWordOrder[1] = $words[1];  # beg
     #     $newWordOrder[2] = $words[2];  # end
     #     $newWordOrder[3] = $words[4];  # name
     #     #$newWordOrder[4] = "0";        # score
     #     $newWordOrder[4] = 1000 - ($words[5] * 100);        # score  0=1000 1=900 2=800 
     #     $newWordOrder[5] = $words[3];  # strand
     #     $newWordOrder[6] = $words[1];  # beg
     #     $newWordOrder[7] = $words[2];  # end
     #     $newWordOrder[8] = "0,0,0";    # color to be set later
     #     $newWordOrder[9] = $words[5];  # mismatch
     #     $newWordOrder[10] = $words[6]; # seq
     #     
     #     print join("\t", @newWordOrder) . "\n";
     # }
     # _EOF_
     # 
     # cat << \_EOF_ > makeColoredBedOnStrand.csh
     # #!/usr/bin/perl
    # # replace "reserved" (color) field of BED >=9 fields with an RGB value
    # # chosen by strand and mismatch count.
    # 
    # use warnings;
    # use strict;
    # 
    # # palette consists of blues (+ strand) and greens (other strand), darker with more mismatches
     # my @blues  = ("0,0,255","0,0,204","0,0,170");
     # my @greens = ("0,255,0","0,187,0","0,136,0");
     # 
     # while (<>) {
     #     next if (/^track/ || /^\s*\#/);
     #     chomp;
     #     my @words = split("\t");
     #     if (scalar(@words) < 9) {
     #       @words = split(/\s+/);
     #       die "Expecting at least 9 tab-sep fields but got fewer, line $.\n"
     #         if (scalar(@words) < 9);
     #     }
     #     die "More than 9 mismatches found line $.\n"
     #         if ($words[9] > 9);
    #     my $strand = $words[5];
    #     if ($strand eq '+') {
    #         if( $words[9] > 2 ) { 
    #             $words[8] = $blues[2]; # blue
    #         } else {
    #             $words[8] = $blues[$words[9]]; # blue
    #         }
    #     } else {
    #         if( scalar($words[9]) > 2 ) { 
    #             $words[8] = $greens[2]; # green
    #         } else {
    #             $words[8] = $greens[$words[9]]; # green
    #         }
    #     }
     #     print join("\t", @words) . "\n";
     # }
     # _EOF_
     # chmod +x makeBed9PlusFromAlignments.csh
     # chmod +x makeColoredBedOnStrand.csh
 
 cat << \_EOF_ > convertToBed.csh
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
     foreach f (../*.aligned.gz)
         set root = `echo $f:t:r:r`
         zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
         echo $root.bed.gz done 
     end
 _EOF_
 chmod +x convertToBed.csh
 
     zcat ../Brain.H3K27me3.aligned.gz | head | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh
    # chr14   12537326        12537362        205CY.7.1       1000    -       12537326        12537362        0,255,0 0       GGGATATGGACTGAAATAATTAGGAAAGAAATAACT
     ## How to make bash work ???
     #for f in ../*.txt.gz; do
     #  root=${f##*/}
     #  root=${root%.*}
     #  root=${root%.*}
     #  zcat $f | ./makeBed9PlusFromAlignments.csh | ./makeColoredBedOnStrand.csh | gzip > $root.bed.gz
     #  echo $root.bed.gz done 
     #done
 
     ssh kkstore04
     cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
     time nice -n +19 ./convertToBed.csh > convert.log 2>&1 &
     # real    25m22.762s
     # Brain.H3K27me3.bed.gz done
     # Brain.H3K4me2.bed.gz done
     # Brain.H3K4me3.bed.gz done
     # ES.H3K4me1.bed.gz done
     # ES.H3K4me2.bed.gz done
     # NP.H3K4me1.bed.gz done
     # NP.H3K4me2.bed.gz done
     # zcat Brain.H3K27me3.bed.gz | head -2
    # chr14   12537326        12537362        205CY.7.1       1000    -       12537326        12537362        0,255,0 0       GGGATATGGACTGAAATAATTAGGAAAGAAATAACT
    # chr2    70236933        70236969        205CY.7.2       900     +       70236933        70236969        0,0,204 1       GAATCCTTGAACATATTTATAATCATTCTTTTTAAT
    # Compared to: zcat ../../../alignments/bed/ES.K20.bed.gz | head -2
    # chr8    77978889        77978916        3080.2.1        1000    +       77978889        77978916        0,0,255 0       GAAGGAAATCAGTCTTTGTTGAGCAGT
    # chr12   38598403        38598430        3080.2.2        1000    +       38598403        38598430        0,0,255 0       GATATTTCATTCCTTGGAGAAGGGTAA
     
 cp ../../../alignments/bed/myBedTbl.sql .
     # cat << \_EOF_ > myBedTbl.sql
     #     CREATE TABLE myBedTbl (
     #       bin smallint unsigned not null,
     #       chrom varchar(255) not null,
     #       chromStart int unsigned not null,
     #       chromEnd int unsigned not null,
     #       name varchar(255) not null,
     #       score int unsigned not null,
     #       strand char(1) not null,
     #       thickStart int unsigned not null,
     #       thickEnd int unsigned not null,
     #       reserved int unsigned  not null,
     #       mismatchCount int unsigned not null,
     #       seq varchar(255) not null,
     #     #Indices
     #       INDEX(name(16)),
     #       INDEX(chrom(5),bin)
     #     )
     # _EOF_
 
 cat << \_EOF_ > loadBedFiles.csh
 #!/bin/csh -fe
     cd /cluster/data/mm8/bed/broadStemChip/lab/Meissner2008/alignments/bed
     foreach f (*.bed.gz)
         set root = `echo $f:t:r:r`
         set ab = `echo $root | perl -wpe 's/\w+.(\w)(\w+)/\u$1\u$2/'`
         set cell = `echo $root | perl -wpe 's/(\w)(\w+).\w+/\u$1\L$2/'`
         set table = broadStemChipAlignments${ab}${cell}
         ~/bin/x86_64/hgLoadBed -sqlTable=myBedTbl.sql -renameSqlTable mm8 ${table} ${f} 
         echo ${table} ${f} done 
     end
 _EOF_
 chmod +x loadBedFiles.csh
     time nice -n +19 ./loadBedFiles.csh > load.log 2>&1 &
    # real    28m9.939s
     # broadStemChipAlignmentsH3K27me3Brain
     # broadStemChipAlignmentsH3K4me1Es
     # broadStemChipAlignmentsH3K4me1Np
     # broadStemChipAlignmentsH3K4me2Brain
     # broadStemChipAlignmentsH3K4me2Es
     # broadStemChipAlignmentsH3K4me2Np
     # broadStemChipAlignmentsH3K4me3Brain
     #   
     # broadStemChipWinSitesH3K27me3Brain
     # broadStemChipWinSitesH3K4me1Es
     # broadStemChipWinSitesH3K4me1Np
     # broadStemChipWinSitesH3K4me2Brain
     # broadStemChipWinSitesH3K4me2Es
     # broadStemChipWinSitesH3K4me2Np
     # broadStemChipWinSitesH3K4me3Brain
     #   
     # broadStemChipSignalH3K27me3Brain
     # broadStemChipSignalH3K4me1Es
     # broadStemChipSignalH3K4me1Np
     # broadStemChipSignalH3K4me2Brain
     # broadStemChipSignalH3K4me2Es
     # broadStemChipSignalH3K4me2Np
     # broadStemChipSignalH3K4me3Brain
 
     # edited trackDb.broadStem.ra
 
 ############################################################################
 #  mm8 - Mouse - Ensembl Genes (DONE - 2008-03-06 - hiram)
     ssh kkstore04
     cd /cluster/data/mm8
     cat << '_EOF_' > mm8.ensGene.ra
 # required db variable
 db mm8
 # optional liftRandoms yes/no or absent
 liftRandoms yes
 # optional nameTranslation, the sed command that will transform
 #       Ensemble names to UCSC names.  With quotes just to make sure.
 nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
 # optionally update the knownToEnsembl table after ensGene updated
 knownToEnsembl yes
 '_EOF_'
 #  << happy emacs
 
     doEnsGeneUpdate.pl -ensVersion=46 mm8.ensGene.ra
     ssh hgwdev
     cd /cluster/data/mm8/bed/ensGene.46
     featureBits mm8 ensGene
     # 56654064 bases of 2567283971 (2.207%) in intersection
 ############################################################################
 # Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
     # import ccds database as described in ccds.txt
     set db=mm8
     set ncbiBld=36.1
     # create and load ccdsGene and ccdsInfo tables from imported database
     /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene
 
     # ccdsKgMap
     /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap
 
     checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
     joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
     # request push of 
         ccdsGene
         ccdsInfo
         ccdsKgMap
     # << emacs
 ############################################################################
 # AGILENT CGH PROBES (Done 2008-05-13, Andy)
 # (see hg18.txt)
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 
 #############################################################################
 # MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)  
 # (to build the affyExonTissues track, see the steps outlined in hg18.txt)
 #############################################################################
 
 ########################################################################
 ## AFFY ALL EXON PROBESETS (MM8) (DONE 2009-01-29, Andy)
 ssh hgwdev
 mkdir /hive/data/genomes/mm8/bed/affyAllExonProbes
 cd /hive/data/genomes/mm8/bed/affyAllExonProbes
 ln -s MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.csv
 wget --load-cookies affycookies.txt http://www.affymetrix.com/Auth/analysis/downloads/na20/exon/MoEx-1_0-st-v1.r2.dt1.mm8.zip
 sed '1,12d' mm8.csv | tr ',' '\t' | cut -f 1,5-8,12 \
   | sed 's/\"//g' | grep -v "\-\-\-" \
   | awk 'BEGIN{FS="\t";OFS="\t";}{if ($6 == "core") score = 1000; else if ($6 == "extended") score = 700; else if ($6 == "full") score = 300; else score = 100; name = $1"|"$6; print $2, $4-1, $5, name, score, $3}' \
   | bedSort stdin mm8.bed
 hgLoadBed mm8 affyAllExonProbes mm8.bed
 rm MoEx-1_0-st-v1.r2.dt1.mm8.{cor,ext,full,zip}* bed.tab affycookies.txt mm8.csv
 gzip MoEx-1_0-st-v1.r2.dt1.mm8.csv mm8.bed
 
 ################################################
 # AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
 update genbank.conf:
 mm8.upstreamGeneTbl = refGene
 mm8.upstreamMaf = multiz17way /hive/data/genomes/mm8/bed/multiz17way/species.lst
 
 #############################################################################
 # MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08)
     ssh hgwdev
     mkdir /cluster/data/mm8/bed/mrnaPcr
     cd /cluster/data/mm8/bed/mrnaPcr
     hgsql mm8 -NBe 'select * from knownGene' > knownGene.gp
     genePredToBed knownGene.gp > ucscGenes.bed
     hgsql mm8 -NBe 'select kgId,geneSymbol from kgXref' \
     | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
       > idSub.txt
     subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
     sequenceForBed -keepName -db=mm8 -bedIn=ucscGenesIdSubbed.bed \
       -fastaOut=stdout \
     | faToTwoBit -ignoreDups stdin kgTargetSeq.2bit
     cut -f 1-10 knownGene.gp \
     | genePredToFakePsl mm8 stdin kgTargetAli.psl /dev/null
 
    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
     cd /cluster/data/mm8/bed/mrnaPcr
     hgLoadPsl mm8 kgTargetAli.psl
     mkdir /gbdb/mm8/targetDb
     ln -s /cluster/data/mm8/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm8/targetDb/
 
     # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
     # /gbdb/mm8/targetDb/kgTargetSeq.2bit .
 
     ssh hgwdev
     # Add records to hgcentraltest blatServers and targetDb:
     hgsql hgcentraltest -e \
       'INSERT into blatServers values ("mm8Kg", "blat13", 17803, 0, 1);'
     hgsql hgcentraltest -e \
       'INSERT into targetDb values("mm8Kg", "UCSC Genes", \
          "mm8", "kgTargetAli", "", "", \
          "/gbdb/mm8/targetDb/kgTargetSeq.2bit", 1, now(), "");'
 
 
 #############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)
 
Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
 
 see doc/builds.txt for specific details.
 ############################################################################