src/hg/makeDb/doc/anoCar1.txt 1.21

1.21 2009/07/21 21:01:40 markd
added transmap update blurb
Index: src/hg/makeDb/doc/anoCar1.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/anoCar1.txt,v
retrieving revision 1.20
retrieving revision 1.21
diff -b -B -U 1000000 -r1.20 -r1.21
--- src/hg/makeDb/doc/anoCar1.txt	1 Jul 2008 16:52:59 -0000	1.20
+++ src/hg/makeDb/doc/anoCar1.txt	21 Jul 2009 21:01:40 -0000	1.21
@@ -1,853 +1,862 @@
 # for emacs: -*- mode: sh; -*-
 
 # Lizard - Anolis carolinensis - Broad Institute 1.0
 
 #########################################################################
 # DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram)
     ssh kkstore05
     mkdir /cluster/store12/anoCar1
     ln -s /cluster/store12/anoCar1 /cluster/data/anoCar1
     mkdir /cluster/data/anoCar1/downloads
     cd /cluster/data/anoCar1/downloads
     foreach f (assembly.agp \
                BasicAssemblyOneLiner.out \
                ForDistribution.command \
                assembly.bases.gz \
                assembly.links \
                assembly.quals.gz \
                source)
       wget --timestamping \
         ftp://ftp.broad.mit.edu/pub/assemblies/reptiles/lizard/AnoCar1.0/$f
     end
     faSize assembly.bases.gz 
 # 1741478929 bases (0 N's 1741478929 real 1741478929 upper 0 lower)
 #	in 50470 sequences in 1 files
     #	Discovered later that this quals file needs to be lifted
     qaToQac assembly.quals.gz stdout \
 	| qacAgpLift assembly.agp stdin scaffold.lifted.qac
 
     ##	Calculate N50
     #	Find total length in sequence
     sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | tail -1
     #	7233 1781602899 340 scaffold_7232
     #	half of 1781602899 is 890801449
     #	run this again, scanning the list until the sum reaches 890801449
     sort -k2nr chrom.sizes | awk '{sum+=$2;print NR,sum,$2,$1}' | less
     #	204 889215106 2440512 scaffold_201
     #	205 891654231 2439125 scaffold_205
 
 #########################################################################
 ## Create .ra file and run makeGenomeDb.pl
     ssh hgwdev
     cd /cluster/data/anoCar1
     cat << '_EOF_' >anoCar1.config.ra
 # Config parameters for makeGenomeDb.pl:
 db anoCar1
 clade vertebrate
 genomeCladePriority 66
 scientificName Anolis carolinensis
 commonName Lizard
 assemblyDate Jan. 2007
 assemblyLabel Broad Institute AnoCar (1.0)
 orderKey 440
 mitoAcc none
 fastaFiles /cluster/data/anoCar1/downloads/assembly.bases.gz
 agpFiles /cluster/data/anoCar1/downloads/assembly.agp
 qualFiles /cluster/data/anoCar1/downloads/scaffold.lifted.qac
 dbDbSpeciesDir lizard
 '_EOF_'
 
     time makeGenomeDb.pl -verbose=2 anoCar1.config.ra > makeGenomeDb.out 2>&1
     #	broken down during the quals step since assembly.quals.gz needed
     #	to be lifted.  do the qaToQac | qacAgpLift sequence, fixup the
     #	specification above for qualFiles, and finish off the quals loading:
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/qual
     qacToWig -fixed ../../downloads/scaffold.lifted.qac stdout \
 	| wigEncode stdin qual.wig qual.wib
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/qual
     time nice -n +19 hgLoadWiggle \
 	-pathPrefix=/gbdb/anoCar1/wib anoCar1 quality qual.wig
 
     ##	continue
     ssh hgwdev
     cd /cluster/data/anoCar1
     time makeGenomeDb.pl -verbose=2 -continue=dbDb anoCar1.config.ra \
 	> makeDbDb.out 2>&1
 
     ## better orderKey to get Lizard between frog and fish
     hgsql -e 'update dbDb set orderKey="440" where name="anoCar1";' \
 	hgcentraltest
     ## fixup that number in the .ra file as mentioned above, was 375
 ##########################################################################
 ## Photograph - permission to use obtained from R. Steven Rainwater
 ##		(DONE - 2007-02-16 - Hiram)
     ## Fetch photo from:
     wget --timestamping \
 	"http://rainwaterreptileranch.org/steve/photos/herps/anole2.jpeg" \
 	-O R.Steven.Rainwater.Anole.Lizard.jpg
     convert -quality 80 -sharpen 0 -crop "168x272+264+37" \
 	R.Steven.Rainwater.Anole.Lizard.jpg Anolis_carolinensis.jpg
 
 ################################################
 ## WINDOWMASKER (Working - 2007-02-16 - Hiram)
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/
     time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl anoCar1 \
       -workhorse=kolossus > wmRun.log 2>&1 &
     #	real    172m58.813s
 
     # Save the log
     mv wmRun.log WindowMasker.2007-02-16
 
     # Masking statistics
     cd WindowMasker.2007-02-18
     twoBitToFa anoCar1.wmsk.2bit stdout | faSize stdin
     #	1781602899 bases (40123970 N's 1741478929 real
     #	1009685313 upper 731793616 lower) in 7233 sequences in 1 files
     twoBitToFa anoCar1.wmsk.sdust.2bit stdout | faSize stdin
     #	1781602899 bases (40123970 N's 1741478929 real
     #	1000477327 upper 741001602 lower) in 7233 sequences in 1 files
 
     ssh hgwdev
     hgLoadBed -strict anoCar1 windowmaskerSdust windowmasker.sdust.bed.gz
     #	Loaded 8354004 elements of size 3
 
     #	why does featureBits show more bases masked than what faSize
     #	measured as lower case ?  Because this counts the masked sequence
     #	in the gaps, and the faSize doesn't have the gaps.
     time nice -n +19 featureBits anoCar1 windowmaskerSdust
     #	781125572 bases of 1741478929 (44.854%) in intersection
 
     time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust
     #	781125572 bases of 1781602899 (43.844%) in intersection
 
     #	Curiously, WM overlaps gaps ?
     time nice -n +19 featureBits -countGaps anoCar1 windowmaskerSdust gap
     #	40123970 bases of 1781602899 (2.252%) in intersection
     #	741001602 + 40123970 == 781125572 
 
 #########################################################################
 # SIMPLE REPEATS (TRF) (DONE 2007-02-16 - Hiram)
 #                      (dropped chromEnd index 2007-05-10 - kuhn)
     ssh kolossus
     mkdir /cluster/data/anoCar1/bed/simpleRepeat
     cd /cluster/data/anoCar1/bed/simpleRepeat
     time nice -n 19 twoBitToFa ../../anoCar1.unmasked.2bit stdout \
 	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
 	    -bedAt=simpleRepeat.bed -tempDir=/tmp > trf.log 2>&1 &
     #	real    164m34.988s
     #	user    159m32.859s
     #	sys     4m31.649s
 
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/simpleRepeat
     # Make a filtered version for sequence masking:
     awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed
     splitFileByColumn trfMask.bed trfMaskChrom
 
     # Load unfiltered repeats into the database:
     ssh hgwdev
     nice -n +19 hgLoadBed anoCar1 simpleRepeat \
       /cluster/data/anoCar1/bed/simpleRepeat/simpleRepeat.bed \
       -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
     #	Loaded 569322 elements of size 16
     nice -n +19 featureBits anoCar1 simpleRepeat
     #	50290171 bases of 1741478929 (2.888%) in intersection
 
 #########################################################################
 ## Add TRF mask to WindowMasker masked sequence, and fixup the bogus
 ##	window masked N's  (DONE - 2007-02-17 - Hiram)
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/WindowMasker.2007-02-16
     ##	Curious, twoBitMask would not accept stdout for its output 2bit
     twoBitMask anoCar1.wmsk.sdust.2bit \
       -add ../simpleRepeat/trfMask.bed tmp.2bit
     twoBitToFa tmp.2bit stdout \
 	| sed -e "s/n/N/g" | faToTwoBit stdin anoCar1.2bit
     ## check it:
     twoBitToFa anoCar1.2bit stdout | faSize stdin
     #	1781602899 bases (40123970 N's 1741478929 real
     #	1000242640 upper 741236289 lower) in 7233 sequences in 1 files
     ## trfMask contributes:
     awk '{sum+=$3-$2} END{print sum}' ../simpleRepeat/trfMask.bed
     #	16534332
     ## and we measured wmsk.sdust before: 741001602
     16534332 + 741001602
 
 #########################################################################
 ## BLASTZ Hg18 swap (DONE - 2007-02-18 - Hiram)
     ##	the original blastz to hg18 measured
     time nice -n +19 featureBits hg18 chainAnoCar1Link \
 	> fb.hg18.chainAnoCar1Link.txt 2>&1
     #	real    2m28.318s
     #	137554843 bases of 2881515245 (4.774%) in intersection
 
     ssh kkstore05
     mkdir /cluster/data/anoCar1/bed/blastz.hg18.swap
     cd /cluster/data/anoCar1/bed/blastz.hg18.swap
     time doBlastzChainNet.pl \
 	/cluster/data/hg18/bed/blastz.anoCar1.2007-02-17/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-verbose=2 -bigClusterHub=pk -swap > swap.log 2>&1 &
 
     time nice -n +19 featureBits anoCar1 chainHg18Link \
 	> fb.anoCar1.chainHg18Link.txt 2>&1  
     #	real    3m16.810s
     #	112434396 bases of 1741478929 (6.456%) in intersection
 
 #########################################################################
 # GENSCAN PREDICTIONS (DONE - 2006-05-03 - 2006-05-05 - Hiram)
     ssh kkstore05
     #	Create a 2bit file all hard masked
     cd /cluster/data/anoCar1
     time nice -n +10 twoBitToFa anoCar1.2bit stdout \
 	| maskOutFa stdin hard stdout \
 	    | faToTwoBit stdin anoCar1.hard.2bit
     #	make sure it still has all the unmasked sequence in it:
     nice -n +19 twoBitToFa anoCar1.hard.2bit stdout | faSize stdin
     #	1781602899 bases (781360259 N's 1000242640 real
     #	1000242640 upper 0 lower) in 7233 sequences in 1 files
 
     nice -n +19 twoBitToFa anoCar1.2bit stdout | faSize stdin
     #	1781602899 bases (40123970 N's 1741478929 real
     #	1000242640 upper 741236289 lower) in 7233 sequences in 1 files
     #	same number of total bases, the lowers have become Ns:
     #	781360259 == 741236289 + 40123970
     #	the lower "reals" disappear from the "real" count:
     #	1000242640 == 1741478929 - 741236289
 
     #	And, make sure there aren't any sequences in this lot that have
     #	become all N's with no sequence left in them:
     twoBitToFa anoCar1.hard.2bit stdout \
 	| faCount stdin > anoCar1.hard.faCount
     #	181 scaffolds end up with less than 100 bases of sequence left
     egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
 	| sort -k2nr | awk '{if ($2 < 100) { print }}' | wc -l
     #	181
     #	leaving 7.052 scaffolds with more than 100 bases of seqence left:
     egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
 	| sort -k2nr | awk '{if ($2 >= 100) { print }}' | wc -l
     #	7052
     #	make a list of those to extract their sequence:
     egrep -v "^#|^total" anoCar1.hard.faCount | awk '{print $1,$2-$7}' \
 	| sort -k2nr | awk '{if ($2 >= 100) { print $1 }}' \
 	| sort > hard.genscan.list
 
     twoBitToFa -seqList=hard.genscan.list anoCar1.hard.2bit genscan.hard.fa
     #	What do we have left to work with:
     faSize genscan.hard.fa
     #	1780575355 bases (780338843 N's 1000236512 real
     #	1000236512 upper 0 lower) in 7052 sequences in 1 files
 
 
     #	creating 4,000,000 sized chunks, the largest scaffolds remain
     #	in single pieces, the scaffolds smaller than 4,000,000 are grouped
     #	into 4,000,000 sized fasta files.  You don't want to break these
     #	things up because genscan will be doing its own internal 2.4 million
     #	window on these pieces, and the gene names are going to be
     #	constructed from the sequence name in these fasta files.  The
     #	gene names are much better when they are this simple scaffoldN.M
     #	numbering scheme.
     mkdir genscanSplit
     faSplit about genscan.hard.fa 4000000 genscanSplit/c_
 
     ssh hgwdev
     mkdir /cluster/data/anoCar1/bed/genscan
     cd /cluster/data/anoCar1/bed/genscan
     # Check out hg3rdParty/genscanlinux to get latest genscan:
     cvs co hg3rdParty/genscanlinux
 
     # Run on small cluster (more mem than big cluster).
     ssh kki
     cd /cluster/data/anoCar1/bed/genscan
     # Make 3 subdirectories for genscan to put their output files in
     mkdir gtf pep subopt
     # Generate a list file, genome.list, of all the hard-masked contigs that 
     # *do not* consist of all-N's (which would cause genscan to blow up)
     #	Since we split on gaps, we have no chunks like that.  You can
     #	verify with faCount on the chunks.
     ls -1S /cluster/data/anoCar1/genscanSplit/c_*.fa > chunk.list
 
     # Create template file, gsub, for gensub2.  For example (3-line file):
     cat << '_EOF_' > template
 #LOOP
 /cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
 #ENDLOOP
 '_EOF_'
     # << happy emacs
     gensub2 chunk.list single template jobList
     para create jobList
     para try, check, push, check, ...
 # Completed: 319 of 319 jobs
 # CPU time in finished jobs:      39037s     650.62m    10.84h    0.45d  0.001 y
 # IO & Wait Time:                  1015s      16.91m     0.28h    0.01d  0.000 y
 # Average job time:                 126s       2.09m     0.03h    0.00d
 # Longest finished job:             459s       7.65m     0.13h    0.01d
 # Submission to last job:          3495s      58.25m     0.97h    0.04d
 
     # cat results into single files
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/genscan
     cat gtf/c_*.gtf > genscan.gtf
     cat subopt/c_*.bed > genscanSubopt.bed
     cat pep/c_*.pep > genscan.pep
 
     # Load into the database as so:
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/genscan
     ldHgGene anoCar1 -gtf genscan genscan.gtf
     #	Read 28102 transcripts in 184045 lines in 1 files
     #	28102 groups 4190 seqs 1 sources 1 feature types
     #	28102 gene predictions
 
     hgPepPred anoCar1 generic genscanPep genscan.pep
     hgLoadBed -strict anoCar1 genscanSubopt genscanSubopt.bed
     #	Loaded 286327 elements of size 6
 
     #	check the numbers
     time nice -n +19 featureBits anoCar1 genscan
     #	31394034 bases of 1741478929 (1.803%) in intersection
 
 #########################################################################
 ## BLASTZ GasAcu1/Stickleback swap (DONE - 2007-02-19 - Hiram)
     ##	the original blastz to gasAcu1 measured
     time nice -n +19 featureBits gasAcu1 chainAnoCar1Link \
 	> fb.gasAcu1.chainAnoCar1Link.txt 2>&1
     #	real    0m51.499s
     #	56386298 bases of 446627861 (12.625%) in intersection
 
     ssh kkstore05
     mkdir /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
     cd /cluster/data/anoCar1/bed/blastz.gasAcu1.swap
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/gasAcu1/bed/blastz.anoCar1.2007-02-19/DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log 2>&1 &
 
     time nice -n +19 featureBits anoCar1 chainGasAcu1Link \
 	> fb.anoCar1.chainGasAcu1Link.txt 2>&1  
     #	real    1m14.245s
     #	54464074 bases of 1741478929 (3.127%) in intersection
 
 
 ###########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE braney 2007-02-19)
     ssh kkstore05
     bash # if not using bash shell already
 
     mkdir /cluster/data/anoCar1/blastDb
     cd /cluster/data/anoCar1
     twoBitToFa anoCar1.unmasked.2bit temp.fa
     faSplit sequence temp.fa 500 blastDb/
     rm temp.fa
     cd blastDb
     for i in *.fa
     do
 	/cluster/bluearc/blast229/formatdb -i $i -p F
     done
     rm *.fa
 
     mkdir -p /san/sanvol1/scratch/anoCar1/blastDb
     cd /cluster/data/anoCar1/blastDb
     for i in nhr nin nsq; 
     do 
 	echo $i
 	cp *.$i /san/sanvol1/scratch/anoCar1/blastDb
     done
 
     mkdir -p /cluster/data/anoCar1/bed/tblastn.hg18KG
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG
     echo  /san/sanvol1/scratch/anoCar1/blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 496 query.lst
 
    # we want around 200000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
 # 36727/(200000/496) = 91.082960
 
    mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa
    split -l 90 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa/kg
    ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/kgfa kgfa
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
      done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
    ln -s /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/anoCar1/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/cluster/bluearc/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.2
 
         if pslCheck -prot $3.tmp                                                  
         then                                                                      
             mv $3.tmp $3                                                          
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0                                                                    
     fi                                                                            
 fi                                                                                
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     gensub2 query.lst kg.lst blastGsub blastSpec
     # back to bash
     exit 
     
     ssh pk
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 
 # Completed: 202864 of 202864 jobs
 # CPU time in finished jobs:   14747966s  245799.43m  4096.66h  170.69d  0.468 y
 # IO & Wait Time:               1561940s   26032.34m   433.87h   18.08d  0.050 y
 # Average job time:                  80s       1.34m     0.02h    0.00d
 # Longest finished job:             722s      12.03m     0.20h    0.01d
 # Submission to last job:         93277s    1554.62m    25.91h    1.08d
 
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/c.`basename $1`.psl)
 '_EOF_'
     exit
     chmod +x chainOne
     ls -1dS /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     ssh kk
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG/chainRun
     para create chainSpec
     para maxNode 30
     para try, check, push, check etc.
 # Completed: 409 of 409 jobs
 # CPU time in finished jobs:       2911s      48.52m     0.81h    0.03d  0.000 y
 # IO & Wait Time:                 65231s    1087.18m    18.12h    0.75d  0.002 y
 # Average job time:                 167s       2.78m     0.05h    0.00d
 # Longest finished job:             287s       4.78m     0.08h    0.00d
 # Submission to last job:          2318s      38.63m     0.64h    0.03d
 
     ssh kkstore05
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/anoCar1/bed/tblastn.hg18KG/blastHg18KG.psl
     cd ..
     pslCheck blastHg18KG.psl
 
     # load table 
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/tblastn.hg18KG
     hgLoadPsl anoCar1 blastHg18KG.psl
     # check coverage
     featureBits anoCar1 blastHg18KG 
 # 21571582 bases of 1741478929 (1.239%) in intersection
     
     ssh kkstore05
     rm -rf /cluster/data/anoCar1/bed/tblastn.hg18KG/blastOut
     rm -rf /cluster/bluearc/anoCar1/bed/tblastn.hg18KG/blastOut
 #end tblastn
 
 #########################################################################
 ## BLASTZ galGal3/Chicken swap (DONE - 2007-02-19 - Hiram)
     ##	the original blastz to galGal3 measured
     time nice -n +19 featureBits galGal3 chainAnoCar1Link \
 	> fb.galGal3.chainAnoCar1Link.txt 2>&1
     #	real    0m43.752s
     #	106743952 bases of 1042591351 (10.238%) in intersection
 
     ssh kkstore05
     mkdir /cluster/data/anoCar1/bed/blastz.galGal3.swap
     cd /cluster/data/anoCar1/bed/blastz.galGal3.swap
 
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/galGal3/bed/blastz.anoCar1.2007-02-18/DEF \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-swap > swap.log 2>&1 &
 
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/blastz.galGal3.swap
     time nice -n +19 featureBits anoCar1 chainGalGal3Link \
 	> fb.anoCar1.chainGalGal3Link.txt 2>&1
     #	real    1m8.359s
     #	109074507 bases of 1741478929 (6.263%) in intersection
 
 #########################################################################
 # MAKE 11.OOC FILE FOR BLAT (DONE - 2007-02-19 - Hiram)
     # This will find repeats within the genome that should not be matched
     # against. Uses 11-mers.
     # Use -repMatch=620 (based on size -- for human we use 1024, and
     # lizard size is ~60.4% of human judging by gapless anoCar1 vs. hg18
     # genome sizes from featureBits.
 
     # hg18 / anoCar1 non-gap bases
     #	2881515245 / 1741478929 = 1.654636
     # anoCar1 / hg18 non-gap bases
     #	1741478929 / 2881515245 = 0.604362
 
     #	thus 1024 * 0.604362 ~= 620
     
     ssh kkstore05
     blat /cluster/data/anoCar1/anoCar1.2bit /dev/null /dev/null -tileSize=11 \
       -makeOoc=/cluster/data/anoCar1/11.ooc -repMatch=620
     # Wrote 32070 overused 11-mers to /cluster/data/anoCar1/11.ooc
 
     cp -p /cluster/data/anoCar1/11.ooc /san/sanvol1/scratch/anoCar1
     cp -p /cluster/data/anoCar1/jkStuff/liftAll.lft \
 	/san/sanvol1/scratch/anoCar1
 
 #########################################################################
 # GENBANK AUTO UPDATE (DONE - 2007-02-20 - Hiram)
     # Make a liftAll.lft that specifies 5M chunks for genbank:
     #	only a few of the largest scaffolds will be broken up, most of them not
     ssh kkstore05
     cd /cluster/data/anoCar1
     simplePartition.pl anoCar1.2bit 5000000 /tmp/anoCar1
     find /tmp/anoCar1 -type f | grep lft | xargs cat > jkStuff/liftAll.lft
     rm -r /tmp/anoCar1
     cp -p jkStuff/liftAll.lft /san/sanvol1/scratch/anoCar1
 
     # align with latest genbank process.
     ssh hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # edit etc/genbank.conf to add anoCar1 just after xenTro2
     # anoCar1
     anoCar1.serverGenome = /cluster/data/anoCar1/anoCar1.2bit
     anoCar1.clusterGenome = /san/sanvol1/scratch/anoCar1/anoCar1.2bit
     anoCar1.ooc = /san/sanvol1/scratch/anoCar1/11.ooc
     anoCar1.lift = /san/sanvol1/scratch/anoCar1/liftAll.lft
     anoCar1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
     anoCar1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
     anoCar1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
     anoCar1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
     anoCar1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
     anoCar1.refseq.mrna.native.load = yes
     anoCar1.genbank.est.native.load = no
     anoCar1.refseq.mrna.xeno.load = yes
     anoCar1.genbank.mrna.xeno.load = yes
     anoCar1.downloadDir = anoCar1
     anoCar1.perChromTables = no
 
 
     cvs ci -m "Added anoCar1." etc/genbank.conf
     # update /cluster/data/genbank/:
     make etc-update
 
     # Edit src/lib/gbGenome.c to add new species.
     #
     cvs ci -m "Added Anolis carolinensis (lizard)." src/lib/gbGenome.c
     make install-server
 
     cd /cluster/data/genbank
     screen
 
     # This is a call to a script that will push our jobs out to the cluster
     # since it's a big job.  
     nice -n +19 bin/gbAlignStep -initial anoCar1 &
     # logFile: var/build/logs/2007.02.19-22:10:25.anoCar1.initalign.log
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
     #	real    8m9.108s
 
     # enable daily alignment and update of hgwdev (DONE - 2007-02-20 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add anoCar1 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added anoCar1." etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 ###   (2007-05-16 markd)
     # modify genbank to not load native RefSeq, since there are none.
     # remove empty files and rerun gbDbLoadStep
     anoCar1.refseq.mrna.native.load = no
     nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad anoCar1
     
 
 ##########################################################################
 ## BLASTZ FROG xenTro2 (WORKING - 2007-02-20 - Hiram)
     ssh kkstore04
     mkdir /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
     cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
 
     cat << '_EOF_' > DEF
 # Lizard vs frog
 BLASTZ_H=2000
 BLASTZ_Y=3400
 BLASTZ_L=8000
 BLASTZ_K=2200
 BLASTZ_M=50
 BLASTZ_Q=/cluster/data/blastz/HoxD55.q
 
 # TARGET: Lizard AnoCar1 - largest chunk big enough for largest scaffold
 SEQ1_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
 SEQ1_LEN=/san/sanvol1/scratch/anoCar1/chrom.sizes
 SEQ1_CHUNK=20000000
 SEQ1_LAP=10000
 SEQ1_LIMIT=30
 
 # TARGET: Frog xenTro2 - single chunk big enough for the largest scaffold
 SEQ2_DIR=/san/sanvol1/scratch/xenTro2/xenTro2.sdTrf.2bit
 SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
 SEQ2_CHUNK=8000000
 SEQ2_LIMIT=30
 SEQ2_LAP=0
 
 BASE=/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20
 TMPDIR=/scratch/tmp
 '_EOF_'
     # << happy emacs
 
     time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > do.log 2>&1 &
     ## real    1522m46.550s
     ## this broke down during chaining because this file:
 # -rw-rw-r--  1 5982855 Feb 21 14:39 ../../pslParts/part039.lst.psl.g
     ## has a bogus character control-V in the middle on a number
     ## manually running through that one chain step with that line removed
     ##	got the chaining completed, then continuing:
     time doBlastzChainNet.pl -verbose=2 DEF -bigClusterHub=pk \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-continue=chainMerge \
 	-blastzOutRoot /cluster/bluearc/anoCar1XenTro2 > chainMerge.log 2>&1 &
     #	real    119m12.634s
 
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20]
     time nice -n +19 featureBits anoCar1 chainXenTro2Link \
 	> fb.anoCar1.chainXenTro2Link.txt 2>&1
     #	real    11m33.086s
     #	83873500 bases of 1741478929 (4.816%) in intersection
 
     ssh kkstore04
     mkdir /cluster/data/xenTro2/bed/blastz.anoCar1.swap
     cd /cluster/data/xenTro2/bed/blastz.anoCar1.swap
 
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/anoCar1/bed/blastz.xenTro2.2007-02-20/DEF \
 	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-swap > swap.log 2>&1 &
 
 
 ############################################################################
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("anoCar1", "blat11", "17780", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("anoCar1", "blat11", "17781", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 #########################################################################
 ## BLASTZ mm8/Mouse swap (DONE - 2007-02-20 - Hiram)
     ##	the original blastz to mm8 measured
     time nice -n +19 featureBits mm8 chainAnoCar1Link \
 	> fb.mm8.chainAnoCar1Link.txt 2>&1
     #	real    1m37.380s
     #	106743952 bases of 1042591351 (10.238%) in intersection
 
     ssh kkstore04
     mkdir /cluster/data/anoCar1/bed/blastz.mm8.swap
     cd /cluster/data/anoCar1/bed/blastz.mm8.swap
 
     time doBlastzChainNet.pl -verbose=2 \
 	/cluster/data/mm8/bed/blastz.anoCar1.2007-02-19/DEF \
 	-chainMinScore=5000 -chainLinearGap=loose \
 	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
 	-bigClusterHub=pk \
 	-swap > swap.log 2>&1 &
 
     ssh hgwdev
     cd /cluster/data/anoCar1/bed/blastz.mm8.swap
     time nice -n +19 featureBits anoCar1 chainMm8Link \
 	> fb.anoCar1.chainMm8Link.txt 2>&1
     #	real    2m1.527s
     #	82784787 bases of 1741478929 (4.754%) in intersection
 
     #	% Coverage of Lizard by:    (chainMinScore,chainLinearGap,type masking)
     #	6.456 - Human hg18	(5000,loose,windowMasker)
     #	6.263 - Chicken galGal3	(5000,loose,windowMasker)
     #	4.816 - Frog xenTro2	(5000,loose,windowMasker)
     #	4.754 - Mouse mm8	(5000,loose,windowMasker)
     #	3.127 - Stickleback gasAcu1	(5000,loose,windowMasker)
 
     #	% coverage of Chicken by:
     #	10.238 - Lizard anoCar1	(5000,loose,windowMasker)
     #	8.795 - Human hg18	(5000,loose,rmsk)
     #	6.745 - Mouse mm8	(5000,loose,rmsk)
     #	5.330 - Frog xenTro2	(5000,loose,rmsk)
     #	3.144 - Stickleback gasAcu1	(2000,loose,windowMasker)
 
     #	% coverage of Frog by:
     #	6.217 - Lizard anoCar1	(5000,loose,windowMasker)
     #	5.634 - Human hg18	(5000,loose,rmsk)
     #	5.358 - Mouse mm8	(5000,loose,rmsk)
     #	4.776 - Chicken galGal3	(5000,loose,rmsk)
     #	x.xxx - Stickleback gasAcu1	(not yet done)
 
     #	% coverage of Human by:
     #	34.514 - Mouse mm8	(3000,medium,rmsk)
     #	4.774 - Lizard anoCar1	(5000,loose,windowMasker)
     #	3.589 - Chicken galGal3	(5000,loose,rmsk)
     #	2.623 - Frog xenTro2	(5000,loose,rmsk)
     #	1.923 - Stickleback gasAcu1	(2000,loose,rmsk)
 
 ##########################################################################
 ## RepeatMasker run to cover all bases (DONE - 2007-03-07 - Hiram)
     ssh kkstore02
     mkdir /cluster/data/anoCar1/bed/RepeatMasker
     cd /cluster/data/anoCar1/bed/RepeatMasker
     time nice -n +19 doRepeatMasker.pl -verbose=2 -bigClusterHub=kk \
 	-buildDir=/cluster/data/anoCar1/bed/RepeatMasker anoCar1 > do.log 2>&1 &
 
 ############################################################################
 ##  DOWNLOADS - (DONE - 2007-02-12 - 2007-02-16 - Hiram)
     ssh hgwdev
     cd /cluster/data/anoCar1
     ln -s bed/RepeatMasker/anoCar1.fa.out .
 
     ~/kent/src/hg/utils/automation/makeDownloads.pl anoCar1 \
 	> makeDownloads.out 2>&1
     #	Doesn't work due to missing Repeat masker outputs
     #	Create WindowMasker separate files by chrom, for downloads
     ssh kkstore05
     cd /cluster/data/anoCar1/goldenPath/bigZips
     ln -s ../../bed/WindowMasker.2007-02-16/windowmasker.sdust.bed.gz \
 	./anoCar1.WMSdust.bed.gz
 
     # get GenBank native mRNAs
     ssh hgwdev
     cd /cluster/data/genbank
     ./bin/x86_64/gbGetSeqs -db=anoCar1 -native \
 	GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/mrna.fa
     # get GenBank xeno mRNAs
     ./bin/x86_64/gbGetSeqs -db=anoCar1 -xeno \
 	GenBank mrna /cluster/data/anoCar1/goldenPath/bigZips/xenoMrna.fa
     ssh kkstore05
     cd /cluster/data/anoCar1/goldenPath/bigZips
     gzip *.fa
     md5sum *.gz > md5sum.txt
     #	Edit the README.txt file to be correct
     ssh hgwdev
     cd /usr/local/apache/htdocs/goldenPath/anoCar1/bigZips
     ln -s /cluster/data/anoCar1/goldenPath/bigZips/* .
 
 ############################################################################
 ## Default position set at IFG-1 (DONE - 2007-04-09 - Hiram)
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="scaffold_72:3056494-3141055"
 	where name="anoCar1";' hgcentraltest
 
 
 ############################################################################
 # SWAP ORNANA1 CHAIN/NET (DONE 5/2/07 angie)
     ssh kkstore05
     mkdir /cluster/data/anoCar1/bed/blastz.ornAna1.swap
     cd /cluster/data/anoCar1/bed/blastz.ornAna1.swap
     doBlastzChainNet.pl -swap \
       /cluster/data/ornAna1/bed/blastz.anoCar1/DEF >& do.log &
     tail -f do.log
     ln -s blastz.ornAna1.swap /cluster/data/anoCar1/bed/blastz.ornAna1
 
 
 ############################################################################
 # SWAP Mouse Mm9 chain/net (DONE - 2007-09-21 - hiram)
     ssh kkstore04
     screen # control this sequence with screen
     #	the original
     cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
     cat  fb.mm9.chainAnoCar1Link.txt
     #	89239796 bases of 2620346127 (3.406%) in intersection
 
     #	and for the swap
     mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
     cd /cluster/data/anoCar1/bed/blastz.mm9.swap
     time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
 	/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
 	-swap -qRepeats=windowmaskerSdust \
 	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
     #	real    29m12.291s
     cat fb.anoCar1.chainMm9Link.txt
     #	85923556 bases of 1741478929 (4.934%) in intersection
 
 #########################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20
 
 see doc/builds.txt for specific details.
 ############################################################################
 ############################################################################
 # TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
 by a single Makefile. This is available from:
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30
 
 see doc/builds.txt for specific details.
 ############################################################################
+############################################################################
+# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
+
+vertebrate-wide transMap alignments were built  Tracks are created and loaded
+by a single Makefile. This is available from:
+   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
+
+see doc/builds.txt for specific details.
+############################################################################