src/hg/makeDb/doc/loxAfr3.txt 1.4

1.4 2009/08/27 18:05:07 hiram
lastz alignments done between mouse and elephant
Index: src/hg/makeDb/doc/loxAfr3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/loxAfr3.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 1000000 -r1.3 -r1.4
--- src/hg/makeDb/doc/loxAfr3.txt	23 Jul 2009 15:14:50 -0000	1.3
+++ src/hg/makeDb/doc/loxAfr3.txt	27 Aug 2009 18:05:07 -0000	1.4
@@ -1,304 +1,324 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how we made the elephant browser database on
 #  Broad Institute loxAfr3 (NCBI project 12569, AAGU03000000)
 
 #	"$Id$";
 
 #############################################################################
 # Download sequence (DONE - 2009-07-15 - Hiram)
     mkdir -p /hive/data/genomes/loxAfr3/broad
     /hive/data/genomes/loxAfr3/broad
     wget --timestamping \
 "ftp://ftp.broadinstitute.org/pub/assemblies/mammals/elephant/loxAfr3/*"
 
     #	lift quality scores to scaffold coordinates
     qaToQac assembly.quals.gz assembly.quals.qac
     qacAgpLift assembly.agp assembly.quals.qac loxAfr3.quals.qac
 
 #############################################################################
 # Elephant loxAfr3 browser initialization (DONE - 2009-07-15 - Hiram)
     cd /hive/data/genomes/loxAfr3
     cat << '_EOF_' > loxAft3.config.ra
 # Config parameters for makeGenomeDb.pl:
 db loxAfr3
 clade mammal
 genomeCladePriority 35
 scientificName Loxodonta africana
 commonName Elephant
 assemblyDate Jul. 2009
 assemblyLabel Broad Institute loxAfr3 (NCBI project 12569, AAGU03000000)
 orderKey 340
 mitoAcc NC_000934
 fastaFiles /hive/data/genomes/loxAfr3/broad/assembly.bases.gz
 agpFiles /hive/data/genomes/loxAfr3/broad/assembly.agp
 qualFiles /hive/data/genomes/loxAfr3/broad/loxAfr3.quals.qac
 dbDbSpeciesDir elephant
 taxId 9785
 '_EOF_'
     # << happy emacs
     #	run stepwise to verify each step
     makeGenomeDb.pl -stop=seq loxAfr3.config.ra > seq.log 2>&1
     makeGenomeDb.pl -continue=agp -stop=agp loxAfr3.config.ra > agp.log 2>&1
     makeGenomeDb.pl -continue=db -stop=db loxAfr3.config.ra > db.log 2>&1
     makeGenomeDb.pl -continue=dbDb -stop=dbDb loxAfr3.config.ra > dbDb.log 2>&1
     makeGenomeDb.pl -continue=trackDb loxAfr3.config.ra > trackDb.log 2>&1
 
 
 #############################################################################
 # loxAfr3 repeatMasker (DONE - 2009-07-15 - Hiram)
     mkdir /hive/data/genomes/loxAfr3/bed/repeatMasker
     cd /hive/data/genomes/loxAfr3/bed/repeatMasker
     doRepeatMasker.pl -buildDir=`pwd` loxAfr3 > do.log 2>&1
     #	about 6 hours
     cat faSize.rmsk.txt
     #	3196760833 bases (78195493 N's 3118565340 real 1633809371 upper
     #	1484755969 lower) in 2353 sequences in 1 files
 
 #############################################################################
 # loxAfr3 simpleRepeat (DONE - 2009-07-15 - Hiram)
     mkdir /hive/data/genomes/loxAfr3/bed/simpleRepeat
     cd /hive/data/genomes/loxAfr3/bed/simpleRepeat
     time doSimpleRepeat.pl -buildDir=`pwd` loxAfr3 > do.log 2>&1
     #	real    23m42.536s
     cat fb.simpleRepeat
     #	27746420 bases of 3118565340 (0.890%) in intersection
 
     #	add to RM after done above:
     twoBitMask bed/repeatMasker/loxAfr3.clean.2bit \
 	-add bed/simpleRepeat/trfMask.bed loxAfr3.2bit
     twoBitToFa loxAfr3.2bit stdout | faSize stdin > faSize.loxAfr3.2bit.txt
 
 #############################################################################
 # create ooc file and populate /scratch/data (DONE - 2009-07-16 - Hiram)
     #	repMatch = 1024 * sizeof(loxAfr3)/sizeof(hg19)
     #	1102 = 1024 * (3118565340/2897310462)
     time blat loxAfr3.2bit \
 	/dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/loxAfr3.11.ooc \
 	-repMatch=1100
     #	Wrote 41026 overused 11-mers to jkStuff/loxAfr3.11.ooc
 
     mkdir /hive/data/staging/data/loxAfr3
     cp -p loxAfr3.2bit /hive/data/staging/data/loxAfr3
     cp -p jkStuff/loxAfr3.11.ooc /hive/data/staging/data/loxAfr3
     cp -p chrom.sizes /hive/data/staging/data/loxAfr3
 
     #	request push to kluster nodes
 
 ##########################################################################
 ## GENBANK alignments (DONE - 2007-08-03 - Hiram)
     cd $HOME/kent/src/hg/makeDb/genbank/etc
     cvs up
     # edit genbank.conf and add the following entry just above loxAfr1:
 # loxAfr3 (elephant)
 loxAfr3.serverGenome = /hive/data/genomes/loxAfr3/loxAfr3.2bit
 loxAfr3.clusterGenome = /scratch/data/loxAfr3/loxAfr3.2bit
 loxAfr3.ooc = /scratch/data/loxAfr3/loxAfr3.11.ooc
 loxAfr3.lift = no
 loxAfr3.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
 loxAfr3.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
 loxAfr3.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
 loxAfr3.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
 loxAfr3.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
 loxAfr3.refseq.mrna.native.load = yes
 loxAfr3.refseq.mrna.xeno.load = yes
 loxAfr3.genbank.mrna.xeno.load = yes
 loxAfr3.genbank.est.native.load = no
 loxAfr3.downloadDir = loxAfr3
 loxAfr3.perChromTables = no
 
     # after commiting that edit, install thusly:
     cd $HOME/kent/src/hg/makeDb/genbank
     make etc-update
 
     ssh genbank
     screen	# use a screen to manage this long lived job
     cd /cluster/data/genbank
     time nice -n +19 bin/gbAlignStep -initial loxAfr3 &
     ##	logFile: var/build/logs/2009.07.21-10:14:29.loxAfr3.initalign.log
     #	real    174m1.258s
 
     ssh hgwdev
     cd /cluster/data/genbank
     time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad loxAfr3
     #	var/dbload/hgwdev/logs/2009.07.21-13:22:25.dbload.log
     #	real    14m0.774s
 
     featureBits loxAfr3 xenoMrna
     #	65367813 bases of 3118565340 (2.096%) in intersection
     featureBits loxAfr3 xenoRefGene
     #	50364043 bases of 3118565340 (1.615%) in intersection
     featureBits loxAfr3 all_mrna
     #	15550 bases of 3118565340 (0.000%) in intersection
 
     # enable daily alignment and update of hgwdev (DONE - 2009-07-21 - Hiram)
     cd ~/kent/src/hg/makeDb/genbank
     cvsup
     # add mm9 to:
         etc/align.dbs
         etc/hgwdev.dbs
     cvs ci -m "Added loxAfr3 - Loxodonta africana" etc/align.dbs etc/hgwdev.dbs
     make etc-update
 
 ##########################################################################
 # HUMAN (hg18) PROTEINS TRACK (DONE 2009-07-23 braney )
     # bash  if not using bash shell already
 
     cd /cluster/data/loxAfr3
     mkdir /cluster/data/loxAfr3/blastDb
 
     awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
     twoBitToFa -seqList=1meg.lst  loxAfr3.2bit temp.fa
     faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
     rm temp.fa 1meg.lst
 
     awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
     twoBitToFa -seqList=less1meg.lst  loxAfr3.2bit temp.fa
     faSplit about temp.fa 1000000 blastDb/y 
     rm temp.fa less1meg.lst
 
     cd blastDb
     for i in *.fa
     do
 	/hive/data/outside/blast229/formatdb -i $i -p F
     done
     rm *.fa
     ls *.nsq | wc -l
 # 3934
 
     mkdir -p /cluster/data/loxAfr3/bed/tblastn.hg18KG
     cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
     echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
     wc -l query.lst
 # 3934 query.lst
 
    # we want around 250000 jobs
    calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)
 
 # 36727/(250000/3934) = 577.936072
 
    mkdir -p kgfa
    split -l 578 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
    cd kgfa
    for i in *; do 
      nice pslxToFa $i $i.fa; 
      rm $i; 
    done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    wc kg.lst
 #  64  64 832 kg.lst
 
    mkdir -p blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
    cat << '_EOF_' > blastGsub
 #LOOP
 blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
 #ENDLOOP
 '_EOF_'
 
    cat << '_EOF_' > blastSome
 #!/bin/sh
 BLASTMAT=/hive/data/outside/blast229/data
 export BLASTMAT
 g=`basename $2`
 f=/tmp/`basename $3`.$g
 for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
 do
 if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
 then
         mv $f.8 $f.1
         break;
 fi
 done
 if test -f  $f.1
 then
     if /cluster/bin/i386/blastToPsl $f.1 $f.2
     then
 	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/loxAfr3/blastDb.lft carry $f.2
         liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
         if pslCheck -prot $3.tmp
         then                  
             mv $3.tmp $3     
             rm -f $f.1 $f.2 $f.3 $f.4
         fi
         exit 0               
     fi                      
 fi                         
 rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
 exit 1
 '_EOF_'
     # << happy emacs
     chmod +x blastSome
     exit 
     
     ssh swarm
     cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
     gensub2 query.lst kg.lst blastGsub blastSpec
     para create blastSpec
 #    para try, check, push, check etc.
 
     para time
 # Completed: 251776 of 251776 jobs
 # CPU time in finished jobs:   18571030s  309517.16m  5158.62h  214.94d  0.589 y
 # IO & Wait Time:               1488748s   24812.47m   413.54h   17.23d  0.047 y
 # Average job time:                  80s       1.33m     0.02h    0.00d
 # Longest finished job:             287s       4.78m     0.08h    0.00d
 # Submission to last job:         21050s     350.83m     5.85h    0.24d
 
     ssh swarm
     cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
     mkdir chainRun
     cd chainRun
     tcsh
     cat << '_EOF_' > chainGsub
 #LOOP
 chainOne $(path1)
 #ENDLOOP
 '_EOF_'
 
     cat << '_EOF_' > chainOne
 (cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=12000 stdin ../c.`basename $1`.psl)
 '_EOF_'
     chmod +x chainOne
     ls -1dS ../blastOut/kg?? > chain.lst
     gensub2 chain.lst single chainGsub chainSpec
     # do the cluster run for chaining
     para create chainSpec
     para try, check, push, check etc.
 
 # Completed: 64 of 64 jobs
 # CPU time in finished jobs:      29478s     491.30m     8.19h    0.34d  0.001 y
 # IO & Wait Time:                 19822s     330.37m     5.51h    0.23d  0.001 y
 # Average job time:                 770s      12.84m     0.21h    0.01d
 # Longest finished job:            2868s      47.80m     0.80h    0.03d
 # Submission to last job:          2882s      48.03m     0.80h    0.03d
 
     cd /cluster/data/loxAfr3/bed/tblastn.hg18KG/blastOut
     for i in kg??
     do
        cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
        echo $i
     done
     sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
     cd ..
     pslCheck blastHg18KG.psl
 # checked: 43305 failed: 0 errors: 0
 
     # load table 
     ssh hgwdev
     cd /cluster/data/loxAfr3/bed/tblastn.hg18KG
     hgLoadPsl loxAfr3 blastHg18KG.psl
 
     # check coverage
     featureBits loxAfr3 blastHg18KG 
 # 42480830 bases of 3118565340 (1.362%) in intersection
 
     featureBits loxAfr3 blastHg18KG xenoRefGene  -enrichment
 # blastHg18KG 1.362%, xenoRefGene 1.615%, both 0.695%, cover 51.03%, enrich 31.60x
     rm -rf blastOut
 #end tblastn
+
+#########################################################################
+# lastz swap from Mouse Mm9 (DONE - 2009-08-27 - Hiram)
+    # original alignment
+    cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
+    cat fb.mm9.chainLoxAfr3Link.txt
+    #	684326090 bases of 2620346127 (26.116%) in intersection
+
+    mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
+    cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
+    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \
+	-swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
+	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
+	-syntenicNet > swap.log 2>&1 &
+    #	real    123m9.342s
+    cat fb.loxAfr3.chainMm9Link.txt 
+    #	673856452 bases of 3118565340 (21.608%) in intersection
+
+#########################################################################