src/hg/makeDb/doc/danRer6.txt 1.19

1.19 2010/05/26 19:27:52 galt
multiz6way, annotated, mafSummary, phylotree, phastCons
Index: src/hg/makeDb/doc/danRer6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer6.txt,v
retrieving revision 1.18
retrieving revision 1.19
diff -b -B -U 4 -r1.18 -r1.19
--- src/hg/makeDb/doc/danRer6.txt	5 Apr 2010 18:20:01 -0000	1.18
+++ src/hg/makeDb/doc/danRer6.txt	26 May 2010 19:27:52 -0000	1.19
@@ -1042,9 +1042,9 @@
       /usr/local/apache/htdocs/goldenPath/danRer6/chromosomes
 
 
 #########################################################################
-## 6-Way Multiz (WORKING - 2010-02-01 - Galt)
+## 6-Way Multiz (DONE - 2010-02-01 - Galt)
     ssh hgwdev
     mkdir /cluster/data/danRer6/bed/multiz6way
     cd /cluster/data/danRer6/bed/multiz6way
 
@@ -1378,4 +1378,369 @@
     doSameSpeciesLiftOver.pl -debug danRer6 danRer5
     cd /hive/data/genomes/danRer6/bed/blat.danRer5.2010-04-02
     doSameSpeciesLiftOver.pl danRer6 danRer5 >& do.log & tail -f do.log
 
+
+#########################################################################
+# Phylogenetic tree from 6-way (DONE 2010-05-20 galt)
+# 	We need one tree for all chroms
+
+    cd /hive/data/genomes/danRer6/bed/multiz6way/
+    mkdir mafSplit
+    cd mafSplit
+    mafSplit -byTarget -useFullSequenceName /dev/null . ../multiz6way.maf
+    # got 6660 mafs, each named after its chrom/scaffold (<name>.maf);
+    # although there are over 11,000 chroms and scaffolds,
+    # some are too small or have nothing aligning.
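+
+    # optional sanity check: the file count should match the 6660 noted above
+    ls -1 *.maf | wc -l
+    # 6660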
+
+    mkdir /hive/data/genomes/danRer6/bed/multiz6way/4d
+    cd /hive/data/genomes/danRer6/bed/multiz6way/4d
+
+    # danRer6 has too few 'Reviewed' RefSeqs to be picky, so drop this clause:
+    #   refSeqStatus.status='Reviewed' 
+    hgsql danRer6 -Ne \
+    "select * from refGene,refSeqStatus where refGene.name=refSeqStatus.mrnaAcc and mol='mRNA'" \
+	| cut -f 2-20 | egrep -E -v "chrM" \
+	> refSeqReviewed.gp
+    wc -l refSeqReviewed.gp
+    # 15261 refSeqReviewed.gp
+    genePredSingleCover refSeqReviewed.gp stdout | sort > refSeqReviewedNR.gp
+    wc -l refSeqReviewedNR.gp
+    # 14609 refSeqReviewedNR.gp
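+
+    # optional check: field 2 of a genePred is the chrom; these names must
+    # match the sequence names used by the split mafs for msa_view below
+    cut -f2 refSeqReviewedNR.gp | sort -u | wc -l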
+
+    ssh memk
+    mkdir /hive/data/genomes/danRer6/bed/multiz6way/4d/run
+    cd /hive/data/genomes/danRer6/bed/multiz6way/4d/run
+    mkdir ../mfa
+
+# whole chrom mafs version, using the memory-efficient version of phast
+# from Melissa Hubisz at Cornell
+#	mjhubisz at gmail.com
+
+    cat << '_EOF_' > 4d.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set r = "/hive/data/genomes/danRer6/bed/multiz6way"
+set c = $1
+set infile = $r/mafSplit/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf    
+awk -v C=$c '$2 == C {print}' $r/4d/refSeqReviewedNR.gp > $c.gp
+set NL=`wc -l $c.gp| gawk '{print $1}'`
+if ("$NL" != "0") then
+    set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+    $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+    $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
+else
+    echo "" > $r/4d/run/$outfile
+endif
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+    # << happy emacs
+    chmod +x 4d.csh
+
+    ls -1S /hive/data/genomes/danRer6/bed/multiz6way/mafSplit/*.maf | \
+        egrep -E -v "chrM" \
+	| sed -e "s#.*multiz6way/mafSplit/##" \
+	> maf.list
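+
+    # optional: run one job by hand first to verify the script
+    # (chr25 is just an example; any root name from maf.list should work)
+    ./4d.csh chr25 chr25.maf ../mfa/chr25.mfa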
+
+    cat << '_EOF_' > template
+#LOOP
+4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    gensub2 maf.list single template stdout | tac > jobList
+    para create jobList
+    para try ... check ... push ... etc
+    para time
+# Completed: 23 of 23 jobs
+# CPU time in finished jobs:       9032s     150.53m     2.51h    0.10d  0.000 y
+# IO & Wait Time:                   672s      11.21m     0.19h    0.01d  0.000 y
+# Average job time:                 422s       7.03m     0.12h    0.00d
+# Longest finished job:             860s      14.33m     0.24h    0.01d
+# Submission to last job:          1210s      20.17m     0.34h    0.01d
+
+    # combine mfa files
+    ssh hgwdev
+    cd /hive/data/genomes/danRer6/bed/multiz6way/4d
+    # but first clean out the junk 1-byte leftovers from the process above
+    # (the 'echo "" > outfile' branch in 4d.csh writes these for chroms
+    # with no genes in refSeqReviewedNR.gp)
+    cd mfa
+    find -type f -size 1c | xargs -iX rm X
+    find -type f -size 0c | xargs -iX rm X
+    cd ..
+    # note: this wants a comma-less (whitespace-separated) species.lst
+    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+	--aggregate "`cat ../species.lst`" mfa/*.mfa | sed s/"> "/">"/ \
+	> 4d.all.mfa
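+
+    # quick check: one FASTA header per species is expected
+    grep -c "^>" 4d.all.mfa
+    # expect 6, one per species in ../species.lst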
+
+    # fix order in case it helps:
+    #((danRer6,(tetNig2,gasAcu1)),(xenTro2,(mm9,hg19)))
+
+    # use phyloFit to create tree model (output is phyloFit.mod)
+    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+	--EM --precision MED --msa-format FASTA --subst-mod REV \
+	--tree ../tree-commas.nh 4d.all.mfa
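+
+    # the fitted model is written to phyloFit.mod in this directory;
+    # its TREE line carries the estimated branch lengths
+    grep TREE phyloFit.mod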
+
+
+#############################################################################
+# phastCons 6-way (WORKING - 2010-05-13 - Galt)
+    #	was unable to split the full chrom MAF files, so work from the
+    #	per-chrom/scaffold maf files produced by mafSplit above
+
+    # split the 6-way mafs into 10M chunks and generate sufficient statistics
+    # files for phastCons
+
+    ssh swarm
+    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18
+    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18
+    mkdir ss
+
+    cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set MAF = /hive/data/genomes/danRer6/bed/multiz6way/mafSplit/$c.maf
+set WINDOWS = /hive/data/genomes/danRer6/bed/multiz6way/cons/msa.split/2010-05-18/ss/$c
+set WC = `cat $MAF | wc -l`
+set NL = `grep "^#" $MAF | wc -l`
+if ( -s $2 ) then
+    exit 0
+endif
+if ( -s $2.running ) then
+    exit 0
+endif
+
+date >> $2.running
+
+rm -fr $WINDOWS
+mkdir $WINDOWS
+pushd $WINDOWS > /dev/null
+if ( $WC != $NL ) then
+/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
+    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
+popd > /dev/null
+date >> $2
+rm -f $2.running
+'_EOF_'
+    # << happy emacs
+    chmod +x doSplit.csh
+
+    cat << '_EOF_' > template
+#LOOP
+doSplit.csh $(root1) {check out line+ $(root1).done}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    #	do the easy ones first to see some immediate results
+    ls -1S -r ../../../mafSplit | sed -e "s/.maf//" > maf.list
+
+    gensub2 maf.list single template jobList
+    para -ram=8g create jobList
+    para try ... check ... etc
+
+# Completed: 6660 of 6660 jobs
+# CPU time in finished jobs:        520s       8.67m     0.14h    0.01d  0.000 y
+# IO & Wait Time:                 47106s     785.10m    13.08h    0.55d  0.001 y
+# Average job time:                   7s       0.12m     0.00h    0.00d
+# Longest finished job:              41s       0.68m     0.01h    0.00d
+# Submission to last job:           667s      11.12m     0.19h    0.01d
+# Estimated complete:                 0s       0.00m     0.00h    0.00d
+
+
+    # some scaffolds were too small to produce output. 
+    # expected 6660, found 4273.
+    cd ss
+    find -type f | wc -l
+    #4273
+    cd ..
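+
+    # optional: doSplit.csh makes a directory per scaffold even when msa_split
+    # writes nothing, so the empty directories are the scaffolds with no output
+    find ss -mindepth 1 -maxdepth 1 -type d -empty | wc -l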
+
+    # Run phastCons
+    #	This job is I/O intensive in its output files, beware where this
+    #	takes place or do not run too many at once.
+    ssh swarm
+    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/cons/run.cons
+    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/run.cons
+
+    #	there is going to be only one phastCons run using
+    #	this same script.  It triggers off of the current working directory
+    #	$cwd:t which is the "grp" in this script.  It is:
+    #	all 
+
+    cat << '_EOF_' > doPhast.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set c = $1
+set cX = $1:r
+set f = $2
+set len = $3
+set cov = $4
+set rho = $5
+set grp = $cwd:t
+set cons = /hive/data/genomes/danRer6/bed/multiz6way/cons
+set tmp = $cons/tmp/$f
+mkdir -p $tmp
+set ssSrc = $cons
+set useGrp = "$grp.mod"
+ln -s $ssSrc/msa.split/2010-05-18/ss/$c/$f.ss $tmp
+ln -s $cons/$grp/$grp.mod $tmp
+pushd $tmp > /dev/null
+$PHASTBIN/phastCons $f.ss $useGrp \
+    --rho $rho --expected-length $len --target-coverage $cov --quiet \
+    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+popd > /dev/null
+mkdir -p pp/$c bed/$c
+sleep 4
+touch pp/$c bed/$c
+rm -f pp/$c/$f.pp
+rm -f bed/$c/$f.bed
+mv $tmp/$f.pp pp/$c
+mv $tmp/$f.bed bed/$c
+rm -fr $tmp
+'_EOF_'
+    # << happy emacs
+    chmod a+x doPhast.csh
+
+    #	this template will serve for all runs
+    #	root1 == chrom name, file1 == ss file name without .ss suffix
+    cat << '_EOF_' > template
+#LOOP
+../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    ls -1S ../msa.split/2010-05-18/ss/*/* | sed -e 's/.ss$//' > ss.list
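+
+    # sanity check the batch size
+    wc -l ss.list
+    # 4273, matching the ss file count above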
+
+    # Create parasol batch and run it
+    # run for all species
+    cd /hive/data/genomes/danRer6/bed/multiz6way/cons
+    mkdir -p all
+    cd all
+    #	Using the .mod tree
+    cp -p ../../4d/phyloFit.mod ./all.mod
+
+    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+    para -ram=8g create jobList
+    para try ... check ... push ... etc.
+# Completed: 4273 of 4273 jobs
+# CPU time in finished jobs:       2323s      38.72m     0.65h    0.03d  0.000 y
+# IO & Wait Time:                 68627s    1143.78m    19.06h    0.79d  0.002 y
+# Average job time:                  17s       0.28m     0.00h    0.00d
+# Longest finished job:              37s       0.62m     0.01h    0.00d
+# Submission to last job:           338s       5.63m     0.09h    0.00d
+
+
+    # create Most Conserved track
+    ssh hgwdev
+    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/all
+
+    cat bed/*/*.bed | sort -k1,1 -k2,2n \
+	| awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
+    	> tmpMostConserved.bed
+    /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
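+
+    # optional check: lodToBedScore rescales the lod values into the BED
+    # score column; the 5th column is expected to stay within 0-1000
+    awk '{print $5}' mostConserved.bed | sort -n | sed -n '1p;$p'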
+
+    # load into database
+    nice hgLoadBed danRer6 phastConsElements6way mostConserved.bed
+    # Loaded 881975 elements of size 5
+    # Saving bed.tab
+    # start -1, end 426 out of range in findBin (max is 512M)
+    cat mostConserved.bed | gawk '{if ($3 == 426) print $0 }'
+    # Zv8_NA5813      -1      426     lod=93  380
+    # there's only one record like this with chromStart == -1;
+    # it is the first record of a set
+
+    grep Zv8_NA5813 mostConserved.bed
+    # Zv8_NA5813      -1      426     lod=93  380
+    # Zv8_NA5813      520     666     lod=41  286
+    # Zv8_NA5813      742     832     lod=28  242
+
+    grep Zv8_NA5813 tmpMostConserved.bed
+    # Zv8_NA5813      -1      426     lod=93  93
+    # Zv8_NA5813      520     666     lod=41  41
+
+    cat bed/Zv8_NA5813/Zv8_NA5813.0-6928.bed
+    # Zv8_NA5813      -1      426     Zv8_NA5813.1    93      +
+    # Zv8_NA5813      520     666     Zv8_NA5813.1    41      +
+    # Zv8_NA5813      742     832     Zv8_NA5813.2    28      +
+
+    # I have manually patched mostConserved.bed, changing the -1 to 0 for
+    #   Zv8_NA5813      -1      426     lod=93  93
+    # and re-ran the same load command above, this time without error.
+
+    # also manually patched pp/Zv8_NA5813/Zv8_NA5813.0-6928.pp
+    # and pp/Zv8_NA6933/Zv8_NA6933.0-2411.pp
+    # so that the first line has start=1 instead of start=0
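+
+    # a non-interactive way to apply the same patches (a sketch; it assumes
+    # the record above is the only one with chromStart == -1, and that the
+    # two .pp files begin with a fixedStep header line containing start=0)
+    awk 'BEGIN{OFS="\t"} $2 == -1 {$2 = 0} {print}' mostConserved.bed \
+        > mostConserved.patched.bed && mv mostConserved.patched.bed mostConserved.bed
+    sed -i '1s/start=0/start=1/' pp/Zv8_NA5813/Zv8_NA5813.0-6928.pp
+    sed -i '1s/start=0/start=1/' pp/Zv8_NA6933/Zv8_NA6933.0-2411.pp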
+
+    # Try for 5% overall cov, and 70% CDS cov 
+    featureBits danRer6 -enrichment refGene:cds phastConsElements6way
+    #	--rho 0.3 --expected-length 45 --target-coverage 0.3
+    #   refGene:cds 1.244%, phastConsElements6way 12.225%, 
+    #   both 1.029%, cover 82.75%, enrich 6.77x
+
+    # hg19 for comparison
+    #	refGene:cds 1.187%, phastConsElements46way 5.065%,
+    #	both 0.884%, cover 74.46%, enrich 14.70x
+
+    # Create merged posterior probability file and wiggle track data files
+    cd /hive/data/genomes/danRer6/bed/multiz6way/cons/all
+    mkdir downloads
+
+    cat pp/*/*.pp | gzip > downloads/phastCons6way.wigFix.gz
+
+
+    #	encode those files into wiggle data
+    zcat downloads/phastCons6way.wigFix.gz \
+	| wigEncode stdin phastCons6way.wig phastCons6way.wib
+    #   Converted stdin, upper limit 1.00, lower limit 0.00
+    du -hsc *.wi?
+    # 453M    phastCons6way.wib
+    # 97M     phastCons6way.wig
+    # 549M    total
+
+
+    # Load gbdb and database with wiggle.
+    ln -s `pwd`/phastCons6way.wib /gbdb/danRer6/multiz6way/phastCons6way.wib
+    nice hgLoadWiggle -pathPrefix=/gbdb/danRer6/multiz6way danRer6 \
+	phastCons6way phastCons6way.wig
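+
+    # quick verification that the wiggle table loaded
+    hgsql danRer6 -Ne "select count(*) from phastCons6way"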
+
+  # *** BOOKMARK ***
+
+  # use the output below to set trackDb.ra entries for wiggle min and max
+
+    wigTableStats.sh danRer6 phastCons6way
+# db.table             min max mean   count     sumData     stdDev   viewLimits
+#danRer6.phastCons6way   0 1 0.712241 237215280 1.68955e+08 0.337377 viewLimits=0:1
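+
+    # a sketch of how min/max feed into the track's trackDb.ra stanza
+    # (labels are illustrative; only the "type wig <min> <max>" line
+    # comes from the stats above):
+    #   track phastCons6way
+    #   shortLabel Cons 6-way
+    #   longLabel 6-Way Multiz Alignment & Conservation
+    #   type wig 0 1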
+
+    #  Create histogram to get an overview of all the data
+    hgWiggle -doHistogram -db=danRer6 \
+	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+	    phastCons6way >& histogram.data 
+
+    #	create plot of histogram:
+
+#orig set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+    cat << '_EOF_' | gnuplot > histo.png
+set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Zebrafish danRer6 Histogram phastCons6way track"
+set xlabel " phastCons6way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+    #	<< happy emacs
+
+    display histo.png &
+