src/hg/makeDb/doc/sacCer2.txt 1.4
1.4 2009/02/12 19:50:19 hiram
Finished 7-way conservation business
Index: src/hg/makeDb/doc/sacCer2.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/sacCer2.txt,v
retrieving revision 1.3
retrieving revision 1.4
diff -b -B -U 4 -r1.3 -r1.4
--- src/hg/makeDb/doc/sacCer2.txt 10 Feb 2009 23:56:21 -0000 1.3
+++ src/hg/makeDb/doc/sacCer2.txt 12 Feb 2009 19:50:19 -0000 1.4
@@ -634,4 +634,293 @@
rm -rf blastOut
#end tblastn
###########################################################################
+## 7-Way Multiz (DONE - 2009-02-11 - Hiram)
+ mkdir /hive/data/genomes/sacCer2/bed/multiz7way
+ cd /hive/data/genomes/sacCer2/bed/multiz7way
+
+ # using the phylogenetic tree as mentioned at:
+ # http://www.genetics.wustl.edu/saccharomycesgenomes/yeast_phylogeny.html
+ # and from Jim's note: genomes/sacCer1/bed/otherYeast/README
+ # ((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)
+ echo "((((((sacCer2 sacPar) sacMik) sacKud) sacBay) sacCas) sacKlu)" \
+ > tree.nh
+ echo sacCer2 sacPar sacMik sacKud sacBay sacCas sacKlu > species.list
+ mkdir mafLinks
+ for S in sacPar sacMik sacKud sacBay sacCas sacKlu
+do
+ mkdir mafLinks/${S}
+ cd mafLinks/${S}
+ ln -s ../../../otherYeast/align/mafNet.${S}/*.maf.gz .
+ cd ../..
+done
+ find -L ./mafLinks -type f | xargs -L 1 basename \
+ | sed -e "s/.maf.gz//" | sort -u > maf.list
+
+ mkdir maf run
+ cd run
+ mkdir penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/multiz penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/maf_project penn
+ cp -p /cluster/bin/penn/multiz.2008-11-25/autoMZ penn
+
+ # set the db and pairs directories here
+ cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+set db = sacCer2
+set c = $1
+set result = $2
+set run = `pwd`
+set tmp = $run/tmp/$db/multiz.$c
+set pairs = /hive/data/genomes/sacCer2/bed/multiz7way/mafLinks
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p ../tree.nh ../species.list $tmp
+pushd $tmp
+foreach s (`sed -e "s/ $db//" species.list`)
+ set in = $pairs/$s/$c.maf
+ set out = $db.$s.sing.maf
+ if (-e $in.gz) then
+ /bin/zcat $in.gz > $out
+ else if (-e $in) then
+ ln -s $in $out
+ else
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+end
+set path = ($run/penn $path); rehash
+$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
+popd
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp
+'_EOF_'
+# << happy emacs
+ chmod +x autoMultiz.csh
+
+ cat << '_EOF_' > template
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ ../maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
+
+ gensub2 ../maf.list single template jobList
+ para create jobList
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 710s 11.83m 0.20h 0.01d 0.000 y
+# IO & Wait Time: 127s 2.12m 0.04h 0.00d 0.000 y
+# Average job time: 47s 0.78m 0.01h 0.00d
+# Longest finished job: 105s 1.75m 0.03h 0.00d
+# Submission to last job: 144s 2.40m 0.04h 0.00d
+# Estimated complete: 0s 0.00m 0.00h 0.00d
+
+ # load MAF tables
+ ssh hgwdev
+ mkdir -p /gbdb/sacCer2/multiz7way/maf
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/maf
+ ln -s `pwd`/*.maf /gbdb/sacCer2/multiz7way/maf
+
+ # allow the temporary multiz7way.tab file to be created in tmp
+ cd /data/tmp
+ time nice -n +19 hgLoadMaf \
+ -pathPrefix=/gbdb/sacCer2/multiz7way/maf sacCer2 multiz7way
+ # real 0m4.903s
+ # Loaded 38795 mafs in 18 files from /gbdb/sacCer2/multiz7way/maf
+ # load summary table
+ time nice -n +19 cat /gbdb/sacCer2/multiz7way/maf/*.maf \
+ | hgLoadMafSummary sacCer2 -mergeGap=1500 multiz7waySummary stdin
+ # real 0m5.536
+ # Created 3037 summary blocks from 73806 components and 38795 mafs
+
+#########################################################################
+## phastCons conservation (DONE - 2009-02-11 - Hiram)
+ # Create SS files for each chromosome:
+ mkdir /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ mkdir SS
+
+ # split up alignments; no need to use cluster here
+ MAF=/cluster/data/sacCer1/bed/otherYeast/align/mafOut
+ FA=/cluster/data/sacCer1
+ CHROMS="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 M"
+ mkdir -p SS
+ for C in `(cd ../maf; ls *.maf | sed -e "s/.maf//")`
+do
+ echo $C
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ ../maf/${C}.maf -M ../../otherYeast/align/sacCer2/${C}.fa -i MAF \
+ -O sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
+ -o SS > SS/${C}.ss
+done
+
+ # produce rough starting model
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ -i SS --aggregate sacCer2,sacKud,sacMik,sacPar,sacBay,sacCas,sacKlu \
+ -o SS -z SS/*.ss > all.ss
+
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloFit \
+ -i SS all.ss \
+ --tree "((((((sacCer2,sacPar),sacMik),sacKud),sacBay),sacCas),sacKlu)" \
+ -o starting-tree
+
+ # (cheat the branch lengths up a bit, since this represents an
+ # average of conserved and nonconserved sites; we want to
+ # initialize for nonconserved)
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/tree_doctor \
+ --scale 2 starting-tree.mod > tmp.mod
+ mv tmp.mod starting-tree.mod
+
+ # also get average GC content of aligned regions
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/msa_view \
+ -i SS all.ss --summary-only
+#descrip. A C G T G+C length all_gaps some_gaps
+#all.ss 0.3056 0.1950 0.1948 0.3047 0.3898 13130139 0 2122279
+
+ # save some I/O and space
+ gzip SS/*.ss
+
+ # now set up cluster job to estimate model parameters. See
+ # other make{hg18|mm9}.doc for add'l details
+
+ cat << '_EOF_' > doEstimate.sh
+#!/bin/sh
+zcat SS/$1.ss.gz | \
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons \
+ - starting-tree.mod --gc 0.3898 --nrates 1,1 --no-post-probs \
+ --ignore-missing --expected-lengths 75 --target-coverage 0.5 \
+ --quiet --log $2 --estimate-trees $3
+'_EOF_'
+ # << happy emacs
+ chmod +x doEstimate.sh
+
+ rm -fr LOG TREES
+ mkdir LOG TREES
+ cat << '_EOF_' > template
+#LOOP
+doEstimate.sh $(root1) {check out line+ LOG/$(root1).log} TREES/$(root1)
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ (cd SS; ls *.ss.gz) | sed -e "s/.ss.gz//" > chr.list
+ gensub2 chr.list single template jobList
+ para create jobList
+ para push
+ para time
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 8152s 135.86m 2.26h 0.09d 0.000 y
+# IO & Wait Time: 323s 5.39m 0.09h 0.00d 0.000 y
+# Average job time: 471s 7.85m 0.13h 0.01d
+# Longest finished job: 889s 14.82m 0.25h 0.01d
+# Submission to last job: 895s 14.92m 0.25h 0.01d
+
+ # Now combine parameter estimates.
+ ls TREES/*.cons.mod > cons.txt
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
+ --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
+ ls TREES/*.noncons.mod > noncons.txt
+ /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phyloBoot \
+ --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
+
+ # Now set up cluster job for computing consevation scores and
+ # predicted elements
+ cat << '_EOF_' > doPhastCons.sh
+#!/bin/sh
+
+mkdir -p POSTPROBS ELEMENTS
+chr=$1
+out=$2
+tmpFile=/scratch/tmp/phastCons.$chr
+rm -f $tmpFile
+zcat SS/$chr.ss.gz \
+ | /cluster/bin/phast.build/fromAdam/phast.2008-12-18/bin/phastCons - \
+ ave.cons.mod,ave.noncons.mod --expected-lengths 75 \
+ --target-coverage 0.5 --quiet --seqname $chr --idpref $chr \
+ --viterbi ELEMENTS/$pref.bed --score --require-informative 0 > $tmpFile
+gzip -c $tmpFile > POSTPROBS/$chr.pp.gz
+rm $tmpFile
+'_EOF_'
+ # << happy emacs
+ chmod u+x doPhastCons.sh
+
+ cat << '_EOF_' > run.template
+#LOOP
+doPhastCons.sh $(root1) {check out exists+ POSTPROBS/$(root1).pp.gz} {check out line+ ELEMENTS/$(root1).bed}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 chr.list single run.template run.jobList
+ rm -fr POSTPROBS ELEMENTS
+ para create run.jobList
+ para push
+ para time
+# Completed: 18 of 18 jobs
+# CPU time in finished jobs: 78s 1.30m 0.02h 0.00d 0.000 y
+# IO & Wait Time: 658s 10.97m 0.18h 0.01d 0.000 y
+# Average job time: 41s 0.68m 0.01h 0.00d
+# Longest finished job: 53s 0.88m 0.01h 0.00d
+# Submission to last job: 57s 0.95m 0.02h 0.00d
+
+ # set up phastConsElements track
+ cat ELEMENTS/*.bed | sort -k1,1 -k2,2n | \
+ awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
+ /cluster/bin/scripts/lodToBedScore /dev/stdin \
+ > phastConsElements7way.bed
+
+ hgLoadBed sacCer2 phastConsElements7way phastConsElements7way.bed
+ featureBits sacCer2 phastConsElements7way
+# 7762587 bases of 12162995 (63.821%) in intersection
+ # previously this was:
+ featureBits sacCer1 phastConsElements
+# 7517116 bases of 12156302 (61.837%) in intersection
+
+ mkdir ../downloads
+ mkdir ../downloads/phastCons7way
+ for C in `cat chr.list`
+do
+ cp -p POSTPROBS/${C}.pp.gz \
+ ../downloads/phastCons7way/sacCer2.${C}.wigFixed.gz
+done
+
+ # encode those files into wiggle data
+ zcat ../downloads/phastCons7way/*.wigFixed.gz \
+ | wigEncode stdin phastCons7way.wig phastCons7way.wib
+ # Converted stdin, upper limit 1.00, lower limit 0.00
+
+ # Load gbdb and database with wiggle.
+ cd /hive/data/genomes/sacCer2/bed/multiz7way/phastCons
+ ln -s `pwd`/phastCons7way.wib /gbdb/sacCer2/multiz7way/phastCons7way.wib
+ hgLoadWiggle -pathPrefix=/gbdb/sacCer2/multiz7way sacCer2 \
+ phastCons7way phastCons7way.wig
+
+ # Create histogram to get an overview of all the data
+ hgWiggle -doHistogram \
+ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+ -db=sacCer2 phastCons7way > histogram.data 2>&1
+
+ # create plot of histogram:
+
+ cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Yeast sacCer2 Histogram phastCons7way track"
+set xlabel " phastCons7way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+ # << happy emacs
+
+ display histo.png &