src/hg/makeDb/doc/monDom5.txt 1.16
1.16 2009/09/28 20:49:09 aamp
Added multiz/phastCons make instructions.
Index: src/hg/makeDb/doc/monDom5.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/monDom5.txt,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 4 -r1.15 -r1.16
--- src/hg/makeDb/doc/monDom5.txt 20 Sep 2009 17:16:45 -0000 1.15
+++ src/hg/makeDb/doc/monDom5.txt 28 Sep 2009 20:49:09 -0000 1.16
@@ -988,4 +988,456 @@
svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13
see doc/builds.txt for specific details.
############################################################################
+
+ssh hgwdev
+mkdir /hive/data/genomes/monDom5/bed/multiz9way
+cd /hive/data/genomes/monDom5/bed/multiz9way
+## Build the tree by pruning out the 8 species from the recent 44way,
+## changing monDom4 to monDom5, and adding macEug1
+## (Wallaby placement deduced from (Margulies et al.,PNAS 2004) ).
+/cluster/bin/phast/tree_doctor \
+ --prune-all-but=hg18,mm9,canFam2,ornAna1,galGal3,xenTro2,danRer5,monDom4 \
+ /hive/data/genomes/hg18/bed/multiz44way/44way.nh | \
+ sed 's/:0\.[[:digit:]]\+/ /g; s/,//g; s/;//; s/\ )/)/g; s/monDom4/(monDom5 macEug1)/' \
+ > tree.nh
+sed 's/(//g; s/)//g' tree.nh > species.lst
+
+## Make per-chrom maf links
+mkdir mafLinks
+cd mafLinks
+bash
+for db in hg18 mm9 canFam2 macEug1 ornAna1 galGal3 xenTro2 danRer5; do
+ mkdir $db; pushd $db;
+ if [ -e /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet ]; then
+ ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafSynNet/* .
+ else
+ ln -s /hive/data/genomes/monDom5/bed/blastz.${db}/mafNet/* .
+ fi
+ popd
+done
+cd ../
+
+## Split mafs up
+mkdir mafSplit
+mafSplitPos -minGap=50000 monDom5 10 mafSplit.bed
+for db in `ls -1 mafLinks`; do
+ mkdir mafSplit/$db
+ pushd mafSplit/$db
+ mafSplit ../../mafSplit.bed monDom5_ ../../mafLinks/${db}/chr*.maf.gz \
+ -verbose=2
+ popd
+done
+
+## Make file list for cluster run
+cd mafSplit/canFam2/
+find . -type f | sort -u | sed -e "s#./##" > ../../9-way.split.lst
+## Double-check the file sets are all the same in each database subdir
+
+## Cluster run on split mafs
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way
+mkdir -p multizRun/run/penn multizRun/maf
+cd multizRun/run/
+cp -p /cluster/bin/penn/multiz.2008-11-25/{multiz,maf_project,autoMZ} penn/
+cat > autoMultiz.csh << '_EOF_'
+#!/bin/csh -ef
+set db = monDom5
+set c = $1
+set result = $2
+set run = `pwd`
+set tmp = $run/tmp/$db/multiz.$c
+set pairs = /hive/data/genomes/monDom5/bed/multiz9way/mafSplit
+/bin/rm -fr $tmp
+/bin/mkdir -p $tmp
+/bin/cp -p ../../tree.nh ../../species.lst $tmp
+pushd $tmp
+foreach s (`sed -e "s/ $db//" species.lst`)
+ set in = $pairs/$s/$c.maf
+ set out = $db.$s.sing.maf
+ if (-e $in.gz) then
+ /bin/zcat $in.gz > $out
+ else if (-e $in) then
+ ln -s $in $out
+ else
+ echo "##maf version=1 scoring=autoMZ" > $out
+ endif
+end
+set path = ($run/penn $path); rehash
+$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
+popd
+/bin/rm -f $result
+/bin/cp -p $tmp/$c.maf $result
+/bin/rm -fr $tmp
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp/$db
+/bin/rmdir --ignore-fail-on-non-empty $run/tmp
+'_EOF_'
+# << emacs
+chmod +x autoMultiz.csh
+cat << '_EOF' > gsub
+#LOOP
+./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/monDom5/bed/multiz9way/multizRun/maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+# << emacs
+para -ram=4g -cpu=2 create jobList
+para push
+para time
+#Completed: 332 of 332 jobs
+#CPU time in finished jobs: 17639s 293.98m 4.90h 0.20d 0.001 y
+#IO & Wait Time: 46958s 782.64m 13.04h 0.54d 0.001 y
+#Average job time: 195s 3.24m 0.05h 0.00d
+#Longest finished job: 1693s 28.22m 0.47h 0.02d
+#Submission to last job: 1706s 28.43m 0.47h 0.02d
+
+## Sew up the mafs into one file
+ssh hgwdev
+cd /hive/data/genomes/hg18/bed/multiz44way/multizRun
+mkdir ../maf
+ls -1 maf | sed 's/monDom5_//; s/\..*//' | sort -u | while read chrom; do
+ head -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u > ../maf/${chrom}.maf
+ grep -h "^#" maf/monDom5_${chrom}.*.maf | egrep -v "maf version=1|eof maf" | sed -e "s#${chrom}.[0-9][0-9]*#${chrom}#g; s#_MZ_[^ ]* # #g;" | sort -u >> ../maf/${chrom}.maf
+ grep -h -v "^#" `ls maf/monDom5_${chrom}.*.maf | sort -t. -k2,2n` >> ../maf/${chrom}.maf
+ tail -q -n 1 maf/monDom5_${chrom}.*.maf | sort -u >> ../maf/${chrom}.maf
+ echo done with $chrom
+done
+cd ../
+rm -rf multizRun/maf/ mafSplit/
+
+## gap annotation
+mkdir -p anno/{maf,run}
+cd anno/
+for db in `cat ../species.lst`; do
+ dbDir="/hive/data/genomes/${db}"
+ if [ ! -f ${dbDir}/${db}.N.bed ]; then
+ echo "creating ${db}.N.bed"
+ twoBitInfo -nBed ${dbDir}/${db}.2bit ${dbDir}/${db}.N.bed
+ else
+ ls -og ${dbDir}/${db}.N.bed
+ fi
+done
+cd run/
+for db in `sed -e "s/monDom5 //" ../../species.lst`; do
+ echo "${db} "
+ ln -s /hive/data/genomes/${db}/${db}.N.bed ${db}.bed
+ echo ${db}.bed >> nBeds
+ ln -s /hive/data/genomes/${db}/chrom.sizes ${db}.len
+ echo ${db}.len >> sizes
+done
+ssh hgwdevnew
+cd /hive/data/genomes/monDom5/bed/multiz9way/run
+for c in `ls -1 ../../maf | sed -e "s/.maf//"`; do
+ echo starting $c
+ time nice mafAddIRows -nBeds=nBeds ../../maf/${c}.maf /hive/data/genomes/monDom5/monDom5.2bit ../maf/${c}.maf
+done
+exit # hgwdevnew
+cd ../../
+
+## quality information (only available for 6 of 9 species including monDom5)
+mkdir -p quals/{maf,qa}
+cd quals/qa/
+ln -s /hive/data/genomes/monDom5/monDom5.{qac,qdx} .
+ln -s /hive/data/genomes/hg18/bed/multiz44way/quals/allInOnePlace/{galGal3,canFam2,ornAna1}.* .
+qaToQac /hive/data/genomes/rn5/baylor/Rnor4.chroms.fa.qual.gz \
+ rn5.noIdx.qac
+qacAddGapIdx /hive/data/genomes/rn5/baylor/Rnor4.scaffold_chr.agp \
+ rn5.noIdx.qac rn5.qac rn5.qdx
+rm rn5.noIdx.qac
+qaToQac /hive/data/genomes/macEug1/baylor/macEug1.contigs.onlyInAgp.qa.gz \
+ macEug1.noIdx.qac
+qacAddGapIdx /hive/data/genomes/macEug1/baylor/macEug1.agp macEug1.noIdx.qac \
+ macEug1.qac macEug1.qdx
+for db in canFam2 galGal3 macEug1 monDom5 ornAna1 rn5; do
+ echo $db | awk 'BEGIN{OFS="\t"}{print $1, "/hive/data/genomes/monDom5/bed/multiz9way/quals/qa";}';
+done > quals.lst
+cat << '_EOF_' > gsub
+#LOOP
+mafAddQRows quals.lst $(path1) {check out line+ maf/$(file1) }
+#ENDLOOP
+'_EOF_'
+# << emacs
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way/quals
+mkdir inMaf
+cd inMaf/
+ln -s ../../anno/maf/* .
+cd ../
+ls -1 inMaf/* > in.lst
+gensub2 in.lst single gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+#Completed: 11 of 11 jobs
+#CPU time in finished jobs: 5419s 90.32m 1.51h 0.06d 0.000 y
+#IO & Wait Time: 19745s 329.08m 5.48h 0.23d 0.001 y
+#Average job time: 2288s 38.13m 0.64h 0.03d
+#Longest finished job: 5196s 86.60m 1.44h 0.06d
+#Submission to last job: 5206s 86.77m 1.45h 0.06d
+cd ../
+
+## Gene frames
+mkdir -p genes/{genePreds,inMaf,chrFrames}
+cd genes/genePreds/
+## human/mouse use known genes
+for db in hg18 mm9; do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${db} | \
+ genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz
+done
+## monDom5/canFam2/ornAna1/galGal3/xenTro2/danRer5 use ensGene
+for db in monDom5 canFam2 ornAna1 galGal3 xenTro2 danRer5; do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" $db | \
+ genePredSingleCover stdin stdout | gzip -c > ${db}.gp.gz
+done
+cd ../inMaf/
+ln -s ../../quals/maf/* .
+cd ../
+ls -1 inMaf/ | sed 's/.maf//' > chr.lst
+ls -1 genePreds/ | sed 's/.gp.gz//' > gp.lst
+cat << '_EOF_' > mafFrames.csh
+#!/bin/csh -fe
+set c = $1
+set g = $2
+cat inMaf/${c}.maf | genePredToMafFrames monDom5 stdin stdout \
+ ${g} genePreds/${g}.gp.gz | gzip -c > chrFrames/${c}.${g}.mafFrames.gz
+'_EOF_'
+cat << '_EOF_' > gsub
+#LOOP
+./mafFrames.csh $(root1) $(root2) {check out exists+ chrFrames/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way/genes
+gensub2 chr.lst gp.lst gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+#Completed: 88 of 88 jobs
+#CPU time in finished jobs: 1610s 26.83m 0.45h 0.02d 0.000 y
+##IO & Wait Time: 3101s 51.69m 0.86h 0.04d 0.000 y
+#Average job time: 54s 0.89m 0.01h 0.00d
+#Longest finished job: 95s 1.58m 0.03h 0.00d
+#Submission to last job: 211s 3.52m 0.06h 0.00d
+exit # [exit swarm]
+find chrFrames -type f | while read frameFile; do
+ zcat ${frameFile}
+done | sort -k1,1 -k2,2n > multiz9wayFrames.bed
+hgLoadMafFrames monDom5 multiz9wayFrames{,.bed}
+
+## make downloads
+mkdir -p download/maf
+cd download/maf
+cp -p ../../quals/maf/chr*.maf .
+gzip --rsyncable *.maf
+md5sum *.gz > md5sum.txt
+mkdir -p /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf
+ln -s `pwd -P`/* /usr/local/apache/htdocs/goldenPath/monDom5/multiz9way/maf
+cd ../../
+
+## load mafs into database
+cd anno/maf
+mkdir -p /gbdb/monDom5/multiz9way/maf
+ln -s `pwd -P`/*.maf /gbdb/monDom5/multiz9way/maf
+pushd /data/tmp
+nice -n +19 hgLoadMaf -pathPrefix=/gbdb/monDom5/multiz9way/maf monDom5 multiz9way
+nice -n +19 cat /gbdb/monDom5/multiz9way/maf/*.maf | hgLoadMafSummary monDom5 multiz9waySummary stdin
+popd
+cd ../../
+
+## phastCons
+## 1. collect all mafs with all nine species in components.
+mkdir cons
+cd cons/
+for chrom in `ls -1 ../initialMafs/`; do
+ for db in `cat ../species.lst`; do
+ maf=../initialMafs/$chrom
+ if [ -e withAllNine.$chrom ]; then
+ maf=withAllNine.$chrom
+ fi
+ mafFilter -needComp=$db $maf > tmp.maf
+ mv tmp.maf withAllNine.$chrom
+ done
+ echo Done with withAllNine.$chrom
+done
+## 2. split into 10 Mbase pieces and generate .ss files
+mkdir msa.split ss
+cd msa.split/
+ln -s ../../../../monDom5.2bit
+cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set maf = /hive/data/genomes/monDom5/bed/multiz9way/cons/maf/withAllNine.${c}.maf
+set chromDir = /hive/data/genomes/monDom5/bed/multiz9way/cons/ss/$c
+rm -fr $chromDir
+mkdir $chromDir
+pushd $chromDir > /dev/null
+twoBitToFa -seq=$c monDom5.2bit monDom5.${c}.fa
+msa_split $maf -i MAF \
+ -M monDom5.${c}.fa -o SS -r $chromDir/$c -w 20000000,0 -I 100 -B 5000
+rm -f monDom5.${c}.fa
+popd > /dev/null
+date >> ${c}.done
+'_EOF_'
+ # << happy emacs
+ chmod +x doSplit.csh
+ cat << '_EOF_' > template
+#LOOP
+./doSplit.csh $(root1) {check out line+ $(root1).done}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa.split
+ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst
+gensub2 chr.lst single gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+#Completed: 11 of 11 jobs
+#CPU time in finished jobs: 805s 13.42m 0.22h 0.01d 0.000 y
+#IO & Wait Time: 367s 6.11m 0.10h 0.00d 0.000 y
+#Average job time: 107s 1.78m 0.03h 0.00d
+#Longest finished job: 211s 3.52m 0.06h 0.00d
+#Submission to last job: 220s 3.67m 0.06h 0.00d
+cd ../
+## another run, full chroms
+mkdir ss.chrom msa_view.run
+cd msa_view.run
+ln -s ../../../../monDom5.2bit
+cat << '_EOF_' > doChrom.csh
+#!/bin/csh -ef
+set c = $1
+twoBitToFa -seq=$c monDom5.2bit ${c}.fa
+msa_view ../maf/withAllNine.${c}.maf -i MAF -M ${c}.fa -o SS > ../ss.chrom/${c}.ss
+rm ${c}.fa
+'_EOF_'
+ # << emacs
+cat << '_EOF_' > gsub
+#LOOP
+./doChrom.csh $(root1) {check out line+ ../ss.chrom/$(root1).ss }
+#ENDLOOP
+'_EOF_'
+ls -1 ../maf/ | sed 's/withAllNine.//; s/.maf//' > chr.lst
+gensub2 chr.lst single gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+#Completed: 11 of 11 jobs
+#CPU time in finished jobs: 769s 12.81m 0.21h 0.01d 0.000 y
+#IO & Wait Time: 426s 7.10m 0.12h 0.00d 0.000 y
+#Average job time: 109s 1.81m 0.03h 0.00d
+#Longest finished job: 211s 3.52m 0.06h 0.00d
+#Submission to last job: 217s 3.62m 0.06h 0.00d
+mkdir phyloFit.run
+cd phyloFit.run/
+ls -1 ../ss.chrom/* > chr.lst
+cat << '_EOF_' > gsub
+#LOOP
+/cluster/bin/phast/x86_64/phyloFit --msa-format SS --tree start.tree $(path1) { check out line+ ../trees/$(root1).mod }
+#ENDLOOP
+'_EOF_'
+gensub2 chr.lst single gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+cd ../trees/
+phyloBoot --read-mods chr{1,2,3,4,5,6,7,8}.mod --output-average chroms1-8.mod
+
+######## Let's back up a minute
+
+exit # back to hgwdev
+mkdir msa_split.chr1
+cd msa_split.chr1/
+twoBitToFa ../../../../monDom5.2bit:chr1 chr1.fa
+time msa_split ../maf/withAllNine.chr1.maf --refseq chr1.fa --in-format MAF \
+ --windows 100000000,1000 --out-format SS \
+ --between-blocks 5000 --out-root ssChr1
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way/cons/phastCons.estimateTrees.run
+ls -1 ../msa_split.chr1/* > ss.lst
+grep BACKGROUND ../trees/chroms1-8.mod | awk '{printf "%0.3f\n", $3 + $4;}'
+#0.383
+cat << '_EOF_' > gsub
+#LOOP
+/cluster/bin/phast/x86_64/phastCons --gc 0.383 --target-coverage 0.17 --expected-length 12 --estimate-trees --no-post-probs trees/$(root1) $(path1) ../trees/chroms1-8.mod
+#ENDLOOP
+'_EOF_'
+gensub2 ss.lst single gsub jobList
+para -ram=8g -cpu=4 create jobList
+para push
+para time
+#Completed: 8 of 8 jobs
+#CPU time in finished jobs: 51655s 860.91m 14.35h 0.60d 0.002 y
+#IO & Wait Time: 950s 15.84m 0.26h 0.01d 0.000 y
+#Average job time: 6576s 109.59m 1.83h 0.08d
+#Longest finished job: 8998s 149.97m 2.50h 0.10d
+#Submission to last job: 9217s 153.62m 2.56h 0.11d
+
+## OK now average these
+cd ../
+ls trees/*.cons.mod > cons.txt
+phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod
+ls trees/*.noncons.mod > noncons.txt
+phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod
+
+## Now break the big mafs into .ss files
+
+cd ../
+mkdir msa_split.bigMafs ss.splits
+cd msa_split.bigMafs/
+ls -1 ../../initialMafs/* > mafs.lst
+cat << '_EOF_' > msa.sh
+#!/bin/bash
+chr=`basename $1 .maf`
+mkdir -p $chr
+twoBitToFa ../../../../monDom5.2bit:$chr ${chr}.fa
+/cluster/bin/phast/x86_64/msa_split $1 -M ${chr}.fa -i MAF -o SS -w 10000000,1000 -B 5000 -r ${chr}/ss
+rm ${chr}.fa
+'_EOF_'
+chmod +x msa.sh
+cat << '_EOF_' > gsub
+#LOOP
+./msa.sh $(path1) { check exists $(root1) }
+#ENDLOOP
+'_EOF_'
+ssh swarm
+cd /hive/data/genomes/monDom5/bed/multiz9way/cons/msa_split.bigMafs
+gensub2 mafs.lst single gsub spec
+para create spec
+para push
+para time
+
+## Run phastCons on the many .ss files
+
+mkdir ../phast.run
+cd ../phast.run/
+find ../msa_split.bigMafs/ -type f -name '*.ss' > ss.lst
+ln -s ../phastCons.estimateTrees.run/*.mod .
+cat << '_EOF_' > phast.sh
+#!/bin/bash
+ss=$1
+wig=$2
+bed=$3
+mkdir -p `dirname $wig`
+mkdir -p `dirname $bed`
+/cluster/bin/phast/x86_64/phastCons --target-coverage 0.17 --expected-length 12 --most-conserved $bed --score $ss ave.cons.mod,ave.noncons.mod > $wig
+'_EOF_'
+chmod +x phast.sh
+cat << '_EOF_' > gsub
+#LOOP
+./phast.sh $(path1) wig/$(lastDir1)/$(root1).wig bed/$(lastDir1)/$(root1).bed
+#ENDLOOP
+'_EOF_'
+gensub2 ss.lst single gsub spec
+para create spec
+para push
+para time
+#Completed: 367 of 367 jobs
+#CPU time in finished jobs: 9131s 152.18m 2.54h 0.11d 0.000 y
+#IO & Wait Time: 6169s 102.82m 1.71h 0.07d 0.000 y
+#Average job time: 42s 0.69m 0.01h 0.00d
+#Longest finished job: 70s 1.17m 0.02h 0.00d
+#Submission to last job: 343s 5.72m 0.10h 0.00d
+
+## Stitch up wig files, make .wibs, load in DB
+
+