src/hg/makeDb/doc/calJac3.txt 1.15
1.15 2010/06/04 23:29:32 chinhli
phastCons support
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac3.txt,v
retrieving revision 1.14
retrieving revision 1.15
diff -b -B -U 4 -r1.14 -r1.15
--- src/hg/makeDb/doc/calJac3.txt 26 May 2010 15:35:31 -0000 1.14
+++ src/hg/makeDb/doc/calJac3.txt 4 Jun 2010 23:29:32 -0000 1.15
@@ -1720,8 +1720,131 @@
ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf/up*.gz .
md5sum up*.gz >> md5sum.txt
+#########################################################################
+# Phylogenetic tree from 13-way (DONE 2010-05-28 - Chin)
+# We need one tree for all chroms
+
+ # use the /hive/data/genomes/calJac3/bed/multiz13way/run/calJac3.13way.maf
+ # to split
+
+ cd /hive/data/genomes/calJac3/bed/multiz13way/
+ mkdir mafSplit
+ cd mafSplit
+ mafSplit -byTarget -useFullSequenceName /dev/null . ../run/calJac3.13way.maf
+ # got 10771 mafs named after their chrom/scaff .maf
+ # although there are over 14205 chroms and scaffolds (wc -l
+ # chrom.sizes),
+ # some are too small or have nothing aligning.
+
+ mkdir /hive/data/genomes/calJac3/bed/multiz13way/4d
+ cd /hive/data/genomes/calJac3/bed/multiz13way/4d
+
+ # calJac3 does not have refGene; but has
+ # 237116 xenoRefGene and 28723 nscanGene.
+ # use nscanGene
+ hgsql calJac3 -Ne \
+ "select * from nscanGene;" \
+ | cut -f 2-20 > nscanGene.gp
+ wc -l nscanGene.gp
+ # 28723 nscanGene.gp
+ # make sure no redundant gene name
+ cat nscanGene.gp | awk '{print $1}' | sort | uniq | wc -l
+ # 28723
+
+ genePredSingleCover nscanGene.gp stdout | sort > nscanGeneNR.gp
+ wc -l nscanGeneNR.gp
+ # 28723 nscanGeneNR.gp
+
+ ssh memk
+ mkdir /hive/data/genomes/calJac3/bed/multiz13way/4d/run
+ cd /hive/data/genomes/calJac3/bed/multiz13way/4d/run
+ mkdir ../mfa
+
+# whole chrom mafs version, using new version of
+# uses memory-efficient version of phast, from Melissa Hubisz at Cornell
+# mjhubisz at gmail.com
+
+ cat << '_EOF_' > 4d.csh
+#!/bin/csh -fe
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set r = "/hive/data/genomes/calJac3/bed/multiz13way"
+set c = $1
+set infile = $r/mafSplit/$2
+set outfile = $3
+cd /scratch/tmp
+# 'clean' maf
+perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf
+awk -v C=$c '$2 == C {print}' $r/4d/nscanGene.gp > $c.gp
+set NL=`wc -l $c.gp| gawk '{print $1}'`
+if ("$NL" != "0") then
+ set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+ $PHASTBIN/msa_view --4d --features $c.gp --do-cats 3 -i MAF $c.maf -o SS > $c.ss
+ $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile
+else
+ echo "" > $r/4d/run/$outfile
+endif
+rm -f $c.gp $c.maf $c.ss
+'_EOF_'
+ # << happy emacs
+ chmod +x 4d.csh
+
+ ls -1S /hive/data/genomes/calJac3/bed/multiz13way/mafSplit/*.maf | \
+ egrep -E -v "chrUn" \
+ | sed -e "s#.*multiz13way/mafSplit/##" \
+ > maf.list
+
+ cat << '_EOF_' > template
+#LOOP
+4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ gensub2 maf.list single template stdout | tac > jobList
+ para create jobList
+ para try ... check ... push ... etc
+ para time
+ # Completed: 2604 of 2604 jobs
+ # CPU time in finished jobs: 2490s 41.49m 0.69h 0.03d 0.000 y
+ # IO & Wait Time: 8357s 139.29m 2.32h 0.10d 0.000 y
+ # Average job time: 4s 0.07m 0.00h 0.00d
+ # Longest finished job: 253s 4.22m 0.07h 0.00d
+ # Submission to last job: 668s 11.13m 0.19h 0.01d
+ # Manually run 4d.csh for chr11_ACFV01197442_random, result in do.log
+
+ # combine mfa files
+ ssh hgwdev
+ cd /hive/data/genomes/calJac3/bed/multiz13way/4d
+ # but first clean out junk 1-byte leftovers from above process.
+ # Only 24 (real chrom) out of 2623 files have real data.
+ cd mfa
+ find -type f -size 1c | xargs -iX rm X
+ find -type f -size 0c | xargs -iX rm X
+ cd /hive/data/genomes/calJac3/bed/multiz13way/4d
+
+ #want comma-less species.list
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/msa_view \
+ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \
+ > 4d.all.mfa
+
+ # fix order in case it helps:
+ #((calJac3,(tetNig2,gasAcu1)),(xenTro2,(mm9,hg19)))
+
+ # use phyloFit to create tree model (output is phyloFit.mod)
+ /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/phyloFit \
+ --EM --precision MED --msa-format FASTA --subst-mod REV \
+ --tree ../tree-commas.nh 4d.all.mfa
+ # Reading alignment from 4d.all.mfa ...
+ # Extracting sufficient statistics ...
+ # Compacting sufficient statistics ...
+ # Fitting tree model to 4d.all.mfa using REV ...
+ # Writing model to phyloFit.mod ...
+ # Done.
+
+
+
#############################################################################
# phastCons 13-way (working 2010-05-24 - Chin)
# was unable to split the full chrom MAF files, now working on the
@@ -1800,39 +1923,8 @@
# 4749
cd ../..
rm *.done
-
-======= Skip begin =======>
- # Estimate phastCons parameters
- # experimented with this as a parasol job on hgwdevnew to try a
- # number
- # of SS files. With a command of:
-
-/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
---tree "(((((((((((((((((calJac3,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
---out-root=$OUT/starting_tree
-
- # running over the input files ../ss/*/*.ss results to
-#.../genomes/calJac3/bed/multiz13way/cons/startingTree/result/*/starting-tree.mod
-
- # add up the C and G:
- find ./result -type f | xargs ls -rt | while read F
-do
- D=`dirname $F`
- echo -n `basename $D`" - "
- grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
-done
- # counting number of species seen in the maf file:
- find ./result -type f | xargs ls -rt | while read F
-do
- D=`dirname $F`
- echo -n `basename $D`" - "
- grep TREE $F | sed -e \
-"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g" | tr ',' '\n' | wc -l
-done
-<======= Skip end ==========
-
# Run phastCons
# This job is I/O intensive in its output files, beware where this
# takes place or do not run too many at once.
ssh swarm
@@ -1841,10 +1933,11 @@
# there are going to be several different phastCons runs using
# this same script. They trigger off of the current working
# directory
- # $cwd:t which is the "grp" in this script. It is one of:
- # all primates placentals
+ # $cwd:t which is the "grp" in this script. It is
+ # all
+
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
@@ -1859,32 +1952,14 @@
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons
set useGrp = "$grp.mod"
-if ( $cX == "chrX" ) then
- set useGrp = "$grp.chrX.mod"
-endif
-if (-s $cons/$grp/$grp.non-inf) then
- ln -s $cons/$grp/$grp.mod $tmp
- ln -s $cons/$grp/$grp.chrX.mod $tmp
- ln -s $cons/$grp/$grp.non-inf $tmp
- ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
-else
- ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
- ln -s $cons/$grp/$grp.mod $tmp
- ln -s $cons/$grp/$grp.chrX.mod $tmp
-endif
+ln -s $ssSrc/msa.split/2010-05-24/ss/$c/$f.ss $tmp
+ln -s $cons/$grp/$grp.mod $tmp
pushd $tmp > /dev/null
-if (-s $grp.non-inf) then
- $PHASTBIN/phastCons $f.ss $useGrp \
+$PHASTBIN/phastCons $f.ss $useGrp \
--rho $rho --expected-length $len --target-coverage $cov --quiet \
- --not-informative `cat $grp.non-inf` \
--seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
-else
- $PHASTBIN/phastCons $f.ss $useGrp \
- --rho $rho --expected-length $len --target-coverage $cov --quiet \
- --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
-endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
@@ -1905,138 +1980,131 @@
#ENDLOOP
'_EOF_'
# << happy emacs
- ls -1S ../msa.split/2009-10-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
+ ls -1S ../msa.split/2010-05-24/ss/*/* | sed -e 's/.ss$//' > ss.list
# Create parasol batch and run it
# run for all species
cd /hive/data/genomes/calJac3/bed/multiz13way/cons
mkdir -p all
cd all
- # Using the two different .mod tree
- cp -p ../../4dNoX/phyloFit.NoChrX.mod ./all.mod
- cp -p ../../4dX/phyloFit.chrX.mod ./all.chrX.mod
-
+ # Using the .mod tree
+ cp -p ../../4d/phyloFit.mod ./all.mod
+XXXX 06-03
gensub2 ../run.cons/ss.list single ../run.cons/template jobList
para -ram=8g create jobList
para try ... check ... push ... etc.
-# Completed: 581 of 581 jobs
-# CPU time in finished jobs: 41877s 697.95m 11.63h 0.48d
-# 0.001 y
-# IO & Wait Time: 39172s 652.87m 10.88h 0.45d
-# 0.001 y
-# Average job time: 139s 2.32m 0.04h 0.00d
-# Longest finished job: 329s 5.48m 0.09h 0.00d
-# Submission to last job: 2240s 37.33m 0.62h 0.03d
+Completed: 4749 of 4749 jobs
+CPU time in finished jobs: 7452s 124.21m 2.07h 0.09d 0.000 y
+IO & Wait Time: 69529s 1158.81m 19.31h 0.80d 0.002 y
+Average job time: 16s 0.27m 0.00h 0.00d
+Longest finished job: 100s 1.67m 0.03h 0.00d
+Submission to last job: 381s 6.35m 0.11h 0.00d
+
+# Completed: 4273 of 4273 jobs
+# CPU time in finished jobs: 2323s 38.72m 0.65h 0.03d
+# 0.000 y
+# IO & Wait Time: 68627s 1143.78m 19.06h 0.79d
+# 0.002 y
+# Average job time: 17s 0.28m 0.00h 0.00d
+# Longest finished job: 37s 0.62m 0.01h 0.00d
+# Submission to last job: 338s 5.63m 0.09h 0.00d
+
# create Most Conserved track
+ ssh hgwdev
cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
- cut -f1 ../../../../chrom.sizes | while read C
-do
- ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
- do
- cat ${D}/${C}*.bed
- done | sort -k1,1 -k2,2n \
- | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
-done > tmpMostConserved.bed
-/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
+ cat bed/*/*.bed | sort -k1,1 -k2,2n \
+ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
+ > tmpMostConserved.bed
+
+ /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
# load into database
- ssh hgwdev
- cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
- time nice -n +19 hgLoadBed calJac3 phastConsElements13way mostConserved.bed
- # Loaded 5163775 elements of size 6
- # real 1m44.439s
+ nice hgLoadBed calJac3 phastConsElements13way mostConserved.bed
+ # Reading mostConserved.bed
+ # Loaded 1725260 elements of size 5
+ # Sorted
+ # Creating table definition for phastConsElements13way
+ # Saving bed.tab
+ # Loading calJac3
# Try for 5% overall cov, and 70% CDS cov
featureBits calJac3 -enrichment refGene:cds phastConsElements13way
- # --rho 0.3 --expected-length 45 --target-coverage 0.3
- # refGene:cds 1.187%, phastConsElements13way 5.065%,
+ # refGene:cds 0.000%, phastConsElements13way 5.136%,
+ # both 0.000%, cover 72.10%, enrich 14.04x
+
+ # hg19 for comparison
+ # refGene:cds 1.187%, phastConsElements13way 5.065%,
# both 0.884%, cover 74.46%, enrich 14.70x
# Create merged posterior probability file and wiggle track data
# files
cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
mkdir downloads
- cat << '_EOF_' > phastCat.sh
-#!/bin/sh
-mkdir -p downloads
-cut -f1 ../../../../chrom.sizes | while read C
-do
- echo -n "${C} ... working ... "
- ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
- do
- cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
- done | gzip > downloads/${C}.phastCons13way.wigFix.gz
- echo "done"
-done
-'_EOF_'
- # << happy emacs
- chmod +x phastCat.sh
- time nice -n +19 ./phastCat.sh
- # real 30m2.623s
+ cat pp/*/*.pp | sed "s/start=0/start=1/" \
+ | gzip > downloads/phastCons13way.wigFix.gz
+
# encode those files into wiggle data
- zcat downloads/*.wigFix.gz \
+ zcat downloads/phastCons13way.wigFix.gz \
| wigEncode stdin phastCons13way.wig phastCons13way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00
- # real 18m37.881s
+ #Ignore the following comments
+ # Found start=0 at line 1192229703, the first chrom position is 1, not 0
+ # So we have the same problem as danRer6, may need to fix it before reach
+ # this step. Use:
+ # zcat downloads/phastCons13way.wigFix.gz | grep start=0
+ # found only one problem:
+ # fixedStep chrom=chr20_GL285852_random start=0 step=1
+ # Back and fixed pp/chr20_GL285852_random/ first line
+ # fixedStep chrom=chr20_GL285852_random start=0 step=1
+ # or we can use the filter sed "s/start=0/start=1/" in the step above
+
+
+ # Converted stdin, upper limit 1.00, lower limit 0.00
du -hsc *.wi?
- # 2.7G phastCons13way.wib
- # 271M phastCons13way.wig
- # 3.0G total
-
- # encode into a bigWig file:
- # (warning wigToBigWig process grows to about 36 Gb)
- # in bash, to avoid the 32 Gb memory limit:
-sizeG=188743680
-export sizeG
-ulimit -d $sizeG
- zcat downloads/*.wigFix.gz \
- | wigToBigWig stdin ../../../../chrom.sizes phastCons13way.bw
- # real 52m36.142s
-# -rw-rw-r-- 1 21667535139 Oct 20 13:59 phastCons13way.bw
- mkdir /gbdb/calJac3/bbi
- ln -s `pwd`/phastCons13way.bw /gbdb/calJac3/bbi
- # if you wanted to use the bigWig file, loading bigWig table:
- hgsql calJac3 -e 'drop table if exists phastCons13way; \
- create table phastCons13way (fileName varchar(255) not null); \
- insert into phastCons13way values
- ("/gbdb/calJac3/bbi/phastCons13way.bw");'
+ # 2.2G phastCons13way.wib
+ # 257M phastCons13way.wig
+ # 2.4G total
# Load gbdb and database with wiggle.
- ssh hgwdev
- cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
ln -s `pwd`/phastCons13way.wib /gbdb/calJac3/multiz13way/phastCons13way.wib
- time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/calJac3/multiz13way calJac3 \
+ nice hgLoadWiggle -pathPrefix=/gbdb/calJac3/multiz13way calJac3 \
phastCons13way phastCons13way.wig
- # real 1m45.381s
+ # Connected to database calJac3 for track phastCons13way
+ # Creating wiggle table definition in calJac3.phastCons13way
+ # Saving wiggle.tab
+ # Loading calJac3
+
+
+ # use to set trackDb.ra entries for wiggle min and max
wigTableStats.sh calJac3 phastCons13way
-# db.table min max mean count sumData
-# calJac3.phastCons13way 0 1 0.103653 2845303719 2.94924e+08
-# stdDev viewLimits
-# 0.230184 viewLimits=0:1
+ # # db.table min max mean count sumData stdDev viewLimits
+ # calJac3.phastCons13way 0 1 0.128672 2299091936 2.95829e+08 0.24795
+ # viewLimits=0:1
# Create histogram to get an overview of all the data
- ssh hgwdev
- cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
- time nice -n +19 hgWiggle -doHistogram -db=calJac3 \
+ hgWiggle -doHistogram -db=calJac3 \
-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
- phastCons13way > histogram.data 2>&1
- # real 7m37.212s
+ phastCons13way >& histogram.data
+XXXX 06-04 PM stopped here
# create plot of histogram:
+
+
+#orig set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
cat << '_EOF_' | gnuplot > histo.png
-set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
-set title " Human Hg19 Histogram phastCons13way track"
+set title " Zebrafish calJac3 Histogram phastCons13way track"
set xlabel " phastCons13way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]