src/hg/makeDb/doc/hg19.txt 1.51
1.51 2009/10/26 17:04:20 hiram
Now working on phyloP business
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.50
retrieving revision 1.51
diff -b -B -U 4 -r1.50 -r1.51
--- src/hg/makeDb/doc/hg19.txt 21 Oct 2009 19:15:36 -0000 1.50
+++ src/hg/makeDb/doc/hg19.txt 26 Oct 2009 17:04:20 -0000 1.51
@@ -5359,9 +5359,9 @@
# all primates placentals
cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
-set PHASTBIN = /cluster/bin/phast/x86_64
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set c = $1
set f = $2
set len = $3
set cov = $4
@@ -5373,13 +5373,11 @@
set ssSrc = $cons
if (-s $cons/$grp/$grp.non-inf) then
ln -s $cons/$grp/$grp.mod $tmp
ln -s $cons/$grp/$grp.non-inf $tmp
- ln -s $ssSrc/ss/$c/$f.ss $tmp
- ln -s $cons/$grp/$grp.mod $tmp
- ln -s $cons/$grp/$grp.non-inf $tmp
+ ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
else
- ln -s $ssSrc/ss/$c/$f.ss $tmp
+ ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
@@ -5408,13 +5406,15 @@
# this template will serve for all runs
# root1 == chrom name, file1 == ss file name without .ss suffix
cat << '_EOF_' > template
#LOOP
-../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ bed/$(root1)/$(file1).bed}
+../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
# << happy emacs
+ ls -1S ../msa.split/2009-10-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
+
# Create parasol batch and run it
# run for all species
cd /hive/data/genomes/hg19/bed/multiz46way/cons
mkdir -p all
@@ -5753,15 +5753,14 @@
# Loaded 4785089 elements of size 6
# real 0m58.367s
# verify coverage
featureBits hg19 phastConsElements46wayPlacental
- # 146457699 bases of 2897316137 (5.055%) in intersection
- # 119635433 bases of 2881515245 (4.152%) in intersection
+ # 148816247 bases of 2897316137 (5.136%) in intersection
# --rho 0.3 --expected-length 45 --target-coverage 0.3
featureBits hg19 -enrichment refGene:cds phastConsElements46wayPlacental
- # refGene:cds 1.186%, phastConsElements46wayPlacental 5.055%,
- # both 0.847%, cover 71.42%, enrich 14.13x
+ # refGene:cds 1.186%, phastConsElements46wayPlacental 5.136%,
+ # both 0.864%, cover 72.85%, enrich 14.18x
featureBits hg19 -enrichment knownGene:cds phastConsElements46wayPlacental
# knownGene:cds 1.252%, phastConsElements46wayPlacental 5.055%,
# both 0.865%, cover 69.10%, enrich 13.67x
@@ -5862,8 +5861,70 @@
# << happy emacs
display histo.png &
+#########################################################################
+# phyloP conservation for 46-way (WORKING - 2009-10-21 - Hiram)
+#
+# Vertebrate, Placental, Primates
+#
+ # split SS files into 1M chunks, this business needs smaller files
+ # to complete
+
+ ssh swarm
+ mkdir /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
+ cd /hive/data/genomes/hg19/bed/multiz46way/consPhyloP
+ mkdir ss run.split
+ cd run.split
+
+ cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set MAF = /hive/data/genomes/hg19/bed/multiz46way/splitRun/maf/hg19_$c.maf
+set WINDOWS = /hive/data/genomes/hg19/bed/multiz46way/consPhyloP/run.split/ss/$c
+set WC = `cat $MAF | wc -l`
+set NL = `grep "^#" $MAF | wc -l`
+if ( -s $2 ) then
+ exit 0
+endif
+if ( -s $2.running ) then
+ exit 0
+endif
+
+date >> $2.running
+
+rm -fr $WINDOWS
+mkdir $WINDOWS
+pushd $WINDOWS > /dev/null
+if ( $WC != $NL ) then
+/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
+ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
+endif
+popd > /dev/null
+date >> $2
+rm -f $2.running
+'EOF'
+# << happy emacs
+
+ ls -1S -r ../../splitRun/maf | sed -e "s/.maf//; s/hg19_//" > maf.list
+
+ cat << '_EOF_' > template
+#LOOP
+doSplit.csh $(path1) {check out exists+ done/$(path1).done}
+#ENDLOOP
+'_EOF_'
+# << happy emacs
+
+ mkdir ss done
+ gensub2 maf.list single template jobList
+ para -ram=8g create jobList
+# Completed: 504 of 504 jobs
+# CPU time in finished jobs: 14486s 241.43m 4.02h 0.17d 0.000 y
+# IO & Wait Time: 306280s 5104.67m 85.08h 3.54d 0.010 y
+# Average job time: 636s 10.61m 0.18h 0.01d
+# Longest finished job: 1635s 27.25m 0.45h 0.02d
+# Submission to last job: 2965s 49.42m 0.82h 0.03d
+
#########################################################################
# LASTZ Zebrafish DanRer6 (DONE - 2009-07-08,10 - Hiram)
mkdir /hive/data/genomes/hg19/bed/lastzDanRer6.2009-07-08
@@ -6755,8 +6816,10 @@
'_EOF_'
# << happy emacs
chmod +x ./mkCmd.sh
+ # this doesn't work on any 32 Gb computer, requires much more memory
+ # turn it into a kluster job, see below
time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames hg19 stdin stdout \
mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
@@ -6855,16 +6918,12 @@
cd /cluster/data/hg19/bed/multiz46way/frames
find ./parts -type f | while read F
do
zcat ${F}
-done | sort -k1,1 -k2,2n | hgLoadMafFrames hg19 multiz46wayFrames stdin
- # real 5m47.840s
-
- find ./parts -type f | while read F
-do
- zcat ${F}
done | sort -k1,1 -k2,2n > multiz46wayFrames.bed
+ hgLoadMafFrames hg19 multiz46wayFrames stdin
+
featureBits -countGaps hg19 multiz46wayFrames.bed
# 57146632 bases of 3137161264 (1.822%) in intersection
# enable the trackDb entries: