src/hg/makeDb/doc/calJac3.txt 1.14

1.14 2010/05/26 15:35:31 chinhli
htdocs-hgdownload change
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac3.txt,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 4 -r1.13 -r1.14
--- src/hg/makeDb/doc/calJac3.txt	19 May 2010 22:27:18 -0000	1.13
+++ src/hg/makeDb/doc/calJac3.txt	26 May 2010 15:35:31 -0000	1.14
@@ -1039,8 +1039,10 @@
     #	2135885920 bases of 2829687208 (75.481%) in intersection
 
 #####################################################################
 ## 13-Way Multiz (DONE - 2010-02-23 - Hiram)
+##  (Redo mafSplit and all steps followed working 2010-05-23 - Chin)
+
     mkdir /hive/data/genomes/calJac3/bed/multiz13way
     cd /hive/data/genomes/calJac3/bed/multiz13way
 
     /cluster/bin/phast/tree_doctor \
@@ -1172,8 +1174,9 @@
     ln -s ../../lastz.mm9/axtChain/calJac3.mm9.synNet.maf.gz .
     ln -s ../../lastz.canFam2/axtChain/calJac3.canFam2.synNet.maf.gz .
     ln -s ../../lastz.monDom5/axtChain/calJac3.monDom5.synNet.maf.gz .
 
+XXXX 05-24 mafSplit new option mafSplit -byTarget -useFullSequenceName
     mkdir /hive/data/genomes/calJac3/bed/multiz13way/splitMaf
     cd /hive/data/genomes/calJac3/bed/multiz13way/splitMaf
     for D in gorGor2 tarSyr1 papHam1 otoGar1 micMur1
 do
@@ -1258,8 +1261,10 @@
     grep -h -v "^#" ${F} >> calJac3.13way.maf
 done
     tail -q -n 1 maf/000.maf >> calJac3.13way.maf
 
+XXXX 05-24 split the /hive/data/genomes/calJac3/bed/multiz13way/run
+     calJac3.13way.maf to other folder with fullName for phastCons
     # load tables for a look
     mkdir -p /gbdb/calJac3/multiz13way/maf
     cd /hive/data/genomes/calJac3/bed/multiz13way/run
     ln -s `pwd`/calJac3.13way.maf \
@@ -1283,9 +1288,9 @@
     #   Loading into calJac3 table multiz13waySummary...
     #   Loading complete
     #   real    38m46.339s
 
-    # Gap Annotation
+    # Gap Annotation (DONE 2010-05-12 - chin)
     # prepare bed files with gap info
     mkdir /hive/data/genomes/calJac3/bed/multiz13way/anno
     cd /hive/data/genomes/calJac3/bed/multiz13way/anno
     mkdir maf run
@@ -1508,9 +1513,9 @@
 cd /cluster/home/mary/kent/src/hg/makeDb/trackDb/marmoset
 find -name "*.html" | xargs grep 2007
 
 # downloads
-cd /usr/local/apache/htdocs/goldenPath/calJac3
+cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3
 find -name "*" | xargs grep 2007
 
 # hgcentral
 hgsql hgcentraltest
@@ -1691,4 +1696,360 @@
 # frames multiz13wayFrames
 # irows on
     #   appears to work OK
 
+
+#############################################################################
+## create upstream refGene maf files
+    cd /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf
+    # bash script
+#!/bin/sh
+for S in 1000 2000 5000
+do
+    echo "making upstream${S}.maf"
+    featureBits calJac3 xenoRefGene:upstream:${S} -fa=/dev/null -bed=stdout \
+        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
+        | /cluster/bin/$MACHTYPE/mafFrags calJac3 multiz13way \
+                stdin stdout \
+                -orgs=/hive/data/genomes/calJac3/bed/multiz13way/species.list \
+        | gzip -c > upstream${S}.maf.gz
+    echo "done upstream${S}.maf.gz"
+done
+
+    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/multiz13way/maf
+    cd /usr/local/apache/htdocs-hgdownload/goldenPath/calJac3/multiz13way/maf
+    ln -s /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf/up*.gz .
+
+    md5sum up*.gz >> md5sum.txt
+
+
+#############################################################################
+# phastCons 13-way (working 2010-05-24 - Chin)
+    #   was unable to split the full chrom MAF files, now working on the
+    #   maf files as they were split up during multiz
+    
+    # re-split the with new -useFullSequenceName option
+    cd /hive/data/genomes/calJac3/bed/multiz13way/run
+    mkdir mafSplit-2010-05-24
+    cd mafSplit-2010-05-24
+    mafSplit -byTarget -useFullSequenceName /dev/null . ../calJac3.13way.maf
+
+    # split 13way mafs into 10M chunks and generate sufficient
+    # statistics 
+    # files for # phastCons
+    ssh swarm
+    mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split
+    mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/2010-05-24/ss
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split
+    
+    cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set MAF = /hive/data/genomes/calJac3/bed/multiz13way/run/mafSplit-2010-05-24/$c.maf 
+set WINDOWS = /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/2010-05-24/ss/$c
+set WC = `cat $MAF | wc -l`
+set NL = `grep "^#" $MAF | wc -l`
+if ( -s $2 ) then
+    exit 0
+endif
+if ( -s $2.running ) then
+    exit 0
+endif
+
+date >> $2.running
+
+rm -fr $WINDOWS
+mkdir $WINDOWS
+pushd $WINDOWS > /dev/null
+if ( $WC != $NL ) then
+/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
+    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
+popd > /dev/null
+date >> $2
+rm -f $2.running
+'_EOF_'
+    # << happy emacs
+    chmod +x doSplit.csh
+
+    cat << '_EOF_' > template
+#LOOP
+doSplit.csh $(root1) {check out line+ $(root1).done}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    #   do the easy ones first to see some immediate results
+    ls -1S -r ../../run/mafSplit-2010-05-24 \
+        | sed -e "s/.maf//; s/calJac3_//" > maf.list
+
+    gensub2 maf.list single template jobList
+    para -ram=8g create jobList
+    para try ... check ... etc
+    # Completed: 10769 of 10770 jobs
+    # Crashed: 1 jobs (chrUn_GL286217, rerun OK)
+    # CPU time in finished jobs:   4171s      69.52m     1.16h    0.05d  0.000 y
+    # IO & Wait Time:            147369s    2456.15m    40.94h    1.71d  0.005 y
+    # Average job time:                  14s       0.23m     0.00h    0.00d
+    # Longest finished job:             570s       9.50m     0.16h    0.01d
+    # Submission to last job:          1351s      22.52m     0.38h    0.02d
+
+    # some scaffolds were too small to produce output. 
+    # expected 10769, found 4749
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/msa.split/2010-05-24/ss
+    find -type f | wc -l
+    #  4749
+    cd ../..
+    rm *.done
+
+
+======= Skip begin =======>
+    # Estimate phastCons parameters
+    #   experimented with this as a parasol job on hgwdevnew to try a
+    #   number
+    #   of SS files.  With a command of:
+
+/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
+--tree "(((((((((((((((((calJac3,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
+--out-root=$OUT/starting_tree
+
+    #   running over the input files ../ss/*/*.ss results to
+#.../genomes/calJac3/bed/multiz13way/cons/startingTree/result/*/starting-tree.mod
+
+    # add up the C and G: 
+    find ./result -type f | xargs ls -rt | while read F
+do  
+    D=`dirname $F`
+    echo -n `basename $D`" - " 
+    grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
+done
+    #   counting number of species seen in the maf file:
+    find ./result -type f | xargs ls -rt | while read F
+do  
+    D=`dirname $F`
+    echo -n `basename $D`" - "
+    grep TREE $F | sed -e \
+"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g"  | tr ',' '\n' | wc -l
+done
+<======= Skip end ==========
+
+    # Run phastCons
+    #   This job is I/O intensive in its output files, beware where this
+    #   takes place or do not run too many at once.
+    ssh swarm
+    mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/cons/run.cons
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/run.cons
+
+    #   there are going to be several different phastCons runs using
+    #   this same script.  They trigger off of the current working
+    #   directory
+    #   $cwd:t which is the "grp" in this script.  It is one of:
+    #   all primates placentals
+
+    cat << '_EOF_' > doPhast.csh
+#!/bin/csh -fe 
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set c = $1
+set cX = $1:r
+set f = $2
+set len = $3
+set cov = $4
+set rho = $5
+set grp = $cwd:t
+set cons = /hive/data/genomes/calJac3/bed/multiz13way/cons
+set tmp = $cons/tmp/$f
+mkdir -p $tmp
+set ssSrc = $cons
+set useGrp = "$grp.mod"
+if ( $cX == "chrX" ) then
+    set useGrp = "$grp.chrX.mod"
+endif
+if (-s $cons/$grp/$grp.non-inf) then
+  ln -s $cons/$grp/$grp.mod $tmp 
+  ln -s $cons/$grp/$grp.chrX.mod $tmp
+  ln -s $cons/$grp/$grp.non-inf $tmp
+  ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
+else 
+  ln -s $ssSrc/msa.split/2009-10-21/ss/$c/$f.ss $tmp
+  ln -s $cons/$grp/$grp.mod $tmp 
+  ln -s $cons/$grp/$grp.chrX.mod $tmp
+endif 
+pushd $tmp > /dev/null
+if (-s $grp.non-inf) then
+  $PHASTBIN/phastCons $f.ss $useGrp \ 
+    --rho $rho --expected-length $len --target-coverage $cov --quiet \
+    --not-informative `cat $grp.non-inf` \
+    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+else
+  $PHASTBIN/phastCons $f.ss $useGrp \ 
+    --rho $rho --expected-length $len --target-coverage $cov --quiet \
+    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+endif
+popd > /dev/null
+mkdir -p pp/$c bed/$c
+sleep 4
+touch pp/$c bed/$c
+rm -f pp/$c/$f.pp
+rm -f bed/$c/$f.bed
+mv $tmp/$f.pp pp/$c
+mv $tmp/$f.bed bed/$c
+rm -fr $tmp
+'_EOF_'
+    # << happy emacs
+    chmod a+x doPhast.csh
+
+    #   this template will serve for all runs
+    #   root1 == chrom name, file1 == ss file name without .ss suffix
+    cat << '_EOF_' > template
+#LOOP
+../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    ls -1S ../msa.split/2009-10-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
+
+    # Create parasol batch and run it
+    # run for all species
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons
+    mkdir -p all
+    cd all
+    #   Using the two different .mod tree 
+    cp -p ../../4dNoX/phyloFit.NoChrX.mod ./all.mod
+    cp -p ../../4dX/phyloFit.chrX.mod ./all.chrX.mod
+
+    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+    para -ram=8g create jobList 
+    para try ... check ... push ... etc.
+# Completed: 581 of 581 jobs
+# CPU time in finished jobs:      41877s     697.95m    11.63h    0.48d
+# 0.001 y
+# IO & Wait Time:                 39172s     652.87m    10.88h    0.45d
+# 0.001 y
+# Average job time:                 139s       2.32m     0.04h    0.00d
+# Longest finished job:             329s       5.48m     0.09h    0.00d
+# Submission to last job:          2240s      37.33m     0.62h    0.03d
+
+    # create Most Conserved track
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
+    cut -f1 ../../../../chrom.sizes | while read C
+do  
+    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.bed
+    done | sort -k1,1 -k2,2n \
+    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
+done > tmpMostConserved.bed
+/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
+
+
+    # load into database
+    ssh hgwdev
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
+    time nice -n +19 hgLoadBed calJac3 phastConsElements13way mostConserved.bed
+    #   Loaded 5163775 elements of size 6
+    #   real     1m44.439s
+
+    # Try for 5% overall cov, and 70% CDS cov 
+    featureBits calJac3 -enrichment refGene:cds phastConsElements13way
+    #   --rho 0.3 --expected-length 45 --target-coverage 0.3
+    #   refGene:cds 1.187%, phastConsElements13way 5.065%,
+    #   both 0.884%, cover 74.46%, enrich 14.70x
+
+    # Create merged posterier probability file and wiggle track data
+    # files
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
+    mkdir downloads
+    cat << '_EOF_' > phastCat.sh 
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+    echo -n "${C} ... working ... "
+    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+    done | gzip > downloads/${C}.phastCons13way.wigFix.gz
+    echo "done"
+done
+'_EOF_'
+    #   << happy emacs
+    chmod +x phastCat.sh
+    time nice -n +19 ./phastCat.sh
+    #   real    30m2.623s
+    
+    #   encode those files into wiggle data
+    zcat downloads/*.wigFix.gz \
+        | wigEncode stdin phastCons13way.wig phastCons13way.wib
+    #   Converted stdin, upper limit 1.00, lower limit 0.00
+    #   real    18m37.881s
+    du -hsc *.wi?
+    #   2.7G    phastCons13way.wib
+    #   271M    phastCons13way.wig
+    #   3.0G    total
+    
+    #   encode into a bigWig file:
+    #   (warning wigToBigWig process grows to about 36 Gb)
+    #   in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+    zcat downloads/*.wigFix.gz \
+        | wigToBigWig stdin ../../../../chrom.sizes phastCons13way.bw
+    #   real    52m36.142s
+# -rw-rw-r--   1 21667535139 Oct 20 13:59 phastCons13way.bw
+    mkdir /gbdb/calJac3/bbi
+    ln -s `pwd`/phastCons13way.bw /gbdb/calJac3/bbi
+    #   if you wanted to use the bigWig file, loading bigWig table:
+    hgsql calJac3 -e 'drop table if exists phastCons13way; \
+            create table phastCons13way (fileName varchar(255) not null); \
+            insert into phastCons13way values
+        ("/gbdb/calJac3/bbi/phastCons13way.bw");'
+
+    # Load gbdb and database with wiggle.
+    ssh hgwdev
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
+    ln -s `pwd`/phastCons13way.wib /gbdb/calJac3/multiz13way/phastCons13way.wib
+    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/calJac3/multiz13way calJac3 \
+        phastCons13way phastCons13way.wig
+    #   real    1m45.381s
+
+    wigTableStats.sh calJac3 phastCons13way
+# db.table      min max mean count sumData
+# calJac3.phastCons13way     0 1 0.103653 2845303719 2.94924e+08
+#       stdDev viewLimits
+#       0.230184 viewLimits=0:1
+
+    #  Create histogram to get an overview of all the data
+    ssh hgwdev
+    cd /hive/data/genomes/calJac3/bed/multiz13way/cons/all
+    time nice -n +19 hgWiggle -doHistogram -db=calJac3 \
+        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+            phastCons13way > histogram.data 2>&1
+    #   real    7m37.212s
+
+    #   create plot of histogram:
+    cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Human Hg19 Histogram phastCons13way track"
+set xlabel " phastCons13way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics 
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+    #   << happy emacs
+
+    display histo.png &
+
+
+
+#############################################################################