src/hg/makeDb/doc/felCatV17e.txt 1.16

1.16 2010/05/26 15:35:31 chinhli
htdocs-hgdownload change
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 4 -r1.15 -r1.16
--- src/hg/makeDb/doc/felCatV17e.txt	19 May 2010 22:23:46 -0000	1.15
+++ src/hg/makeDb/doc/felCatV17e.txt	26 May 2010 15:35:31 -0000	1.16
@@ -751,8 +751,10 @@
 
 
 #####################################################################
 ## 6-Way Multiz (DONE - 2010-05-07 - Chin)
+##  (Redo mafSplit and all steps followed working 2010-05-23 - Chin)
+ 
 # use /cluster/home/chinhli/kent/src/hg/utils/phyloTrees/49way.nh 
     mkdir /hive/data/genomes/felCatV17e/bed/multiz6way
     cd /hive/data/genomes/felCatV17e/bed/multiz6way
 
@@ -888,23 +890,23 @@
     ln -s ../../lastz.monDom5/axtChain/felCatV17e.monDom5.synNet.maf.gz .
     # N50 for ailMel1 is 1281781 (less than 10 ~ 20 MB) use rbest
     ln -s ../../lastz.ailMel1/mafRBestNet/felCatV17e.ailMel1.rbest.maf.gz .
 
-
+# 2010-05-24: mafSplit has a new option: mafSplit -byTarget -useFullSequenceName
     # to use rbest or net, use n50.pl chrom.size to tell
     mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/splitMaf
     cd /hive/data/genomes/felCatV17e/bed/multiz6way/splitMaf
     for D in ailMel1
 do
     mkdir ${D}
-    mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \
+    mafSplit -useFullSequenceName -byTarget /dev/null ${D}/ \
         ../singleMafs/felCatV17e.${D}.rbest.maf.gz
 done
 
 for D in hg19 mm9 canFam2 monDom5
 do
     mkdir ${D}
-    mafSplit -useHashedName=8 -byTarget /dev/null ${D}/ \
+    mafSplit -useFullSequenceName -byTarget /dev/null ${D}/ \
         ../singleMafs/felCatV17e.${D}.synNet.maf.gz
 done
 
 
@@ -1302,9 +1304,9 @@
 
 
 ############################################################################
 ## Annotate 6-way multiple alignment with gene annotations
-##              (Done 2010-05-07 - Chin)
+##              (DONE 2010-05-07 - Chin)
     # Gene frames
     ## survey all genomes to see what type of gene track to use
     ## Rules to decide which genes to use in order:
     ##    1. ucscGene
@@ -1403,60 +1405,9 @@
     echo "${DB} done"
 done
 
     #   the following single command doesn't work on any 32 Gb computer,
-    #   requires much more memory, turn it into a kluster job, see below
-    #   ...
-#### Skipped begin ====>
-    #   Create this command with this script:
-    cat << '_EOF_' > mkCmd.sh
-#!/bin/sh
-
-echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \\"
-for G in hg19 mm9
-do  
-    if [ ! -s genes/${G}.gp.gz ]; then
-        echo "missing genes/${G}.gp.gz"
-        exit 255
-    fi
-    echo -n "${G} genes/${G}.gp.gz "
-done 
-echo "\\"
-for D in `sort ensGene.list`
-do
-    if [ ! -s genes/${D}.gp.gz ]; then
-        echo "missing genes/${D}.gp.gz"
-        exit 255
-    fi
-    echo -n "${D} genes/${D}.gp.gz "
-done
-echo "\\"
-for D in `sort xenoRef.list`
-do
-    if [ ! -s genes/${D}.gp.gz ]; then
-        echo "missing genes/${D}.gp.gz"
-        exit 255
-    fi
-    echo -n "${D} genes/${D}.gp.gz "
-done
-echo "\\"
-echo "    | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1"
-'_EOF_'
-    # << happy emacs
-    chmod +x ./mkCmd.sh
-
-    #   this doesn't work on any 32 Gb computer, requires much more
-    #   memory
-    #   turn it into a kluster job, see below
-    time (cat ../run/maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \
-mm9 genes/mm9.gp.gz hg19 genes/hg19.gp.gz ailMel1 genes/ailMel1  \
-canFam2 genes/canFam2.gp.gz monDom5 genes/monDom5 \
-    | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
-#### Skipped end  <====== 
-
-    #   that doesn't work on any 32 Gb computer, requires much more
-    #   memory
-    #   turn it into a kluster job
+    #   requires much more memory, turn it into a kluster job
     ssh swarm
     cd /hive/data/genomes/felCatV17e/bed/multiz6way/frames
     cat << '_EOF_' > runOne
 #!/bin/csh -fe
@@ -1543,13 +1494,336 @@
         | gzip -c > upstream${S}.maf.gz
     echo "done upstream${S}.maf.gz"
 done
 
-    mkdir -p /usr/local/apache/htdocs/goldenPath/felCatV17e/multiz6way/maf
-    cd /usr/local/apache/htdocs/goldenPath/felCatV17e/multiz6way/maf
+    mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/felCatV17e/multiz6way/maf
+    cd /usr/local/apache/htdocs-hgdownload/goldenPath/felCatV17e/multiz6way/maf
     ln -s /hive/data/genomes/felCatV17e/bed/multiz6way/downloads/maf/up*.gz .
     md5sum up*.gz >> md5sum.txt
 
 
+#############################################################################
+# phastCons 6-way (working - 2010-05-20 - Chin)
+    #   was unable to split the full chrom MAF files, now working on the
+    #   maf files as they were split up during multiz
+    
+    # split 6way mafs into 10M chunks and generate sufficient
+    # statistics files for phastCons
+    ssh swarm
+    mkdir -p /hive/data/genomes/felCatV17e/bed/multiz6way/cons/msa.split
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/cons/ss
+    mkdir -p /hive/data/genomes/felCatV17e/bed/multiz6way/cons/msa.split/2010-05-21
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/msa.split/2010-05-21
+    mkdir ss
+    
+    cat << '_EOF_' > doSplit.csh
+#!/bin/csh -ef
+set c = $1
+set MAF = /hive/data/genomes/felCatV17e/bed/multiz6way/anno/maf/*.maf
+set WINDOWS = /hive/data/genomes/felCatV17e/bed/multiz6way/cons/msa.split/2010-05-21/ss/$c
+set WC = `cat $MAF | wc -l`
+set NL = `grep "^#" $MAF | wc -l`
+if ( -s $2 ) then
+    exit 0
+endif
+if ( -s $2.running ) then
+    exit 0
+endif
+
+date >> $2.running
+
+rm -fr $WINDOWS
+mkdir $WINDOWS
+pushd $WINDOWS > /dev/null
+if ( $WC != $NL ) then
+/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
+    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
+endif
+popd > /dev/null
+date >> $2
+rm -f $2.running
+'_EOF_'
+    # << happy emacs
+    chmod +x doSplit.csh
+
+    cat << '_EOF_' > template
+#LOOP
+doSplit.csh $(root1) {check out line+ $(root1).done}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    #   do the easy ones first to see some immediate results
+    ls -1S -r ../../../anno/maf | sed -e "s/.maf//; s/felCatV17e_//" > maf.list
+
+    gensub2 maf.list single template jobList
+    para -ram=8g create jobList
+    para try ... check ... etc
+# Completed: 503 of 504 jobs
+# Crashed: 1 jobs
+# CPU time in finished jobs:      14171s     236.18m     3.94h    0.16d
+# 0.000 y
+# IO & Wait Time:                188193s    3136.55m    52.28h    2.18d
+# 0.006 y
+# Average job time:                 402s       6.71m     0.11h    0.00d
+# Longest finished job:            1597s      26.62m     0.44h    0.02d
+# Submission to last job:          2586s      43.10m     0.72h    0.03d
+    #   the one crashed job is felCatV17e_chr18_gl000207_random.00.maf
+
+    #   XXX - this did not work
+    #   this takes a really long time.  memk was down to 2 usable
+    #   machines - got it finished manually on a combination of
+    #   hgwdevnew CPUs
+    #   and other machines
+
+    # Estimate phastCons parameters
+    #   experimented with this as a parasol job on hgwdevnew to try a
+    #   number
+    #   of SS files.  With a command of:
+
+/cluster/bin/phast/x86_64/phyloFit -i SS ${SS} \
+--tree "(((((((((((((((((felCatV17e,panTro2),gorGor1),ponAbe2),rheMac2),calJac1),tarSyr1),(micMur1,otoGar1)),tupBel1),(((((mm9,rn4),dipOrd1),cavPor3),speTri1),(oryCun1,ochPri2))),(((vicPac1,(turTru1,bosTau4)),((equCab2,(felCat3,canFam2)),(myoLuc1,pteVam1))),(eriEur1,sorAra1))),(((loxAfr2,proCap1),echTel1),(dasNov2,choHof1))),monDom4),ornAna1),((galGal3,taeGut1),anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat2)),danRer5)),petMar1)" \
+--out-root=$OUT/starting_tree
 
+    #   running over the input files ../ss/*/*.ss results to
+#.../genomes/felCatV17e/bed/multiz6way/cons/startingTree/result/*/starting-tree.mod
 
-####################################################################
+    # add up the C and G: 
+    find ./result -type f | xargs ls -rt | while read F
+do  
+    D=`dirname $F`
+    echo -n `basename $D`" - " 
+    grep BACKGROUND ${F} | awk '{printf "%0.3f\n", $3 + $4;}'
+done
+    #   counting number of species seen in the maf file:
+    find ./result -type f | xargs ls -rt | while read F
+do  
+    D=`dirname $F`
+    echo -n `basename $D`" - "
+    grep TREE $F | sed -e \
+"s/TREE: //; s/(//g; s/)//g; s/[0-9].[0-9][0-9][0-9][0-9][0-9][0-9]//g; s/://g"  | tr ',' '\n' | wc -l
+done
+
+
+    # Run phastCons
+    #   This job is I/O intensive in its output files, beware where this
+    #   takes place or do not run too many at once.
+    ssh swarm
+    mkdir -p /hive/data/genomes/felCatV17e/bed/multiz6way/cons/run.cons
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/run.cons
+
+    #   there are going to be several different phastCons runs using
+    #   this same script.  They trigger off of the current working
+    #   directory
+    #   $cwd:t which is the "grp" in this script.  It is one of:
+    #   all primates placentals
+
+    cat << '_EOF_' > doPhast.csh
+#!/bin/csh -fe 
+set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
+set c = $1
+set cX = $1:r
+set f = $2
+set len = $3
+set cov = $4
+set rho = $5
+set grp = $cwd:t
+set cons = /hive/data/genomes/felCatV17e/bed/multiz6way/cons
+set tmp = $cons/tmp/$f
+mkdir -p $tmp
+set ssSrc = $cons
+set useGrp = "$grp.mod"
+if ( $cX == "chrX" ) then
+    set useGrp = "$grp.chrX.mod"
+endif
+if (-s $cons/$grp/$grp.non-inf) then
+  ln -s $cons/$grp/$grp.mod $tmp 
+  ln -s $cons/$grp/$grp.chrX.mod $tmp
+  ln -s $cons/$grp/$grp.non-inf $tmp
+  ln -s $ssSrc/msa.split/2010-05-21/ss/$c/$f.ss $tmp
+else 
+  ln -s $ssSrc/msa.split/2010-05-21/ss/$c/$f.ss $tmp
+  ln -s $cons/$grp/$grp.mod $tmp 
+  ln -s $cons/$grp/$grp.chrX.mod $tmp
+endif 
+pushd $tmp > /dev/null
+if (-s $grp.non-inf) then
+  $PHASTBIN/phastCons $f.ss $useGrp \
+    --rho $rho --expected-length $len --target-coverage $cov --quiet \
+    --not-informative `cat $grp.non-inf` \
+    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+else
+  $PHASTBIN/phastCons $f.ss $useGrp \
+    --rho $rho --expected-length $len --target-coverage $cov --quiet \
+    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
+endif
+popd > /dev/null
+mkdir -p pp/$c bed/$c
+sleep 4
+touch pp/$c bed/$c
+rm -f pp/$c/$f.pp
+rm -f bed/$c/$f.bed
+mv $tmp/$f.pp pp/$c
+mv $tmp/$f.bed bed/$c
+rm -fr $tmp
+'_EOF_'
+    # << happy emacs
+    chmod a+x doPhast.csh
+
+    #   this template will serve for all runs
+    #   root1 == chrom name, file1 == ss file name without .ss suffix
+    cat << '_EOF_' > template
+#LOOP
+../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    ls -1S ../msa.split/2010-05-21/ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
+
+    # Create parasol batch and run it
+    # run for all species
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons
+    mkdir -p all
+    cd all
+    #   Using the two different .mod tree 
+    cp -p ../../4dNoX/phyloFit.NoChrX.mod ./all.mod
+    cp -p ../../4dX/phyloFit.chrX.mod ./all.chrX.mod
+
+    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
+    para -ram=8g create jobList 
+    para try ... check ... push ... etc.
+# Completed: 581 of 581 jobs
+# CPU time in finished jobs:      41877s     697.95m    11.63h    0.48d
+# 0.001 y
+# IO & Wait Time:                 39172s     652.87m    10.88h    0.45d
+# 0.001 y
+# Average job time:                 139s       2.32m     0.04h    0.00d
+# Longest finished job:             329s       5.48m     0.09h    0.00d
+# Submission to last job:          2240s      37.33m     0.62h    0.03d
+
+    # create Most Conserved track
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/all
+    cut -f1 ../../../../chrom.sizes | while read C
+do  
+    ls -d bed/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.bed
+    done | sort -k1,1 -k2,2n \
+    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
+done > tmpMostConserved.bed
+/cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed
+
+
+    # load into database
+    ssh hgwdev
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/all
+    time nice -n +19 hgLoadBed felCatV17e phastConsElements6way mostConserved.bed
+    #   Loaded 5163775 elements of size 6
+    #   real     1m44.439s
+
+    # Try for 5% overall cov, and 70% CDS cov 
+    featureBits felCatV17e -enrichment refGene:cds phastConsElements6way
+    #   --rho 0.3 --expected-length 45 --target-coverage 0.3
+    #   refGene:cds 1.187%, phastConsElements6way 5.065%,
+    #   both 0.884%, cover 74.46%, enrich 14.70x
+
+    # Create merged posterior probability file and wiggle track data
+    # files
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/all
+    mkdir downloads
+    cat << '_EOF_' > phastCat.sh 
+#!/bin/sh
+
+mkdir -p downloads
+cut -f1 ../../../../chrom.sizes | while read C
+do
+    echo -n "${C} ... working ... "
+    ls -d pp/${C}.[0-9][0-9] 2> /dev/null | while read D
+    do
+        cat ${D}/${C}*.pp | sed -e "s/chrom=${C}.[0-9][0-9]/chrom=${C}/"
+    done | gzip > downloads/${C}.phastCons6way.wigFix.gz
+    echo "done"
+done
+'_EOF_'
+    #   << happy emacs
+    chmod +x phastCat.sh
+    time nice -n +19 ./phastCat.sh
+    #   real    30m2.623s
+    
+    #   encode those files into wiggle data
+    zcat downloads/*.wigFix.gz \
+        | wigEncode stdin phastCons6way.wig phastCons6way.wib
+    #   Converted stdin, upper limit 1.00, lower limit 0.00
+    #   real    18m37.881s
+    du -hsc *.wi?
+    #   2.7G    phastCons6way.wib
+    #   271M    phastCons6way.wig
+    #   3.0G    total
+    
+    #   encode into a bigWig file:
+    #   (warning wigToBigWig process grows to about 36 Gb)
+    #   in bash, to avoid the 32 Gb memory limit:
+sizeG=188743680
+export sizeG
+ulimit -d $sizeG
+ulimit -v $sizeG
+    zcat downloads/*.wigFix.gz \
+        | wigToBigWig stdin ../../../../chrom.sizes phastCons6way.bw
+    #   real    52m36.142s
+# -rw-rw-r--   1 21667535139 Oct 20 13:59 phastCons6way.bw
+    mkdir /gbdb/felCatV17e/bbi
+    ln -s `pwd`/phastCons6way.bw /gbdb/felCatV17e/bbi
+    #   if you wanted to use the bigWig file, loading bigWig table:
+    hgsql felCatV17e -e 'drop table if exists phastCons6way; \
+            create table phastCons6way (fileName varchar(255) not null); \
+            insert into phastCons6way values
+        ("/gbdb/felCatV17e/bbi/phastCons6way.bw");'
+
+    # Load gbdb and database with wiggle.
+    ssh hgwdev
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/all
+    ln -s `pwd`/phastCons6way.wib /gbdb/felCatV17e/multiz6way/phastCons6way.wib
+    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/felCatV17e/multiz6way felCatV17e \
+        phastCons6way phastCons6way.wig
+    #   real    1m45.381s
+
+    wigTableStats.sh felCatV17e phastCons6way
+# db.table      min max mean count sumData
+# felCatV17e.phastCons6way     0 1 0.103653 2845303719 2.94924e+08
+#       stdDev viewLimits
+#       0.230184 viewLimits=0:1
+
+    #  Create histogram to get an overview of all the data
+    ssh hgwdev
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/cons/all
+    time nice -n +19 hgWiggle -doHistogram -db=felCatV17e \
+        -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
+            phastCons6way > histogram.data 2>&1
+    #   real    7m37.212s
+
+    #   create plot of histogram:
+    cat << '_EOF_' | gnuplot > histo.png
+set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
+set size 1.4, 0.8
+set key left box
+set grid noxtics
+set grid ytics
+set title " Cat felCatV17e Histogram phastCons6way track"
+set xlabel " phastCons6way score"
+set ylabel " Relative Frequency"
+set y2label " Cumulative Relative Frequency (CRF)"
+set y2range [0:1]
+set y2tics 
+set yrange [0:0.02]
+
+plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
+        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
+'_EOF_'
+    #   << happy emacs
+
+    display histo.png &
+
+
+
+#############################################################################