src/hg/makeDb/doc/felCatV17e.txt 1.13

1.13 2010/05/07 17:20:37 chinhli
Complete multiz6way
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 4 -r1.12 -r1.13
--- src/hg/makeDb/doc/felCatV17e.txt	5 May 2010 14:56:10 -0000	1.12
+++ src/hg/makeDb/doc/felCatV17e.txt	7 May 2010 17:20:37 -0000	1.13
@@ -750,9 +750,9 @@
     #   reset to hguser in .hg.conf.beta
 
 
 #####################################################################
-## 6-Way Multiz (working - 2010-04-19 - Chin)
+## 6-Way Multiz (DONE - 2010-04-19 - Chin)
 # use /cluster/home/chinhli/kent/src/hg/utils/phyloTrees/49way.nh 
     mkdir /hive/data/genomes/felCatV17e/bed/multiz6way
     cd /hive/data/genomes/felCatV17e/bed/multiz6way
 
@@ -980,35 +980,155 @@
 do
     grep -h -v "^#" ${F} >> felCatV17e.6way.maf
 done
     tail -q -n 1 maf/000.maf >> felCatV17e.6way.maf
-    tail -q -n 1 maf/hg19_${C}.*.maf | sort -u >> ../maf/${C}.maf
 
     #   real    13m32.340s
 
     # load tables for a look
     mkdir -p /gbdb/felCatV17e/multiz6way/maf
-    cd /hive/data/genomes/felCatV17e/bed/multiz6way/maf
+    # cd /hive/data/genomes/felCatV17e/bed/multiz6way/maf
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/run
+
     ln -s `pwd`/felCatV17e.6way.maf \
         /gbdb/felCatV17e/multiz6way/maf/multiz6way.maf
 
     # this generates an immense multiz6way.tab file in the directory
     #   where it is running.  Best to run this over in scratch.
     cd /data/tmp
     time nice -n +19 hgLoadMaf \
         -pathPrefix=/gbdb/felCatV17e/multiz6way/maf felCatV17e multiz6way
-    #   Loaded 13316945 mafs in 1 files from
-    #   /gbdb/felCatV17e/multiz6way/maf
-    #   real    9m9.365s
+    # Indexing and tabulating /gbdb/felCatV17e/multiz6way/maf/multiz6way.maf
+    # Loading multiz6way into database
+    # Loaded 5500990 mafs in 1 files from /gbdb/felCatV17e/multiz6way/maf
+    # 
+    # real    5m56.770s
 
     # load summary table
     time nice -n +19 cat /gbdb/felCatV17e/multiz6way/maf/*.maf \
         | hgLoadMafSummary felCatV17e -minSize=30000 -verbose=2 \
                 -mergeGap=1500 -maxSize=200000  multiz6waySummary stdin
-# Created 2330531 summary blocks from 99659162 components and
-#       13316945 mafs from stdin
-    #   real    17m54.685s
+    # Created 1029626 summary blocks from 15177161 components and 
+    #      5500990 mafs from stdin
+    # real    9m46.074s
+
+    # Gap Annotation
+    # prepare bed files with gap info
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/anno
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/anno
+    mkdir maf run
+
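+    #   Most of these N.bed files will already exist from previous
+    #   multiple alignments.  The loop below only echoes the twoBitInfo
+    #   command for any that are missing (a dry run); once the loop
+    #   output looks correct, remove the "echo" in front of twoBitInfo
+    #   and run it again to actually create the missing files.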
+    for DB in `cat ../species.list`
+do
+    CDIR="/hive/data/genomes/${DB}"
+    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
+        echo "creating ${DB}.N.bed"
+        echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
+    else
+        ls -og ${CDIR}/${DB}.N.bed
+    fi
+done
+
+    cd run
+    rm -f nBeds sizes
+    for DB in `sed -e "s/felCatV17e //" ../../species.list`
+do
+    echo "${DB} "
+    ln -s  /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed
+    echo ${DB}.bed  >> nBeds
+    ln -s  /hive/data/genomes/${DB}/chrom.sizes ${DB}.len
+    echo ${DB}.len  >> sizes
+done
 
+    #   the annotation step requires large memory, run on memk nodes
+
+    ssh memk
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/anno/run
+    ls ../../run/maf | sed -e "s/.maf//" > chr.list
+    cat << '_EOF_' > template
+#LOOP
+./anno.csh $(root1) {check out line+ ../maf/$(root1).maf}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    cat << '_EOF_' > anno.csh
+#!/bin/csh -fe
+
+set inMaf = ../../run/maf/$1.maf
+set outMaf = ../maf/$1.maf
+rm -f $outMaf
+mafAddIRows -nBeds=nBeds $inMaf /hive/data/genomes/felCatV17e/felCatV17e.2bit $outMaf
+'_EOF_'
+    # << happy emacs
+    chmod +x anno.csh
+
+    gensub2 chr.list single template jobList
+    para -ram=30g create jobList
+    #   specify lots of ram to get one job per node
+    para -ram=30g push
+    #   
+# para time
+# Completed: 256 of 256 jobs
+# CPU time in finished jobs:       3735s      62.25m     1.04h    0.04d  0.000 y
+# IO & Wait Time:                  2382s      39.70m     0.66h    0.03d  0.000 y
+# Average job time:                  24s       0.40m     0.01h    0.00d
+# Longest finished job:             600s      10.00m     0.17h    0.01d
+# Submission to last job:           787s      13.12m     0.22h    0.01d
+
+    ssh hgwdev
+    rm -fr /gbdb/felCatV17e/multiz6way/maf
+    mkdir /gbdb/felCatV17e/multiz6way/maf
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/anno/maf
+    ln -s `pwd`/*.maf /gbdb/felCatV17e/multiz6way/maf/
+    #   Loading this into the table multiz6way replaces the previously
+    #   loaded table, which held the unannotated mafs.
+    #   Huge temp files are made, so run the load on local disk.
+    cd /data/tmp
+    time nice -n +19 hgLoadMaf \
+        -pathPrefix=/gbdb/felCatV17e/multiz6way/maf felCatV17e multiz6way
+    #   real    93m33.812s
+    #   Loading multiz6way into database
+    #   Loaded 6501831 mafs in 256 files from /gbdb/felCatV17e/multiz6way/maf
+
+    time nice -n +19 cat /gbdb/felCatV17e/multiz6way/maf/*.maf \
+        | hgLoadMafSummary felCatV17e -minSize=30000 -mergeGap=1500 \
+                 -maxSize=200000  multiz6waySummary stdin
+    #   Created 1029626 summary blocks from 15177161 components
+    #   and 6501831 mafs from stdin
+    #   Loading into felCatV17e table multiz6waySummary...
+    #   Loading complete
+    #   real    72m12.808s
+
+    #   Loading this into the table multiz6waySummary replaces the
+    #   previously loaded summary table, which was built from the
+    #   unannotated mafs.
+    #   Remove the multiz6way*.tab files in this /data/tmp directory:
+# -rw-rw-r-- 1 338738590 May  6 15:49 multiz6way.tab
+# -rw-rw-r-- 1  49960609 May  6 17:04 multiz6waySummary.tab
+    wc -l multiz6way*.tab
+  #  6501831 multiz6way.tab
+  #  1029626 multiz6waySummary.tab
+  #  7531457 total
+    rm multiz6way*.tab
+
+    # create some downloads
+    mkdir -p /hive/data/genomes/felCatV17e/bed/multiz6way/download/maf
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/download/maf
+    # time cp -p ../../anno/maf/chr*.maf .
+    time cp -p ../../anno/maf/*.maf .
+    #   real    6m55.315s
+    time gzip --rsyncable *.maf
+    #   real    30m37.474s
+    time md5sum *.gz > md5sum.txt
+    #   real    1m36.814s
+    #   user    0m13.210s
+    #   sys     0m2.212s
 
 
 ####################################################################