src/hg/makeDb/doc/danRer6.txt 1.17

1.17 2010/02/16 23:42:14 galt
multiz6way, annotated mafs, annotated and upstream downloads, frames
Index: src/hg/makeDb/doc/danRer6.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/danRer6.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 4 -r1.16 -r1.17
--- src/hg/makeDb/doc/danRer6.txt	26 Jan 2010 23:54:07 -0000	1.16
+++ src/hg/makeDb/doc/danRer6.txt	16 Feb 2010 23:42:14 -0000	1.17
@@ -1040,5 +1040,337 @@
     # Make README.txt
     ln -s /cluster/data/danRer6/goldenPath/chromosomes \
       /usr/local/apache/htdocs/goldenPath/danRer6/chromosomes
 
+
+#########################################################################
+## 6-Way Multiz (WORKING - 2010-02-01 - Galt)
+    ssh hgwdev
+    mkdir /cluster/data/danRer6/bed/multiz6way
+    cd /cluster/data/danRer6/bed/multiz6way
+
+    wget -O 46way.nh http://hgdownload.cse.ucsc.edu/goldenPath/hg19/multiz46way/46way.corrected.nh
+
+# ((Zebrafish,
+# (Stickleback,
+# Tetraodon)),
+# (X_tropicalis,
+# (Mouse,
+# Human))) 
+
+
+    #	All distances remain as specified in the 46way.nh
+    /cluster/bin/phast/tree_doctor --prune-all-but danRer6,gasAcu1,tetNig2,xenTro2,mm9,hg19 46way.nh > 6way.nh
+
+#(((hg19:0.131183,mm9:0.352605):0.525173,xenTro2:0.852482):0.300396,((tetNig2:0.416610,gasAcu1:0.372371):0.322824,danRer6:0.731166):0.155214);
+
+    #	Use this specification in the phyloGif tool:
+    #	http://genome.ucsc.edu/cgi-bin/phyloGif
+    #	to obtain a gif image for /usr/local/apache/htdocs/images/phylo/danRer6_6way.gif
+    #   See below, I redo this on species names.
+    wget -O /usr/local/apache/htdocs/images/phylo/danRer6_6way.gif \
+'http://genome.ucsc.edu/cgi-bin/phyloGif?hgsid=151159276&phyloGif_width=200&phyloGif_height=320&boolshad.phyloGif_branchLengths=0&boolshad.phyloGif_lengthLegend=0&boolshad.phyloGif_branchLabels=0&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&phyloGif_underscores=on&boolshad.phyloGif_underscores=0&boolshad.phyloGif_htmlPage=0&phyloGif_tree=(((Human%3A0.131183%2CMouse%3A0.352605)%3A0.525173%2CX.tropicalis%3A0.852482)%3A0.300396%2C((Tetraodon%3A0.416610%2CStickleback%3A0.372371)%3A0.322824%2CZebrafish%3A0.731166)%3A0.155214)%3B%0D%0A&phyloGif_submit=submit'
+
+    /cluster/bin/phast/all_dists 6way.nh > 6way.distances.txt
+    #	Use this output to create the table below
+    grep -i danRer6 6way.distances.txt | sort -k3,3n
+
+#gasAcu1 danRer6 1.426361
+#tetNig2 danRer6 1.470600
+#hg19    danRer6 1.843132
+#xenTro2 danRer6 2.039258
+#mm9     danRer6 2.064554
+
+
+# hiram says these should already have been created 
+# and are in the build dirs already.
+
+featureBits danRer6 chainGasAcu1Link
+#147479454 bases of 1506896106 (9.787%) in intersection
+featureBits gasAcu1 chainDanRer6Link
+#114451338 bases of 446627861 (25.626%) in intersection
+
+featureBits danRer6 chainTetNig2Link
+#93349443 bases of 1506896106 (6.195%) in intersection
+featureBits tetNig2 chainDanRer6Link
+#70626082 bases of 302314788 (23.362%) in intersection
+
+featureBits danRer6 chainHg19Link
+#96424507 bases of 1506896106 (6.399%) in intersection
+featureBits hg19 chainDanRer6Link
+#88391631 bases of 2897316137 (3.051%) in intersection
+
+featureBits danRer6 chainXenTro2Link
+#100078259 bases of 1506896106 (6.641%) in intersection
+featureBits xenTro2 chainDanRer6Link
+#92089833 bases of 1359412157 (6.774%) in intersection
+
+featureBits danRer6 chainMm9Link
+#77099032 bases of 1506896106 (5.116%) in intersection
+featureBits mm9 chainDanRer6Link
+#73444297 bases of 2620346127 (2.803%) in intersection
+
+#
+#	If you can fill in all the numbers in this table, you are ready for
+#	the multiple alignment procedure
+#
+
+#                         featureBits chainLink measures
+#                                           chainDanRer6Link  chain linearGap
+#    distance                    on danRer6  on other     minScore
+#  1  1.43 - stickleback gasAcu1   (% 9.79) (% 25.63)       2000     loose
+#  2  1.47 - tetraodon tetNig2     (% 6.20) (% 23.36)       2000     loose
+#  3  1.84 - human hg19            (% 6.40) (%  3.05)       2000     loose
+#  4  2.04 - xenopus xenTro2       (% 6.64) (%  6.77)       2000     loose
+#  5  2.06 - mouse mm9             (% 5.12) (%  2.80)       2000     loose
+
+# None of this concern for distances matters in building the first step, the
+# maf files.
+
+    # fix missing symlinks
+    cd /hive/data/genomes/danRer6/bed/
+    ln -s lastzMm9.2009-12-17/ lastz.mm9
+    ln -s lastzXenTro2.2009-12-22/ lastz.xenTro2
+    ln -s blastz.hg19.swap/ lastz.hg19
+    ln -s blastz.tetNig2.swap/ lastz.tetNig2
+
+    cd /hive/data/genomes/danRer6/bed/multiz6way
+
+    #	bash shell syntax here ...
+    export H=/cluster/data/danRer6/bed
+    mkdir mafLinks
+    for G in gasAcu1 tetNig2 hg19 xenTro2 mm9
+    do
+	mkdir mafLinks/$G
+	if [ ! -d ${H}/lastz.${G}/mafNet ]; then
+	echo "missing directory lastz.${G}/mafNet"
+		exit 255
+	fi
+	ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G
+    done
+
+    # location for cluster run
+    #/hive/data/genomes/danRer6/bed/multiz6way/mafLinks/ 
+
+    mkdir penn
+    cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn
+    cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn
+    cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn
+
+    # create species list and stripped down tree for autoMZ
+    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
+	6way.nh > tmp.nh
+    echo `cat tmp.nh` > tree-commas.nh
+    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
+    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst
+
+
+    # this one is simple enough that it doesn't need a cluster run.
+
+    cat > autoMultiz << '_EOF_'
+#!/bin/csh -ef
+set db = danRer6
+set binDir = /hive/data/genomes/danRer6/bed/multiz6way/penn
+rm -fr tmp
+mkdir -p tmp
+cd tmp
+foreach s (`cat ../species.lst`)
+    if ($s != $db) then
+        zcat ../mafLinks/$s/$db.$s.net.maf > $db.$s.sing.maf
+    endif
+end
+set path = ($binDir $path); rehash
+$binDir/autoMZ + T=. E=$db "`cat ../tree.nh`" $db.*.sing.maf ../multiz6way.maf
+cd ..
+rm -fr $tmp
+'_EOF_'
+    # << happy emacs
+    chmod +x autoMultiz
+
+    screen
+    ./autoMultiz > & autoMultiz.log
+    # took 3 hours, log shows no errors.
+
+    # Load into database
+    ssh hgwdev
+    cd /hive/data/genomes/danRer6/bed/multiz6way
+    mkdir /gbdb/danRer6/multiz6way
+    ln -s /hive/data/genomes/danRer6/bed/multiz6way/multiz6way.maf \
+	/gbdb/danRer6/multiz6way
+    time nice hgLoadMaf danRer6 multiz6way
+    #	Loaded 2228353 mafs in 1 files from /gbdb/danRer6/multiz6way
+    #	43 sec
+
+    time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \
+	-maxSize=50000 danRer6 multiz6waySummary multiz6way.maf
+    #	Created 786724 summary blocks from 4850023 components 
+    #		and 2228353 mafs from multiz6way.maf
+    #	49 sec
+
+    wc -l multiz6way*.tab
+    #  2228353 multiz6way.tab
+    #   786724 multiz6waySummary.tab
+    #  3015077 total
+    rm multiz6way*.tab
+
+    #	Create tree image for details page
+    #	You can get a better image from the phyloGif tool:
+    #	http://genome.ucsc.edu/cgi-bin/phyloGif
+    cp 6way.nh treeImage.nh
+    #	Edit treeImage.nh to make the names as desired
+
+# (
+# (zebrafish,(tetraodon,stickleback)),
+# (X._tropicalis,(mouse,human))
+# );
+
+    #	Submit this treeImage.nh to the phyloGif program
+
+    # Organism Image
+    pushd $HOME/browser/images/phylo
+    wget -O danRer6_6way.gif \
+	'http://hgwdev.cse.ucsc.edu/cgi-bin/phyloGif?hgsid=1831836&phyloGif_width=200&phyloGif_height=320&boolshad.phyloGif_branchLengths=0&boolshad.phyloGif_lengthLegend=0&boolshad.phyloGif_branchLabels=0&phyloGif_branchDecimals=2&phyloGif_branchMultiplier=1&phyloGif_underscores=on&boolshad.phyloGif_underscores=0&boolshad.phyloGif_htmlPage=0&phyloGif_tree=(%0D%0A(zebrafish%2C(tetraodon%2Cstickleback))%2C%0D%0A(X._tropicalis%2C(mouse%2Chuman))%0D%0A)%3B&phyloGif_submit=submit'
+    #  check this .jpg file into the browser doc source tree images directory
+    cvs add -kb danRer6_6way.gif 
+    cvs commit -m 'organism tree for danRer6 6way' danRer6_6way.gif 
+    cp -p danRer6_6way.gif /usr/local/apache/htdocs/images/phylo/
+    popd
+
+
 ############################################################################
+# GAP ANNOTATE MULTIZ6WAY MAF AND LOAD TABLES (DONE - 2010-02-08 - Galt)
+
+    ssh hgwdev
+    mkdir /hive/data/genomes/danRer6/bed/multiz6way/anno
+    cd /hive/data/genomes/danRer6/bed/multiz6way/anno
+    screen
+    mafAddIRows ../multiz6way.maf /hive/data/genomes/danRer6/danRer6.2bit multiz6way.maf
+    # 5 mins
+
+    # Load into database
+    rm /gbdb/danRer6/multiz6way/multiz6way.maf   # remove old symlink
+    ln -s /hive/data/genomes/danRer6/bed/multiz6way/anno/multiz6way.maf \
+	/gbdb/danRer6/multiz6way
+    time nice hgLoadMaf danRer6 multiz6way
+
+    #	Loaded 2644222 mafs in 1 files from /gbdb/danRer6/multiz6way
+    #	2 mins 
+
+    time nice hgLoadMafSummary -minSize=10000 -mergeGap=500 \
+	-maxSize=50000 danRer6 multiz6waySummary multiz6way.maf
+    #	Created 786724 summary blocks from 4850023 components 
+    #   and 2644222 mafs from multiz6way.maf
+    #   1 mins
+
+    wc -l multiz6way*.tab
+    #  2644222 multiz6way.tab
+    #   786724 multiz6waySummary.tab
+    #  3430946 total
+    rm multiz6way*.tab
+
+
+#########################################################################
+# Adding automatic generation of upstream files (DONE - 2010-02-09 - Galt)
+    # edit src/hg/makeDb/genbank/etc/genbank.conf to add:
+danRer6.upstreamGeneTbl = refGene
+danRer6.upstreamMaf = multiz6way /hive/data/genomes/danRer6/bed/multiz6way/species.lst
+
+#########################################################################
+# MULTIZ6WAY DOWNLOADABLES (DONE - 2010-02-09 - Galt)
+
+    # create some downloads
+    mkdir -p /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
+    cd /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
+    time cp -p ../../anno/*.maf .
+    # 1 mins
+    time gzip --rsyncable *.maf
+    # 11 mins
+    time md5sum *.gz > md5sum.txt
+    # 1 mins
+
+    mkdir -p /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
+    cd /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
+    ln -s /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf/* .
+
+#############################################################################
+## create upstream refGene maf files
+    cd /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf
+    # bash script
+#!/bin/sh
+for S in 1000 2000 5000
+do
+    echo "making upstream${S}.maf"
+    featureBits danRer6 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
+        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
+        | /cluster/bin/$MACHTYPE/mafFrags danRer6 multiz6way \
+                stdin stdout \
+                -orgs=/hive/data/genomes/danRer6/bed/multiz6way/species.lst \
+        | gzip -c > upstream${S}.maf.gz
+    echo "done upstream${S}.maf.gz"
+done
+
+    cd /usr/local/apache/htdocs/goldenPath/danRer6/multiz6way/maf
+    ln -s /hive/data/genomes/danRer6/bed/multiz6way/downloads/maf/up*.gz .
+    md5sum up*.gz >> md5sum.txt
+
+
+#######################################################################
+# MULTIZ6WAY MAF FRAMES (DONE - 2010-02-16 - galt)
+    ssh hgwdev
+    mkdir /cluster/data/danRer6/bed/multiz6way/frames
+    cd /cluster/data/danRer6/bed/multiz6way/frames
+    mkdir genes
+
+#danRer6 refGene
+#gasAcu1 ensGene
+#tetNig2 ensGene
+#hg19    knownGene
+#xenTro2 refGene
+#mm9     knownGene
+    #------------------------------------------------------------------------
+    # get the genes for all genomes
+    # mRNAs with CDS.  single select to get cds+psl, then split that up and
+    # create genePred
+    # using refGene for danRer6 xenTro2
+    # using ensGene for gasAcu1 and tetNig2
+    # bash syntax
+    bash
+    for qDB in danRer6 xenTro2 gasAcu1 tetNig2
+    do
+      if [ $qDB = "gasAcu1" -o $qDB = "tetNig2" ]; then
+        geneTbl=ensGene
+      else
+	geneTbl=refGene
+      fi
+      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
+      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 2-11 \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/$qDB.tmp.gz
+      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
+      rm -f $tmpExt
+    done
+
+    # using knownGene for mm9 hg19
+    # genePreds; (must keep only the first 10 columns for knownGene)
+    for qDB in mm9 hg19
+    do
+        geneTbl=knownGene
+      echo hgsql -N -e \"'select * from '$geneTbl\;\" ${qDB}
+      hgsql -N -e "select * from $geneTbl" ${qDB} | cut -f 1-10 \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/$qDB.tmp.gz
+      mv /scratch/tmp/$qDB.tmp.gz genes/$qDB.gp.gz
+      rm -f $tmpExt
+    done
+
+    ###
+    exit # end bash shell
+    
+    ssh hgwdev
+    cd /cluster/data/danRer6/bed/multiz6way/frames
+    cat ../anno/multiz6way.maf | nice genePredToMafFrames danRer6 stdin stdout danRer6 genes/danRer6.gp.gz gasAcu1 genes/gasAcu1.gp.gz tetNig2 genes/tetNig2.gp.gz hg19 genes/hg19.gp.gz mm9 genes/mm9.gp.gz xenTro2 genes/xenTro2.gp.gz | gzip > multiz6way.mafFrames.gz
+
+    ssh hgwdev
+    cd /cluster/data/danRer6/bed/multiz6way/frames
+    nice hgLoadMafFrames danRer6 multiz6wayFrames multiz6way.mafFrames.gz
+
+