src/hg/makeDb/doc/felCatV17e.txt 1.14

1.14 2010/05/13 21:38:33 chinhli
Conservation track
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 4 -r1.13 -r1.14
--- src/hg/makeDb/doc/felCatV17e.txt	7 May 2010 17:20:37 -0000	1.13
+++ src/hg/makeDb/doc/felCatV17e.txt	13 May 2010 21:38:33 -0000	1.14
@@ -981,9 +981,8 @@
     grep -h -v "^#" ${F} >> felCatV17e.6way.maf
 done
     tail -q -n 1 maf/000.maf >> felCatV17e.6way.maf
 
-    #   real    13m32.340s
 
     # load tables for a look
     mkdir -p /gbdb/felCatV17e/multiz6way/maf
     # cd /hive/data/genomes/felCatV17e/bed/multiz6way/maf
@@ -1297,4 +1295,270 @@
     featureBits felCatV17e blastHg18KG xenoRefGene  -enrichment
 # blastHg18KG 1.197%, xenoRefGene 2.066%, both 1.010%, cover 84.36%, enrich 40.83x
     rm -rf blastOut
 #end tblastn
+
+
+############################################################################
+## Annotate 6-way multiple alignment with gene annotations
+##              (working 2010-05-07 - Chin)
+    # Gene frames
+    ## survey all genomes to see what type of gene track to use
+    ssh hgwdev
+    mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+    #
+    #   survey all the genomes to find out what kinds of gene tracks
+    #   they have
+    cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+    echo -n "${db}: "
+    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+    foreach table ($tables) 
+        if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+            $table == "knownGene" || $table == "xenoRefGene" ) then
+                set count = `hgsql $db -N -e "select count(*) from $table"`
+                echo -n "${table}: ${count}, "
+        endif
+    end
+    set orgName = `hgsql hgcentraltest -N -e \
+            "select scientificName from dbDb where name='$db'"`
+    set orgId = `hgsql felCatV17e -N -e \
+            "select id from organism where name='$orgName'"`
+    if ($orgId == "") then
+        echo "Mrnas: 0"
+    else
+        set count = `hgsql felCatV17e -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+        echo "Mrnas: ${count}"
+    endif
+end
+'_EOF_'
+    # << happy emacs
+    chmod +x ./showGenes.csh
+    #   rearrange that output to create four sections:
+    #   1. knownGenes for felCatV17e, mm9, rn4
+    #   2. ensGene for almost everything else
+    #   3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1,
+    #   oryCun2
+
+    mkdir genes
+    # knownGene
+    for DB in felCatV17e mm9 rn4
+do
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+        tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+        bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+        proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+        taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
+    | sed -e "s/  */ /g" > ensGene.list
+
+
+do  
+    # ensGene 
+    for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+        tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+        bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+        proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+        taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
+do  
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
+
+    # xenoRefGene
+    for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
+do  
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > /scratch/tmp/${DB}.tmp.gz
+    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+    echo "${DB} done"
+done
+
+    #   the following single command doesn't work on any 32 Gb computer,
+    #   requires much more memory, turn it into a kluster job, see below
+    #   ...
+
+    #   Create this command with this script:
+    cat << '_EOF_' > mkCmd.sh
+#!/bin/sh
+
+echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \\"
+for G in mm9 rn4
+do  
+    if [ ! -s genes/${G}.gp.gz ]; then
+        echo "missing genes/${G}.gp.gz"
+        exit 255
+    fi
+    echo -n "${G} genes/${G}.gp.gz "
+done 
+echo "\\"
+for D in `sort xenoRef.list`
+do
+    if [ ! -s genes/${D}.gp.gz ]; then
+        echo "missing genes/${D}.gp.gz"
+        exit 255
+    fi
+    echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+echo "    | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1"
+'_EOF_'
+    # << happy emacs
+    chmod +x ./mkCmd.sh
+
+    #   this doesn't work on any 32 Gb computer, requires much more
+    #   memory
+    #   turn it into a kluster job, see below
+    time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \
+mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
+panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
+calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
+    | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
+
+    #   that doesn't work on any 32 Gb computer, requires much more
+    #   memory
+    #   turn it into a kluster job
+    ssh swarm
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+    cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../maf/${C}.maf | genePredToMafFrames felCatV17e stdin stdout \
+        ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+    # << happy emacs
+    chmod +x runOne
+
+    ls ../maf | sed -e "s/.maf//" > chr.list
+    ls genes | sed -e "s/.gp.gz//" | grep -v felCatV17e > gene.list
+
+    cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+    # << happy emacs
+
+    mkdir parts
+    gensub2 chr.list gene.list template jobList
+    para -ram=8g create jobList
+    para try ... check ... push
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs:      72491s    1208.19m    20.14h    0.84d
+# 0.002 y
+# IO & Wait Time:               1462162s   24369.36m   406.16h   16.92d
+# 0.046 y
+# Average job time:                 367s       6.11m     0.10h    0.00d
+# Longest finished job:            3165s      52.75m     0.88h    0.04d
+# Submission to last job:          6364s     106.07m     1.77h    0.07d
+
+    # see what it looks like in terms of number of annotations per DB:
+    find ./parts -type f | while read F
+do
+    zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+  79191 rn4
+ 108287 petMar1
+ 139581 gorGor1
+ 140487 taeGut1
+ 143058 choHof1
+ 143233 vicPac1
+ 150073 anoCar1
+ 154462 tarSyr1
+ 163930 sorAra1
+ 164575 galGal3
+ 171191 macEug1
+ 174221 felCat3
+ 175831 dasNov2
+ 177622 ornAna1
+ 190729 eriEur1
+ 192285 tupBel1
+ 198052 speTri1
+ 199639 micMur1
+ 201731 papHam1
+ 201961 panTro2
+ 206170 oryCun2
+ 209327 ponAbe2
+ 209504 otoGar1
+ 210860 rheMac2
+ 212533 proCap1
+ 212848 myoLuc1
+ 213146 dipOrd1
+ 213479 calJac1
+ 215995 echTel1
+ 220341 ochPri2
+ 225132 loxAfr3
+ 226689 turTru1
+ 230903 monDom5
+ 232025 pteVam1
+ 232831 equCab2
+ 236945 cavPor3
+ 238167 bosTau4
+ 239857 mm9
+ 255727 canFam2
+ 316850 xenTro2
+ 359507 danRer6
+ 375156 oryLat2
+ 390076 fr2
+ 426532 gasAcu1
+ 434619 tetNig2
+
+    #   load the resulting file
+    ssh hgwdev
+    cd /cluster/data/felCatV17e/bed/multiz6way/frames
+    find ./parts -type f | while read F
+do
+    zcat ${F}
+done | sort -k1,1 -k2,2n > multiz6wayFrames.bed
+
+    hgLoadMafFrames felCatV17e multiz6wayFrames stdin
+
+    featureBits -countGaps felCatV17e multiz6wayFrames.bed
+    #   57146632 bases of 3137161264 (1.822%) in intersection
+
+    #   enable the trackDb entries:
+# frames multiz6wayFrames
+# irows on
+    #   appears to work OK
+
+#############################################################################
+## create upstream refGene maf files
+    cd /hive/data/genomes/felCatV17e/bed/multiz6way/downloads/maf
+    # bash script
+#!/bin/sh
+for S in 1000 2000 5000
+do
+    echo "making upstream${S}.maf"
+    featureBits felCatV17e refGene:upstream:${S} -fa=/dev/null -bed=stdout \
+        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
+        | /cluster/bin/$MACHTYPE/mafFrags felCatV17e multiz6way \
+                stdin stdout \
+                -orgs=/hive/data/genomes/felCatV17e/bed/multiz6way/species.list \
+        | gzip -c > upstream${S}.maf.gz
+    echo "done upstream${S}.maf.gz"
+done
+
+    cd /usr/local/apache/htdocs/goldenPath/felCatV17e/multiz6way/maf
+    ln -s /hive/data/genomes/felCatV17e/bed/multiz6way/downloads/maf/up*.gz .
+    md5sum up*.gz >> md5sum.txt
+
+
+
+
+####################################################################