src/hg/makeDb/doc/calJac3.txt 1.13
1.13 2010/05/19 22:27:18 chinhli
13-Way multiz gene frames
Index: src/hg/makeDb/doc/calJac3.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/calJac3.txt,v
retrieving revision 1.12
retrieving revision 1.13
diff -b -B -U 4 -r1.12 -r1.13
--- src/hg/makeDb/doc/calJac3.txt 18 May 2010 23:52:28 -0000 1.12
+++ src/hg/makeDb/doc/calJac3.txt 19 May 2010 22:27:18 -0000 1.13
@@ -1389,10 +1389,10 @@
# 16590205 total
rm multiz13way*.tab
# create some downloads
- mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/download/maf
- cd /hive/data/genomes/calJac3/bed/multiz13way/download/maf
+ mkdir -p /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf
+ cd /hive/data/genomes/calJac3/bed/multiz13way/downloads/maf
time cp -p ../../anno/maf/*.maf .
# real 37m48.902s
time gzip --rsyncable *.maf
# real 64m55.554s
@@ -1509,9 +1509,186 @@
find -name "*.html" | xargs grep 2007
# downloads
cd /usr/local/apache/htdocs/goldenPath/calJac3
-find -name "README.txt" | xargs grep 2007
+find -name "*" | xargs grep 2007
# hgcentral
hgsql hgcentraltest
update dbDb set description="March 2009 (WUGSC 3.2/calJac3)" where name="calJac3"
+
+
+############################################################################
+## Annotate 13-way multiple alignment with gene annotations
+## DONE - 2010-05-17 - Chin)
+ # Gene frames
+ ## survey all genomes to see what type of gene track to use
+ ssh hgwdev
+ mkdir /hive/data/genomes/calJac3/bed/multiz13way/frames
+ cd /hive/data/genomes/calJac3/bed/multiz13way/frames
+ #
+ # survey all the genomes to find out what kinds of gene tracks
+ # they have
+ cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+ echo -n "${db}: "
+ set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+ foreach table ($tables)
+ if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+ $table == "knownGene" || $table == "xenoRefGene" ) then
+ set count = `hgsql $db -N -e "select count(*) from $table"`
+ echo -n "${table}: ${count}, "
+ endif
+ end
+ set orgName = `hgsql hgcentraltest -N -e \
+ "select scientificName from dbDb where name='$db'"`
+ set orgId = `hgsql calJac3 -N -e \
+ "select id from organism where name='$orgName'"`
+ if ($orgId == "") then
+ echo "Mrnas: 0"
+ else
+ set count = `hgsql calJac3 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+ echo "Mrnas: ${count}"
+ endif
+end
+'_EOF_'
+ # << happy emacs
+ chmod +x ./showGenes.csh
+
+ ./showGenes.csh
+ # calJac3: xenoRefGene: 235146, Mrnas: 3722
+ # ponAbe2: ensGene: 30230, refGene: 3466, xenoRefGene: 224711, Mrnas: 0
+ # hg19: ensGene: 143123, knownGene: 77614, mgcGenes: 31354,
+ refGene: 35776, xenoRefGene: 126698, Mrnas: 287363
+ # panTro2: ensGene: 40215, refGene: 743, xenoRefGene: 217765, Mrnas: 1535
+ # gorGor2: xenoRefGene: 223719, Mrnas: 1
+ # rheMac2: ensGene: 41617, refGene: 939, xenoRefGene: 285046, Mrnas: 5227
+ # papHam1: xenoRefGene: 249401, Mrnas: 70
+ # tarSyr1: ensGene: 44646, Mrnas: 8
+ # micMur1: ensGene: 36501, Mrnas: 43
+ # otoGar1: ensGene: 35274, Mrnas: 0
+ # mm9: ensGene: 83124, knownGene: 54042, mgcGenes: 26582,
+ refGene: 27131, xenoRefGene: 122280, Mrnas: 258076
+ # canFam2: ensGene: 30299, refGene: 1039, xenoRefGene: 210336, Mrnas: 3211
+ # monDom5: ensGene: 34745, refGene: 392, xenoRefGene: 213342, Mrnas: 1605
+
+
+ # rearrange that output to create 3 sections:
+ #1. knownGenes: hg19 mm9
+ #2. ensGene: ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5
+ #3. xenoRefGene: calJac3 papHam1 gorGor2
+
+
+ mkdir genes
+ # knownGene
+ echo "hg19 mm9" \
+ | sed -e "s/ */ /g" > knownGene.list
+
+
+ for DB in hg19 mm9
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5" \
+ | sed -e "s/ */ /g" > ensGene.list
+
+ # ensGene
+ for DB in ponAbe2 panTro2 rheMac2 tarSyr1 micMur1 otoGar1 canFam2 monDom5
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "calJac3 papHam1 gorGor2" > xenoRef.list
+ # xenoRefGene
+ for DB in calJac3 papHam1 gorGor2
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ # turn it into a kluster job
+ ssh swarm
+ cd /hive/data/genomes/calJac3/bed/multiz13way/frames
+ cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../run/maf/${C}.maf | genePredToMafFrames calJac3 stdin stdout \
+ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ ls ../run/maf | sed -e "s/.maf//" > chr.list
+ ls genes | sed -e "s/.gp.gz//" | grep -v calJac3 > gene.list
+
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ mkdir parts
+ gensub2 chr.list gene.list template jobList
+ para -ram=8g create jobList
+ para try ... check ... push
+# Completed: 3072 of 3072 jobs
+# CPU time in finished jobs: 5599s 93.32m 1.56h 0.06d 0.000 y
+# IO & Wait Time: 279159s 4652.65m 77.54h 3.23d 0.009 y
+# Average job time: 93s 1.54m 0.03h 0.00d
+# Longest finished job: 1292s 21.53m 0.36h 0.01d
+# Submission to last job: 1380s 23.00m 0.38h 0.02d
+
+ # see what it looks like in terms of number of annotations per DB:
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+
+ cat annotation.survey.txt
+ # 159735 tarSyr1
+ # 202861 panTro2
+ # 203073 rheMac2
+ # 205358 ponAbe2
+ # 205679 hg19
+ # 206899 micMur1
+ # 211208 papHam1
+ # 211692 monDom5
+ # 213918 otoGar1
+ # 217594 gorGor2
+ # 233938 mm9
+ # 240194 canFam2
+
+ # load the resulting file
+ ssh hgwdev
+ cd /hive/data/genomes/calJac3/bed/multiz13way/frames
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n > multiz13wayFrames.bed
+
+ hgLoadMafFrames calJac3 multiz13wayFrames multiz13wayFrames.bed
+
+ featureBits -countGaps calJac3 multiz13wayFrames.bed
+ # 40132771 bases of 2914958544 (1.377%) in intersection
+
+ # enable the trackDb entries:
+# frames multiz13wayFrames
+# irows on
+ # appears to work OK
+