src/hg/makeDb/doc/felCatV17e.txt 1.14
1.14 2010/05/13 21:38:33 chinhli
Conservation track
Index: src/hg/makeDb/doc/felCatV17e.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/felCatV17e.txt,v
retrieving revision 1.13
retrieving revision 1.14
diff -b -B -U 4 -r1.13 -r1.14
--- src/hg/makeDb/doc/felCatV17e.txt 7 May 2010 17:20:37 -0000 1.13
+++ src/hg/makeDb/doc/felCatV17e.txt 13 May 2010 21:38:33 -0000 1.14
@@ -981,9 +981,8 @@
grep -h -v "^#" ${F} >> felCatV17e.6way.maf
done
tail -q -n 1 maf/000.maf >> felCatV17e.6way.maf
- # real 13m32.340s
# load tables for a look
mkdir -p /gbdb/felCatV17e/multiz6way/maf
# cd /hive/data/genomes/felCatV17e/bed/multiz6way/maf
@@ -1297,4 +1295,270 @@
featureBits felCatV17e blastHg18KG xenoRefGene -enrichment
# blastHg18KG 1.197%, xenoRefGene 2.066%, both 1.010%, cover 84.36%, enrich 40.83x
rm -rf blastOut
#end tblastn
+
+
+############################################################################
+## Annotate 6-way multiple alignment with gene annotations
+## (working 2010-05-07 - Chin)
+ # Gene frames
+ ## survey all genomes to see what type of gene track to use
+ ssh hgwdev
+ mkdir /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+ cd /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+ #
+ # survey all the genomes to find out what kinds of gene tracks
+ # they have
+ cat << '_EOF_' > showGenes.csh
+#!/bin/csh -fe
+foreach db (`cat ../species.list`)
+ echo -n "${db}: "
+ set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
+ foreach table ($tables)
+ if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
+ $table == "knownGene" || $table == "xenoRefGene" ) then
+ set count = `hgsql $db -N -e "select count(*) from $table"`
+ echo -n "${table}: ${count}, "
+ endif
+ end
+ set orgName = `hgsql hgcentraltest -N -e \
+ "select scientificName from dbDb where name='$db'"`
+ set orgId = `hgsql felCatV17e -N -e \
+ "select id from organism where name='$orgName'"`
+ if ($orgId == "") then
+ echo "Mrnas: 0"
+ else
+ set count = `hgsql felCatV17e -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
+ echo "Mrnas: ${count}"
+ endif
+end
+'_EOF_'
+ # << happy emacs
+ chmod +x ./showGenes.csh
+ # rearrange that output to create four sections:
+ # 1. knownGenes for felCatV17e, mm9, rn4
+ # 2. ensGene for almost everything else
+ # 3. xenoRefGene for calJac1, petMar1, loxAfr3, papHam1, macEug1,
+ # oryCun2
+
+ mkdir genes
+ # knownGene
+ for DB in felCatV17e mm9 rn4
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6" \
+ | sed -e "s/ */ /g" > ensGene.list
+
+
+do
+ # ensGene
+ for DB in panTro2 gorGor1 ponAbe2 rheMac2 tarSyr1 micMur1 otoGar1 \
+ tupBel1 dipOrd1 cavPor3 speTri1 ochPri2 vicPac1 turTru1 \
+ bosTau4 equCab2 felCat3 canFam2 myoLuc1 pteVam1 eriEur1 sorAra1 \
+ proCap1 echTel1 dasNov2 choHof1 monDom5 ornAna1 galGal3 \
+ taeGut1 anoCar1 xenTro2 tetNig2 fr2 gasAcu1 oryLat2 danRer6
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ echo "calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2" > xenoRef.list
+
+ # xenoRefGene
+ for DB in calJac1 petMar1 loxAfr3 papHam1 macEug1 oryCun2
+do
+ hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \
+ | genePredSingleCover stdin stdout | gzip -2c \
+ > /scratch/tmp/${DB}.tmp.gz
+ mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
+ echo "${DB} done"
+done
+
+ # the following single command doesn't work on any 32 Gb computer,
+ # requires much more memory, turn it into a kluster job, see below
+ # ...
+
+ # Create this command with this script:
+ cat << '_EOF_' > mkCmd.sh
+#!/bin/sh
+
+echo "time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \\"
+for G in mm9 rn4
+do
+ if [ ! -s genes/${G}.gp.gz ]; then
+ echo "missing genes/${G}.gp.gz"
+ exit 255
+ fi
+ echo -n "${G} genes/${G}.gp.gz "
+done
+echo "\\"
+for D in `sort xenoRef.list`
+do
+ if [ ! -s genes/${D}.gp.gz ]; then
+ echo "missing genes/${D}.gp.gz"
+ exit 255
+ fi
+ echo -n "${D} genes/${D}.gp.gz "
+done
+echo "\\"
+echo " | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1"
+'_EOF_'
+ # << happy emacs
+ chmod +x ./mkCmd.sh
+
+ # this doesn't work on any 32 Gb computer, requires much more
+ # memory
+ # turn it into a kluster job, see below
+ time (cat ../maf/*.maf | nice -n +19 genePredToMafFrames felCatV17e stdin stdout \
+mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz \
+panTro2 genes/panTro2.gp.gz gorGor1 genes/gorGor1.gp.gz ponAbe2 genes/ponAbe2.gp.gz rheMac2 genes/rheMac2.gp.gz tarSyr1 genes/tarSyr1.gp.gz micMur1 genes/micMur1.gp.gz otoGar1 genes/otoGar1.gp.gz tupBel1 genes/tupBel1.gp.gz dipOrd1 genes/dipOrd1.gp.gz cavPor3 genes/cavPor3.gp.gz speTri1 genes/speTri1.gp.gz ochPri2 genes/ochPri2.gp.gz vicPac1 genes/vicPac1.gp.gz turTru1 genes/turTru1.gp.gz bosTau4 genes/bosTau4.gp.gz equCab2 genes/equCab2.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz myoLuc1 genes/myoLuc1.gp.gz pteVam1 genes/pteVam1.gp.gz eriEur1 genes/eriEur1.gp.gz sorAra1 genes/sorAra1.gp.gz proCap1 genes/proCap1.gp.gz echTel1 genes/echTel1.gp.gz dasNov2 genes/dasNov2.gp.gz choHof1 genes/choHof1.gp.gz monDom5 genes/monDom5.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz taeGut1 genes/taeGut1.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz tetNig2 genes/tetNig2.gp.gz fr2 genes/fr2.gp.gz gasAcu1 genes/gasAcu1.gp.gz oryLat2 genes/oryLat2.gp.gz danRer6 genes/danRer6.gp.gz \
+calJac1 genes/calJac1.gp.gz petMar1 genes/petMar1.gp.gz loxAfr3 genes/loxAfr3.gp.gz papHam1 genes/papHam1.gp.gz macEug1 genes/macEug1.gp.gz oryCun2 genes/oryCun2.gp.gz \
+ | gzip > multiz6way.mafFrames.gz) > frames.log 2>&1
+
+ # that doesn't work on any 32 Gb computer, requires much more
+ # memory
+ # turn it into a kluster job
+ ssh swarm
+ cd /hive/data/genomes/felCatV17e/bed/multiz6way/frames
+ cat << '_EOF_' > runOne
+#!/bin/csh -fe
+
+set C = $1
+set G = $2
+
+cat ../maf/${C}.maf | genePredToMafFrames felCatV17e stdin stdout \
+ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+'_EOF_'
+ # << happy emacs
+ chmod +x runOne
+
+ ls ../maf | sed -e "s/.maf//" > chr.list
+ ls genes | sed -e "s/.gp.gz//" | grep -v felCatV17e > gene.list
+
+ cat << '_EOF_' > template
+#LOOP
+runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz}
+#ENDLOOP
+'_EOF_'
+ # << happy emacs
+
+ mkdir parts
+ gensub2 chr.list gene.list template jobList
+ para -ram=8g create jobList
+ para try ... check ... push
+# Completed: 4185 of 4185 jobs
+# CPU time in finished jobs: 72491s 1208.19m 20.14h 0.84d
+# 0.002 y
+# IO & Wait Time: 1462162s 24369.36m 406.16h 16.92d
+# 0.046 y
+# Average job time: 367s 6.11m 0.10h 0.00d
+# Longest finished job: 3165s 52.75m 0.88h 0.04d
+# Submission to last job: 6364s 106.07m 1.77h 0.07d
+
+ # see what it looks like in terms of number of annotations per DB:
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | cut -f4 | sort | uniq -c | sort -n > annotation.survey.txt
+ 79191 rn4
+ 108287 petMar1
+ 139581 gorGor1
+ 140487 taeGut1
+ 143058 choHof1
+ 143233 vicPac1
+ 150073 anoCar1
+ 154462 tarSyr1
+ 163930 sorAra1
+ 164575 galGal3
+ 171191 macEug1
+ 174221 felCat3
+ 175831 dasNov2
+ 177622 ornAna1
+ 190729 eriEur1
+ 192285 tupBel1
+ 198052 speTri1
+ 199639 micMur1
+ 201731 papHam1
+ 201961 panTro2
+ 206170 oryCun2
+ 209327 ponAbe2
+ 209504 otoGar1
+ 210860 rheMac2
+ 212533 proCap1
+ 212848 myoLuc1
+ 213146 dipOrd1
+ 213479 calJac1
+ 215995 echTel1
+ 220341 ochPri2
+ 225132 loxAfr3
+ 226689 turTru1
+ 230903 monDom5
+ 232025 pteVam1
+ 232831 equCab2
+ 236945 cavPor3
+ 238167 bosTau4
+ 239857 mm9
+ 255727 canFam2
+ 316850 xenTro2
+ 359507 danRer6
+ 375156 oryLat2
+ 390076 fr2
+ 426532 gasAcu1
+ 434619 tetNig2
+
+ # load the resulting file
+ ssh hgwdev
+ cd /cluster/data/felCatV17e/bed/multiz6way/frames
+ find ./parts -type f | while read F
+do
+ zcat ${F}
+done | sort -k1,1 -k2,2n > multiz6wayFrames.bed
+
+ hgLoadMafFrames felCatV17e multiz6wayFrames stdin
+
+ featureBits -countGaps felCatV17e multiz6wayFrames.bed
+ # 57146632 bases of 3137161264 (1.822%) in intersection
+
+ # enable the trackDb entries:
+# frames multiz6wayFrames
+# irows on
+ # appears to work OK
+
+#############################################################################
+## create upstream refGene maf files
+ cd /hive/data/genomes/felCatV17e/bed/multiz6way/downloads/maf
+ # bash script
+#!/bin/sh
+for S in 1000 2000 5000
+do
+ echo "making upstream${S}.maf"
+ featureBits felCatV17e refGene:upstream:${S} -fa=/dev/null -bed=stdout \
+ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
+ | /cluster/bin/$MACHTYPE/mafFrags felCatV17e multiz6way \
+ stdin stdout \
+ -orgs=/hive/data/genomes/felCatV17e/bed/multiz6way/species.list \
+ | gzip -c > upstream${S}.maf.gz
+ echo "done upstream${S}.maf.gz"
+done
+
+ cd /usr/local/apache/htdocs/goldenPath/felCatV17e/multiz6way/maf
+ ln -s /hive/data/genomes/felCatV17e/bed/multiz6way/downloads/maf/up*.gz .
+ md5sum up*.gz >> md5sum.txt
+
+
+
+
+####################################################################