src/hg/makeDb/doc/makeEnsembl.txt 1.16
1.16 2010/04/07 17:43:30 hiram
finished with v57 ensGene update
Index: src/hg/makeDb/doc/makeEnsembl.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/makeEnsembl.txt,v
retrieving revision 1.15
retrieving revision 1.16
diff -b -B -U 4 -r1.15 -r1.16
--- src/hg/makeDb/doc/makeEnsembl.txt 5 Apr 2010 20:17:13 -0000 1.15
+++ src/hg/makeDb/doc/makeEnsembl.txt 7 Apr 2010 17:43:30 -0000 1.16
@@ -8,8 +8,111 @@
############################################################################
# ensembl 57 update (WORKING - 2010-04-02 - Hiram)
############################################################################
+# susScr1 - Pig - Ensembl Genes version 57 (DONE - 2010-04-05 - hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/susScr1
+ cat << '_EOF_' > susScr1.ensGene.ra
+# required db variable
+db susScr1
+# optional nameTranslation, the sed command that will transform
+# Ensembl names to UCSC names. With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/;"
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 susScr1.ensGene.ra
+ ssh hgwdev
+ cd /hive/data/genomes/susScr1/bed/ensGene.57
+ featureBits susScr1 ensGene
+ # 28707614 bases of 2231332019 (1.287%) in intersection
+
+############################################################################
+# susScr2 - Pig - lifted susScr1 v57 genes to susScr2 (DONE - 2010-04-06 - Hiram)
+ mkdir /hive/data/genomes/susScr2/bed/ensGene.57
+ cd /hive/data/genomes/susScr2/bed/ensGene.57
+ ln -s ../../../susScr1/bed/ensGene.57/process/susScr1.allGenes.gp.gz .
+ zcat susScr1.allGenes.gp.gz > susScr1.allGenes.genePred
+ ln -s ../../../susScr1/bed/liftOver/susScr1ToSusScr2.over.chain.gz
+ zcat susScr1ToSusScr2.over.chain.gz > susScr1ToSusScr2.over.chain
+ liftOver -genePred susScr1.allGenes.genePred \
+ susScr1ToSusScr2.over.chain \
+ susScr2.allGenes.gp susScr1.liftOver.unMapped.txt
+ hgLoadGenePred -genePredExt susScr2 \
+ ensGene susScr2.allGenes.gp.gz >& loadGenePred.errors.txt
+ zcat \
+../../../susScr1/bed/ensGene.57/download/Sus_scrofa.Sscrofa9.57.pep.all.fa.gz \
+ | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//;' \
+ | gzip > ensPep.txt.gz
+ zcat ensPep.txt.gz \
+ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
+ | sed -e '/^$/d; s/*$//' | sort > ensPep.susScr2.fa.tab
+ hgPepPred susScr2 tab ensPep ensPep.susScr2.fa.tab
+ ln -s ../../../susScr1/bed/ensGene.57/process/ensGtp.tab .
+ hgLoadSqlTab susScr2 ensGtp ~/kent/src/hg/lib/ensGtp.sql ensGtp.tab
+ hgsql -e 'INSERT INTO trackVersion \
+ (db, name, who, version, updateTime, comment, source, dateReference) \
+ VALUES("susScr2", "ensGene", "hiram", "57", now(), \
+ "with peptides Sus_scrofa.Sscrofa9.57.pep.all.fa.gz", \
+ "lifted susScr1 to susScr2 ftp://ftp.ensembl.org/pub/release-57/gtf/sus_scrofa/Sus_scrofa.Sscrofa9.57.gtf.gz", \
+ "mar2010" );' hgFixed
+
+############################################################################
+# loxAfr3 - Elephant - Ensembl Genes version 57 (DONE - 2010-04-01 - hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/loxAfr3
+ cat << '_EOF_' > loxAfr3.ensGene.ra
+# required db variable
+db loxAfr3
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 loxAfr3.ensGene.ra
+ ssh hgwdev
+ cd /hive/data/genomes/loxAfr3/bed/ensGene.57
+ featureBits loxAfr3 ensGene
+ # 32110794 bases of 3118565340 (1.030%) in intersection
+
+############################################################################
+# gasAcu1 - Stickleback - Ensembl Genes version 57 (DONE - 2010-04-06 - hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/gasAcu1
+ cat << '_EOF_' > gasAcu1.ensGene.ra
+# required db variable
+db gasAcu1
+# optional nameTranslation, the sed command that will transform
+# Ensembl names to UCSC names. With quotes just to make sure.
+nameTranslation "s/^group\([IUVX]\)/chr\1/; s/^MT/chrM/;"
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 gasAcu1.ensGene.ra
+
+ # requires extra attention after the all-database for-loop attempt
+ cd /hive/data/genomes/gasAcu1/bed/ensGene.57/process
+ mv gasAcu1.allGenes.gp.gz gasAcu1.allGenes.gp.beforeLift.gz
+ zcat gasAcu1.allGenes.gp.beforeLift.gz \
+ | liftUp -extGenePred -type=.gp gasAcu1.scaffolds.gp \
+ ../../../jkStuff/contigsToScaffolds.lft carry stdin
+ liftUp -extGenePred gasAcu1.allGenes.gp \
+ ../../../jkStuff/UCSC.chromToScaffoldSansGaps.lft carry \
+ gasAcu1.scaffolds.gp
+ gzip gasAcu1.scaffolds.gp
+ gzip gasAcu1.allGenes.gp
+ # verify OK
+ genePredCheck -db=gasAcu1 gasAcu1.allGenes.gp.gz
+ # checked: 29109 failed: 0
+
+ # then continue with the load
+ cd /hive/data/genomes/gasAcu1
+ doEnsGeneUpdate.pl -continue=load -ensVersion=57 gasAcu1.ensGene.ra \
+ > ens.57.load 2>&1
+
+ featureBits gasAcu1 ensGene
+ # 36789271 bases of 446627861 (8.237%) in intersection
+
+############################################################################
# anoCar1 - Lizard - Ensembl Genes version 57 (DONE - 2010-04-05 - hiram)
ssh hgwdev
cd /hive/data/genomes/anoCar1
cat << '_EOF_' > anoCar1.ensGene.ra
@@ -429,8 +532,37 @@
featureBits xenTro2 ensGene
# 29158032 bases of 1359412157 (2.145%) in intersection
############################################################################
+# bosTau4 was broken - finished manually (DONE - 2010-04-05 - Hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/bosTau4
+ cat << '_EOF_' > bosTau4.ensGene.ra
+# required db variable
+db bosTau4
+# optional nameTranslation, the sed command that will transform
+# Ensembl names to UCSC names. With quotes just to make sure.
+nameTranslation "s/^\([0-9UX][0-9n]*\)/chr\1/; s/^MT/chrM/"
+# cause SQL tables to be fetched to see if chrUn can be fixed up
+# geneScaffolds yes
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 bosTau4.ensGene.ra
+ # broken during processing, fix doProcess.csh to eliminate AAFC03011182
+ zcat ../download/Bos_taurus.Btau_4.0.57.gtf.gz \
+ | sed -e "s/^\([0-9UX][0-9n]*\)/chr\1/; s/^MT/chrM/" \
+ | grep -v AAFC03011182 | gzip > allGenes.gtf.gz
+ ssh hgwdev
+ cd /hive/data/genomes/bosTau4
+ doEnsGeneUpdate.pl -ensVersion=57 -continue=load bosTau4.ensGene.ra \
+ > ens.57.load
+ cd /hive/data/genomes/bosTau4/bed/ensGene.57
+ featureBits bosTau4 ensGene
+ # 42207115 bases of 2731830700 (1.545%) in intersection
+
+
+############################################################################
# oryCun2 - Rabbit - Ensembl Genes version 57 (DONE - 2010-04-02 - hiram)
ssh hgwdev
cd /hive/data/genomes/oryCun2
cat << '_EOF_' > oryCun2.ensGene.ra
@@ -447,8 +579,120 @@
featureBits oryCun2 ensGene
# 31748363 bases of 2604023284 (1.219%) in intersection
############################################################################
+# felCat3 - Cat - Ensembl Genes version 57 (DONE - 2010-04-05 - hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/felCat3
+ cat << '_EOF_' > felCat3.ensGene.ra
+# required db variable
+db felCat3
+# do we need to translate geneScaffold coordinates
+geneScaffolds yes
+# ignore genes that do not properly convert to a gene pred, and contig
+# names that are not in the UCSC assembly
+skipInvalid yes
+# ignore the three genes that have invalid structures from Ensembl:
+# 2100: ENSFCAT00000006929 no exonFrame on CDS exon 16
+# 14578: ENSFCAT00000010965 no exonFrame on CDS exon 1
+# 26634: ENSFCAT00000009384 no exonFrame on CDS exon 0
+
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 felCat3.ensGene.ra
+ ssh hgwdev
+ cd /hive/data/genomes/felCat3/bed/ensGene.57
+ featureBits felCat3 ensGene
+ # 22220711 bases of 1642698377 (1.353%) in intersection
+
+############################################################################
+# mm9 - Mouse - Ensembl Genes version 57 (DONE - 2010-04-06 - hiram)
+ cd /hive/data/genomes/mm9
+ cat << '_EOF_' > mm9.ensGene.ra
+# required db variable
+db mm9
+# optional liftRandoms yes/no or absent
+liftRandoms yes
+# optional nameTranslation, the sed command that will transform
+# Ensembl names to UCSC names. With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
+# optionally update the knownToEnsembl table after ensGene updated
+knownToEnsembl yes
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 mm9.ensGene.ra
+
+ cd /hive/data/genomes/mm9/bed/ensGene.57
+ # ran into trouble with the ensGtp table load, the names of
+ # the proteins have gotten longer and the standard sql definition
+ # was no longer adequate. So, after the load failed, increase
+ # protein char and index size to 23:
+ cat << '_EOF_' > ensGtp.sql
+# This creates the table holding the relationship between
+# ensemble genes, transcripts, and peptides.
+CREATE TABLE ensGtp (
+ gene char(20) NOT NULL,
+ transcript char(20) NOT NULL,
+ protein char(23) NOT NULL,
+# INDICES
+ INDEX(gene(19)),
+ UNIQUE(transcript(19)),
+ INDEX(protein(23))
+)
+'_EOF_'
+ # << happy emacs
+
+ # Then, running the rest of the load script, with this line fixed up:
+ # hgLoadSqlTab mm9 ensGtp ensGtp.sql process/ensGtp.tab
+ ./finiLoad.csh > finiLoad.log 2>&1
+
+ cd /hive/data/genomes/mm9
+ doEnsGeneUpdate.pl -ensVersion=57 -verbose=2 -continue=cleanup \
+ mm9.ensGene.ra > ens.57.cleanup
+
+ featureBits mm9 ensGene
+ # 79248889 bases of 2620346127 (3.024%) in intersection
+
+############################################################################
+# hg19 - Human - Ensembl Genes version 57 (DONE - 2010-04-06 - hiram)
+ ssh hgwdev
+ cd /hive/data/genomes/hg19
+ cat << '_EOF_' > hg19.ensGene.ra
+# required db variable
+db hg19
+# optional nameTranslation, the sed command that will transform
+# Ensembl names to UCSC names. With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
+# optionally update the knownToEnsembl table after ensGene updated
+knownToEnsembl yes
+# optional haplotype lift-down from Ensembl full chrom coordinates
+# to UCSC simple haplotype coordinates
+haplotypeLift /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift
+# changing names for the odd bits in Ensembl 57
+liftUp /hive/data/genomes/hg19/jkStuff/ens.57.lft
+'_EOF_'
+# << happy emacs
+
+ doEnsGeneUpdate.pl -ensVersion=57 hg19.ensGene.ra
+ # four of their genes cross the boundaries of the haplotypes into
+ # the chromosomes. We can't do that, so, fixup doProcess.csh:
+ cd /hive/data/genomes/hg19/bed/ensGene.57/process
+ gunzip hg19.allGenes.gp.gz
+ egrep -v "ENST00000436611|ENST00000383191|ENST00000436232|ENST00000436870" \
+ hg19.allGenes.gp | gzip -c > hg19.allGenes.gp.gz
+ genePredCheck -db=hg19 hg19.allGenes.gp.gz
+ # checked: 143123 failed: 0
+
+ cd /hive/data/genomes/hg19
+ # and finish it off:
+ doEnsGeneUpdate.pl -ensVersion=57 -continue=load \
+ hg19.ensGene.ra > ens.57.load 2>&1
+ featureBits hg19 ensGene
+ # 101913378 bases of 2897316137 (3.518%) in intersection
+
+############################################################################
# ensembl 56 update (DONE - 2009-10-27 - Hiram) only did Rat as a one-off
############################################################################
# rn4 - Rat - Ensembl Genes version 56 (DONE - 2009-10-27 - hiram)