src/hg/makeDb/doc/makeEnsembl.txt 1.17

1.17 2010/05/27 22:53:51 hiram
v58 done for hg19,mm9,rn4
Index: src/hg/makeDb/doc/makeEnsembl.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/makeEnsembl.txt,v
retrieving revision 1.16
retrieving revision 1.17
diff -b -B -U 4 -r1.16 -r1.17
--- src/hg/makeDb/doc/makeEnsembl.txt	7 Apr 2010 17:43:30 -0000	1.16
+++ src/hg/makeDb/doc/makeEnsembl.txt	27 May 2010 22:53:51 -0000	1.17
@@ -5,9 +5,124 @@
 #	Robert's experiments with an automated process.
 #
 
 ############################################################################
-# ensembl 57 update (WORKING - 2010-04-02 - Hiram)
+# ensembl 58 update (WORKING - 2010-05-27 - Hiram)
+
+############################################################################
+#  mm9 - Mouse - Ensembl Genes version 58  (DONE - 2010-05-27 - hiram)
+    cd /hive/data/genomes/mm9
+    cat << '_EOF_' > mm9.ensGene.ra
+# required db variable
+db mm9
+# optional liftRandoms yes/no or absent
+liftRandoms yes
+# optional nameTranslation, the sed command that will transform
+#       Ensembl names to UCSC names.  With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
+# optionally update the knownToEnsembl table after ensGene updated
+knownToEnsembl yes
+'_EOF_'
+#  << happy emacs
+
+    doEnsGeneUpdate.pl -ensVersion=58 mm9.ensGene.ra
+
+    cd /hive/data/genomes/mm9/bed/ensGene.58
+    #	ran into trouble with the ensGtp table load, the names of
+    #	the proteins have gotten longer and the standard sql definition
+    #	was no longer adequate.  So, after the load failed, increase
+    #	protein char and index size to 23:
+    cat << '_EOF_' > ensGtp.sql
+# This creates the table holding the relationship between
+# Ensembl genes, transcripts, and peptides.
+CREATE TABLE ensGtp (
+  gene char(20) NOT NULL,
+  transcript char(20) NOT NULL,
+  protein char(23) NOT NULL,
+# INDICES
+  INDEX(gene(19)),
+  UNIQUE(transcript(19)),
+  INDEX(protein(23))
+) 
+'_EOF_'
+    # << happy emacs
+
+    #	Then, running the rest of the load script, with this line fixed up:
+    #	hgLoadSqlTab mm9 ensGtp ensGtp.sql process/ensGtp.tab
+    ./finiLoad.csh > finiLoad.log 2>&1
+
+    cd /hive/data/genomes/mm9
+    doEnsGeneUpdate.pl -ensVersion=58 -verbose=2 -continue=cleanup \
+	mm9.ensGene.ra > ens.58.cleanup
+
+    featureBits mm9 ensGene
+    #	79466978 bases of 2620346127 (3.033%) in intersection
+
+############################################################################
+#  hg19 - Human - Ensembl Genes version 58  (DONE - 2010-05-27 - hiram)
+    ssh hgwdev
+    cd /hive/data/genomes/hg19
+    cat << '_EOF_' > hg19.ensGene.ra
+# required db variable
+db hg19
+# optional nameTranslation, the sed command that will transform
+#       Ensembl names to UCSC names.  With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/; s/^Un/chrUn/"
+# optionally update the knownToEnsembl table after ensGene updated
+knownToEnsembl yes
+# optional haplotype lift-down from Ensembl full chrom coordinates
+#       to UCSC simple haplotype coordinates
+haplotypeLift /hive/data/genomes/hg19/jkStuff/ensGene.haplotype.lift
+# changing names for the odd bits in Ensembl 58
+liftUp /hive/data/genomes/hg19/jkStuff/ens.57.lft
+'_EOF_'
+#  << happy emacs
+
+    doEnsGeneUpdate.pl  -ensVersion=58 hg19.ensGene.ra
+    #	New problems in v58: LRG genome sequence, see also:
+    #	http://www.lrg-sequence.org/
+    #	and as was in v57, four of their genes cross the boundaries of
+    #	the haplotypes into the chromosomes.
+    #	We can't manage these, so, fixup doProcess.csh:
+    cd /hive/data/genomes/hg19/bed/ensGene.58/process
+    gunzip hg19.allGenes.gp.gz
+    egrep -v "LRG_|ENST00000436611|ENST00000436232|ENST00000436870" \
+        hg19.allGenes.gp | gzip -c > hg19.allGenes.gp.gz
+    genePredCheck -db=hg19 hg19.allGenes.gp.gz
+    #	checked: 151222 failed: 0
+    mv hg19.allGenes.gp hg19.allGenes.gp.broken
+
+    cd /hive/data/genomes/hg19
+    #	and finish it off:
+    doEnsGeneUpdate.pl -ensVersion=58 -continue=load \
+	hg19.ensGene.ra > ens.58.load 2>&1
+    featureBits hg19 ensGene
+    # 104402177 bases of 2897316137 (3.603%) in intersection
+
+############################################################################
+#  rn4 - Rat - Ensembl Genes version 58  (DONE - 2010-05-27 - hiram)
+    ssh hgwdev
+    cd /hive/data/genomes/rn4
+    cat << '_EOF_' > rn4.ensGene.ra
+# required db variable
+db rn4
+# optional nameTranslation, the sed command that will transform
+#       Ensembl names to UCSC names.  With quotes just to make sure.
+nameTranslation "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/"
+# optionally update the knownToEnsembl table after ensGene updated
+knownToEnsembl yes
+'_EOF_'
+#  << happy emacs
+
+    doEnsGeneUpdate.pl  -ensVersion=58 rn4.ensGene.ra
+    ssh hgwdev
+    cd /hive/data/genomes/rn4/bed/ensGene.58
+    featureBits rn4 ensGene
+    # 46705616 bases of 2571531505 (1.816%) in intersection
+
+############################################################################
+############################################################################
+# ensembl 57 update (DONE - 2010-04-02 - Hiram)
 
 ############################################################################
 #  susScr1 - Pig - Ensembl Genes version 57  (DONE - 2010-04-05 - hiram)
     ssh hgwdev