src/hg/makeDb/doc/mm9.txt 1.103

1.103 2009/07/31 04:17:56 hartera
Updated Vega Genes track to Build 35 from March 2009.
Index: src/hg/makeDb/doc/mm9.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/mm9.txt,v
retrieving revision 1.102
retrieving revision 1.103
diff -b -B -U 4 -r1.102 -r1.103
--- src/hg/makeDb/doc/mm9.txt	21 Jul 2009 21:01:44 -0000	1.102
+++ src/hg/makeDb/doc/mm9.txt	31 Jul 2009 04:17:56 -0000	1.103
@@ -9486,4 +9486,49 @@
    svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01
 
 see doc/builds.txt for specific details.
 ############################################################################
+# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-30, hartera)
+# The track needed updating because the existing version was Build 31 from May 2008.
+
+   # Download the VEGA genes for mouse from the ftp site
+   # This file is from 03/17/09.
+   wget --timestamping \
+        "ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
+   # Add chr in front of chromosome names and lift up the randoms.
+   #    This processing is similar to that used for Ensembl genes,
+   #    from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
+   cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
+   zcat gtf_file.gz \
+        | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
+        | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
+        | gzip > allGenes.gtf.gz
+   # Got 189 lifts in randoms.mm9.lift
+
+   gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
+	| gzip > mm9.allGenes.gp.gz
+   /cluster/home/hartera/kent/src/hg/utils/automation/extractGtf.pl \
+	infoOut.txt > ensGtp.tab
+   genePredCheck -db=mm9 mm9.allGenes.gp.gz
+   # checked: 59381 failed: 0
+   zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
+   zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
+   
+   # Modify both GTF files (the *pseudo.gtf glob matches pseudo.gtf and
+   # not.pseudo.gtf) so that the gene name goes into the name2 field
+   # of the genePred.
+   perl -pi.bak -e 's/gene_id/other_gene_id/' *pseudo.gtf
+   perl -pi.bak -e 's/gene_name/gene_id/' *pseudo.gtf
+   gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
+   gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
+
+   genePredCheck -db=mm9 pseudo.gp
+    # checked: 4305 failed: 0
+   genePredCheck -db=mm9 not.pseudo.gp
+    # checked: 55076 failed: 0
+
+   hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
+   hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
+
+   # clean up
+   rm *.bak
+
+############################################################################