src/hg/makeDb/doc/hg19.txt 1.98

1.98 2010/04/09 21:46:01 hiram
working on H-Inv 7.0 and new Ensembl name mapping table
Index: src/hg/makeDb/doc/hg19.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg19.txt,v
retrieving revision 1.97
retrieving revision 1.98
diff -b -B -U 4 -r1.97 -r1.98
--- src/hg/makeDb/doc/hg19.txt	9 Apr 2010 15:24:24 -0000	1.97
+++ src/hg/makeDb/doc/hg19.txt	9 Apr 2010 21:46:01 -0000	1.98
@@ -2034,8 +2034,9 @@
     #	16571
 
 ##############################################################################
 # UCSC to Ensembl chr name mapping (DONE - 2009-05-08 - Hiram)
+    #	new names as of Ensembl version 57, see "Updating the ucscToEnsembl table" below
     mkdir /hive/data/genomes/hg19/ensembl
     cd /hive/data/genomes/hg19/ensembl
     wget --timestamping \
 	'ftp://ftp.ensembl.org/pub/pre/homo_sapiens/GRCh37/dna/*'
@@ -8979,4 +8980,72 @@
     featureBits hg19 vegaPseudoGene
     # 8494715 bases of 2897316137 (0.293%) in intersection
 
 ######################################################################## 
+# H-Inv 7.0 Gene track (DONE - 2010-04-07 - Hiram)
+    mkdir /hive/data/genomes/hg19/bed/hinv
+    cd /hive/data/genomes/hg19/bed/hinv
+    ./hinvToBed12.pl go > broken.1.exons.txt  # NOTE(review): origin of hinvToBed12.pl and its input is not recorded here
+    hgLoadBed hg19 hinv70Coding fcdna.coding.bed  # presumably fcdna.*.bed are written by hinvToBed12.pl - confirm
+    #	Loaded 272257 elements of size 12
+    featureBits hg19 hinv70Coding
+    #	141717797 bases of 2897316137 (4.891%) in intersection
+
+    hgLoadBed hg19 hinv70NonCoding fcdna.nonCoding.bed
+    #	Loaded 22625 elements of size 12
+    featureBits hg19 hinv70NonCoding 
+    #	1350960 bases of 2897316137 (0.047%) in intersection
+
+    hgLoadBed hg19 hinv70PseudoGene fcdna.pseudoGene.bed
+    #	Loaded 1166 elements of size 12
+    featureBits hg19 hinv70PseudoGene
+    #	1701647 bases of 2897316137 (0.059%) in intersection
+
+    featureBits hg19 hinv70Coding hinv70PseudoGene  # pairwise check: bases shared by the two tables
+    #	619377 bases of 2897316137 (0.021%) in intersection
+
+    featureBits hg19 hinv70Coding hinv70NonCoding  # pairwise check: bases shared by the two tables
+    #	912553 bases of 2897316137 (0.031%) in intersection
+
+    featureBits hg19 hinv70PseudoGene hinv70NonCoding  # pairwise check: bases shared by the two tables
+    #	9642 bases of 2897316137 (0.000%) in intersection
+
+######################################################################## 
+# Updating the ucscToEnsembl table (DONE - 2010-04-06 - Hiram)
+    #	as of Ensembl V57, Ensembl's naming scheme changed for the random (*_gl*) contigs
+    cd /hive/data/genomes/hg19/bed/ucscToEnsembl
+cat ../../chrom.sizes | while read L  # one chrom.sizes line per pass: field 1 = UCSC name, field 2 = size
+do
+    size=`echo $L | awk '{print $2}'`  # NOTE(review): size is assigned but never used below
+    ucName=`echo $L | awk '{print $1}'`  # UCSC sequence name
+    ensName=`echo $ucName | sed -e "s/^chrM/MT/; s/^chr//;"`  # default rule: chrM becomes MT, otherwise drop the leading chr
+    case $ucName in  # the arms below override the default rule for specific names
+        chr17_ctg5_hap1) ensName="HSCHR17_1"  # each *_hap* sequence gets an explicit HSCHR* name
+                ;;
+        chr4_ctg9_hap1) ensName="HSCHR4_1"
+                ;;
+        chr6_apd_hap1) ensName="HSCHR6_MHC_APD"
+                ;;
+        chr6_cox_hap2) ensName="HSCHR6_MHC_COX"
+                ;;
+        chr6_dbb_hap3) ensName="HSCHR6_MHC_DBB"
+                ;;
+        chr6_mann_hap4) ensName="HSCHR6_MHC_MANN"
+                ;;
+        chr6_mcf_hap5) ensName="HSCHR6_MHC_MCF"
+                ;;
+        chr6_qbl_hap6) ensName="HSCHR6_MHC_QBL"
+                ;;
+        chr6_ssto_hap7) ensName="HSCHR6_MHC_SSTO"
+                ;;
+        *_gl*)  # any other name containing _gl: a name shaped like chr1_gl000191_random becomes GL000191
+ensName=`echo $L | awk '{print $1}' | sed -e "s/^chr.*_gl/GL/; s/_random//"`
+                ;;  # NOTE(review): no .N version suffix is appended - confirm this matches the Ensembl names
+    esac
+    echo -e "$ucName\t$ensName"  # one tab-separated row: UCSC name, Ensembl name
+done > ucscToEnsemblV57.tab  # all rows from the loop land in this file
+
+    hgsql hg19 -e 'delete from ucscToEnsembl where ucsc like "%";'  # clear the old rows (the % pattern matches every non-NULL ucsc value)
+    hgsql hg19 -e \
+'LOAD DATA LOCAL INFILE "ucscToEnsemblV57.tab" INTO TABLE ucscToEnsembl'  # reload the table from the file written above
+
+########################################################################