src/hg/makeDb/doc/hg18.txt 1.417

1.417 2010/05/12 23:46:11 angie
Updated gwasCatalog.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.416
retrieving revision 1.417
diff -b -B -U 4 -r1.416 -r1.417
--- src/hg/makeDb/doc/hg18.txt	12 May 2010 22:57:25 -0000	1.416
+++ src/hg/makeDb/doc/hg18.txt	12 May 2010 23:46:11 -0000	1.417
@@ -29598,9 +29598,10 @@
 # load the table
     hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanOmni1_Quad snpArrayIlluminaHumanOmni1_Quad.tab -tab -sqlTable=snpArrayIlluminaHumanOmni1_Quad.sql
 
 #############################################################################
-# NHGRI GWAS CATALOG (DONE 4/1/10)
+# NHGRI GWAS CATALOG (DONE 5/12/10)
+# Updated 4/1/10
 # Updated 3/1/10
 # Originally done 1/19/10
 # Area of possible future improvement: for SNPs that can't be mapped via our SNP track,
 # could some of them be obsolete IDs that have been merged into current IDs?
@@ -29609,10 +29610,10 @@
     # Done once, don't need to redo:
     cut -f 1-4 ../snp130/snp130.bed \
     | sort -k4,4 \
     > snp130Coords.bed
-    mkdir /hive/data/genomes/hg18/bed/gwasCatalog/100401
-    cd /hive/data/genomes/hg18/bed/gwasCatalog/100401
+    mkdir /hive/data/genomes/hg18/bed/gwasCatalog/100512
+    cd /hive/data/genomes/hg18/bed/gwasCatalog/100512
     wget http://www.genome.gov/admin/gwascatalog.txt
     # Column headers:
 #  1 Date Added to Catalog
 #  2 PubMedID
@@ -29637,26 +29638,23 @@
 # 21 CNV
     # Columns of interest: pretty much all except for Date Added to Catalog,
     # and Link, which can be generated from PubMedID.  Watch out for these:
     # * Some rows don't name a SNP ("" or "NR") -- in that case, skip.
-    # * One of their Reported Gene(s) has HTML: "HBB<br />"
     # * Risk allele is not always just a number; it may include a description
     # * Missing data may be "", "NR", "NS" or "Pending"
-    # * Platform has some "Illumima", "Ilumina"
 
     # Use SNPs (comma-sep list) to map to genome coords, and strongest SNP-Risk Allele 
     # as bed 4+ name.
     perl -we 'while (<>) { \
                 next if (/^\s*$/); \
+                s/\r$//; \
                 @w = split("\t"); \
                 next if ($w[13] !~ /^rs\d+/); \
                 if ($w[3] =~ /^(\d+)\/(\d+)\/(\d+)$/) { # transform to mysql DATE \
                   ($month, $day, $year) = ($1, $2, $3); \
                   $w[3] = "$year-$month-$day"; \
                 } else { die "Cant parse date ($w[3])\t" } \
-                $w[11] =~ s@<br />$@@; \
-                $w[16] =~ s/&nbsp;$//; \
-                $w[19] =~ s/^(Illumima|Ilumina)/Illumina/; \
+                $w[13] =~ s/ //g; \
                 my @snps = split(",", $w[13]); \
                 foreach $i (13, 5, 0) { # discard columns (use descending order) \
                   splice(@w, $i, 1); \
                 } \
@@ -29671,9 +29669,9 @@
     | sort -k1,1 -k2n,2n \
         > gwasCatalog.bed
     hgLoadBed hg18 gwasCatalog gwasCatalog.bed \
       -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
-#Loaded 3051 elements of size 22
+#Loaded 3461 elements of size 22
 
     # For David: find examples of risk alleles for which dbSNP observed
     # alleles are complementary (A/T or C/G) -- how do we know what strand the
     # risk allele is on?  -- asking corresponding author Teri Manolio.
@@ -29681,9 +29679,9 @@
                    from gwasCatalog as gc, snp130 as snp \
                    where gc.riskAllele rlike "^rs[0-9]+-[ACGT]" and \
                          gc.name = snp.name and snp.observed in ("C/G", "A/T") \
                    order by gc.name limit 20;'
-    # count(*) = 150
+    # count(*) = 170
 
 
 #############################################################################
 # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)