src/hg/makeDb/doc/hg18.txt 1.400

1.400 2010/03/01 21:05:25 angie
gwasCatalog update
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.399
retrieving revision 1.400
diff -b -B -U 4 -r1.399 -r1.400
--- src/hg/makeDb/doc/hg18.txt	18 Feb 2010 20:28:39 -0000	1.399
+++ src/hg/makeDb/doc/hg18.txt	1 Mar 2010 21:05:25 -0000	1.400
@@ -29209,35 +29209,23 @@
 # load the table
     hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanOmni1_Quad snpArrayIlluminaHumanOmni1_Quad.tab -tab -sqlTable=snpArrayIlluminaHumanOmni1_Quad.sql
 
 #############################################################################
-# NHGRI GWAS CATALOG (DONE 1/19/10 angie)
+# NHGRI GWAS CATALOG (DONE 3/1/10)
+# Originally done 1/19/10
+# Area of possible future improvement: for SNPs that can't be mapped via our SNP track,
+# could some of them be obsolete IDs that have been merged into current IDs?
     mkdir /hive/data/genomes/hg18/bed/gwasCatalog
     cd /hive/data/genomes/hg18/bed/gwasCatalog
-    # Oops -- this link is supposed to produce .xls but it actually spits out
-    # an HTML table (which Excel can open so probably noone has complained):
-    wget -O gwas.100115.html http://www.genome.gov/gwastoexcel.cfm
-    # Make it tab-sep text:
-    perl -we 'while (<>) { \
-                if (! $gotTBody) { $gotTBody = (m/\<TBODY\>/); next; } \
-                chomp; s/\r$//; \
-                $addSpace = /\w+$/; \
-                if (s/\s*\<TR[^\>]*\>\s*//) { $rowStart = 1; }; \
-                $rowEnd = s/\s*\<\/TR[^\>]*\>\s*//; \
-                $itemStart = s/\s*\<TD[^\>]*\>\s*//; \
-                s/\s*\<\/TD[^\>]*\>\s*//; \
-                last if /\<\/TBODY/; \
-                if ($itemStart && $rowStart) { $rowStart = 0 } elsif ($itemStart) { print "\t" }; \
-                print unless (/^\s*$/); \
-                print " " if ($addSpace); \
-                print "\n" if ($rowEnd); \
-              }' \
-      gwas.100115.html \
-      > gwas.100115.tab
-    # Look at the column headers:
-    head -40 gwas.100115.html \
-    | perl -wpe 's/<\/TH>//; s/\s*<TH[^>]*>/# / || s/^.*\n$//'
-#  1 Date Added to Catalog (since 11/25/08)
+    # Done once, don't need to redo:
+    cut -f 1-4 ../snp130/snp130.bed \
+    | sort -k4,4 \
+    > snp130Coords.bed
+    mkdir /hive/data/genomes/hg18/bed/gwasCatalog/100301
+    cd /hive/data/genomes/hg18/bed/gwasCatalog/100301
+    wget http://www.genome.gov/admin/gwascatalog.txt
+    # Column headers:
+#  1 Date Added to Catalog
 #  2 PubMedID
 #  3 First Author
 #  4 Date
 #  5 Journal
@@ -29259,31 +29247,24 @@
 # 21 CNV
     # Columns of interest: pretty much all except for Date Added to the Catalog,
     # and Link which can be generated from PubMedID.  Watch out for these:
     # * Some rows don't name a SNP ("" or "NR") -- in that case, skip.
-    # * They are missing a pubmedID: 20031603 is the missing ID for Marroni,
-    #   "A Genome-Wide Association Scan of RR and QT Interval Duration in 3 European Genetically Isolated Populations: The EUROSPAN Project"
     # * One of their Reported Gene(s) has HTML: "HBB<br />"
     # * Risk allele is not always just a number, may have desc
     # * Missing data may be "", "NR", "NS" or "Pending"
-    # * p-Value (text) items seem to all end with "&nbsp;" (unless missing)
     # * Platform has some "Illumima", "Ilumina"
-    # + The CNV column is entirely [YN], yay!  use enum.
 
     # Use SNPs (comma-sep list) to map to genome coords, and strongest SNP-Risk Allele 
     # as bed 4+ name.
-    perl -we '%y2m = (January=>"01", February=>"02", March=>"03", April=>"04", May=>"05", \
-                      June=>"06", July=>"07", August=>"08", September=>"09", October=>"10", \
-                      November=>"11", December=>"12"); \
-              while (<>) { \
+    perl -we 'while (<>) { \
+                next if (/^\s*$/); \
                 @w = split("\t"); \
                 next if ($w[13] !~ /^rs\d+/); \
                 if ($w[1] eq "" && $w[2] eq "Marroni" && $w[6] =~ /EUROSPAN Project$/) { \
                   $w[1] = 20031603; \
                 } \
-                if ($w[3] =~ /^(\w+) (\d+), (\d+)$/) { # transform to mysql DATE \
+                if ($w[3] =~ /^(\d+)\/(\d+)\/(\d+)$/) { # transform to mysql DATE \
                   ($month, $day, $year) = ($1, $2, $3); \
-                  $month = $y2m{$month} || die "Cant find month ($month)\t"; \
                   $w[3] = "$year-$month-$day"; \
                 } else { die "Cant parse date ($w[3])\t" } \
                 $w[11] =~ s@<br />$@@; \
                 $w[16] =~ s/&nbsp;$//; \
@@ -29295,19 +29276,17 @@
                 foreach $s (@snps) { \
                   print join("\t", $s, @w); \
                 } \
               }' \
-      gwas.100115.tab \
-    | sort > noCoords.100115.tab
-    cut -f 1-4 ../snp130/snp130.bed \
-    | sort -k4,4 \
-    | join -t "	" -1 4 - noCoords.100115.tab \
+      gwascatalog.txt \
+    | sort > noCoords.tab
+    join -t "	" -1 4 ../snp130Coords.bed noCoords.tab \
         -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19 \
     | sort -k1,1 -k2n,2n \
         > gwasCatalog.bed
-
     hgLoadBed hg18 gwasCatalog gwasCatalog.bed \
       -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
+#Loaded 2930 elements of size 22
 
 #############################################################################
 # CRG MAPABILITY (2010-01-19 - 2010-01-28, hartera, DONE)
 # Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca