src/hg/makeDb/doc/hg18.txt 1.394

1.394 2010/01/19 21:44:30 angie
Added NHGRI GWAS catalog track.
Index: src/hg/makeDb/doc/hg18.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hg18.txt,v
retrieving revision 1.393
retrieving revision 1.394
diff -b -B -U 4 -r1.393 -r1.394
--- src/hg/makeDb/doc/hg18.txt	5 Jan 2010 01:14:06 -0000	1.393
+++ src/hg/makeDb/doc/hg18.txt	19 Jan 2010 21:44:30 -0000	1.394
@@ -28690,8 +28690,37 @@
 
 # Load up the bed graph tracks
 hgLoadBed -bedGraph=4 hg18 FantomCageForwardPowerLawGraph all.forward.plaw.bg2
 hgLoadBed -bedGraph=4 hg18 FantomCageReversePowerLawGraph all.reverse.plaw.bg2
+
+
+############################################################################
+# ENCODE PHASED GENOTYPES for NA12878 (DONE 7/22/09 angie)
+    mkdir /hive/data/genomes/hg18/bed/phasedGenotypesNA12878
+    cd /hive/data/genomes/hg18/bed/phasedGenotypesNA12878
+    wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/CEU.trio.dec.with.x.with.rs.calls
+    wget http://illumina-mac.stanford.edu/NA12878_Reference_Genome/code/{Makefile,PhaseSNPs.pm}
+
+#TODO: strip homozyg-same-as-reference SNPs from CEU.trio, then make:
+
+    make NA12878_SNPs_Phased.bed
+    perl -wpe '/^(\w+)\t(\d+)\t(\d+)\t([ACGT])\/([ACGT])\t([MP\/HA]+)$/ || die "parse\n$_\t"; \
+      ($c, $s, $e, $a1, $a2, $t) = ($1, $2, $3, $4, $5, $6); \
+      if ($t eq "M/P") { \
+        $_ = "$c\t$s\t$e\tM:$a1\n" . "$c\t$s\t$e\tP:$a2\n"; \
+      } elsif ($t eq "P/M") { \
+        $_ = "$c\t$s\t$e\tM:$a2\n" . "$c\t$s\t$e\tP:$a1\n"; \
+      } elsif ($t eq "H") { \
+        $_ = "$c\t$s\t$e\t$a1\n"; \
+      } elsif ($t eq "A") { \
+        $_ = "$c\t$s\t$e\tA:$a1/$a2\n"; \
+      } else { die "unrec type $t"; } \
+      ' NA12878_SNPs_Phased.bed \
+      > phasedGenotypesNA12878.bed
+    hgLoadBed -noNameIx hg18 phasedGenotypesNA12878 phasedGenotypesNA12878.bed
+#Loaded 5469032 elements of size 4
+
+
 ############################################################################
 # TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)
 
 vertebrate-wide transMap alignments were built  Tracks are created and loaded
@@ -28954,8 +28983,9 @@
     sed -e 's/read_name:\s//' $file > ${subTrack}.gff
     ldHgGene -exon=read $DATABASE ${subTrack} ${subTrack}.gff 
 done
 '_EOF_'
+    # << emacs
    chmod +x formatAndLoadData
    ./formatAndLoadData burgeDataFiles.txt burgeRnaSeqGemMapperAlign hg18 \
      > load.log &
    # Added a trackDb entry in 
@@ -29149,4 +29179,106 @@
 
 # load the table
     hgLoadBed -allowStartEqualEnd hg18 snpArrayIlluminaHumanOmni1_Quad snpArrayIlluminaHumanOmni1_Quad.tab -tab -sqlTable=snpArrayIlluminaHumanOmni1_Quad.sql
 
+
+#############################################################################
+# NHGRI GWAS CATALOG (DONE 1/19/10 angie)
+    mkdir /hive/data/genomes/hg18/bed/gwasCatalog
+    cd /hive/data/genomes/hg18/bed/gwasCatalog
+    # Oops -- this link is supposed to produce .xls but it actually spits out
+    # an HTML table (which Excel can open so probably noone has complained):
+    wget -O gwas.100115.html http://www.genome.gov/gwastoexcel.cfm
+    # Make it tab-sep text:
+    perl -we 'while (<>) { \
+                if (! $gotTBody) { $gotTBody = (m/\<TBODY\>/); next; } \
+                chomp; s/\r$//; \
+                $addSpace = /\w+$/; \
+                if (s/\s*\<TR[^\>]*\>\s*//) { $rowStart = 1; }; \
+                $rowEnd = s/\s*\<\/TR[^\>]*\>\s*//; \
+                $itemStart = s/\s*\<TD[^\>]*\>\s*//; \
+                s/\s*\<\/TD[^\>]*\>\s*//; \
+                last if /\<\/TBODY/; \
+                if ($itemStart && $rowStart) { $rowStart = 0 } elsif ($itemStart) { print "\t" }; \
+                print unless (/^\s*$/); \
+                print " " if ($addSpace); \
+                print "\n" if ($rowEnd); \
+              }' \
+      gwas.100115.html \
+      > gwas.100115.tab
+    # Look at the column headers:
+    head -40 gwas.100115.html \
+    | perl -wpe 's/<\/TH>//; s/\s*<TH[^>]*>/# / || s/^.*\n$//'
+#  1 Date Added to Catalog (since 11/25/08)
+#  2 PubMedID
+#  3 First Author
+#  4 Date
+#  5 Journal
+#  6 Link
+#  7 Study
+#  8 Disease/Trait
+#  9 Initial Sample Size
+# 10 Replication Sample Size
+# 11 Region
+# 12 Reported Gene(s)
+# 13 Strongest SNP-Risk Allele
+# 14 SNPs
+# 15 Risk Allele Frequency
+# 16 p-Value
+# 17 p-Value (text)
+# 18 OR or beta
+# 19 95% CI (text)
+# 20 Platform [SNPs passing QC]
+# 21 CNV
+    # Columns of interest: pretty much all except for Date Added to the Catalog,
+    # and Link which can be generated from PubMedID.  Watch out for these:
+    # * Some rows don't name a SNP ("" or "NR") -- in that case, skip.
+    # * They are missing a pubmedID: 20031603 is the missing ID for Marroni,
+    #   "A Genome-Wide Association Scan of RR and QT Interval Duration in 3 European Genetically Isolated Populations: The EUROSPAN Project"
+    # * One of their Reported Gene(s) has HTML: "HBB<br />"
+    # * Risk allele is not always just a number, may have desc
+    # * Missing data may be "", "NR", "NS" or "Pending"
+    # * p-Value (text) items seem to all end with "&nbsp;" (unless missing)
+    # * Platform has some "Illumima", "Ilumina"
+    # + The CNV column is entirely [YN], yay!  use enum.
+
+    # Use SNPs (comma-sep list) to map to genome coords, and strongest SNP-Risk Allele 
+    # as bed 4+ name.
+    perl -we '%y2m = (January=>"01", February=>"02", March=>"03", April=>"04", May=>"05", \
+                      June=>"06", July=>"07", August=>"08", September=>"09", October=>"10", \
+                      November=>"11", December=>"12"); \
+              while (<>) { \
+                @w = split("\t"); \
+                next if ($w[13] !~ /^rs\d+/); \
+                if ($w[1] eq "" && $w[2] eq "Marroni" && $w[6] =~ /EUROSPAN Project$/) { \
+                  $w[1] = 20031603; \
+                } \
+                if ($w[3] =~ /^(\w+) (\d+), (\d+)$/) { # transform to mysql DATE \
+                  ($month, $day, $year) = ($1, $2, $3); \
+                  $month = $y2m{$month} || die "Cant find month ($month)\t"; \
+                  $w[3] = "$year-$month-$day"; \
+                } else { die "Cant parse date ($w[3])\t" } \
+                $w[11] =~ s@<br />$@@; \
+                $w[16] =~ s/&nbsp;$//; \
+                $w[19] =~ s/^(Illumima|Ilumina)/Illumina/; \
+                my @snps = split(",", $w[13]); \
+                foreach $i (13, 5, 0) { # discard columns (use descending order) \
+                  splice(@w, $i, 1); \
+                } \
+                foreach $s (@snps) { \
+                  print join("\t", $s, @w); \
+                } \
+              }' \
+      gwas.100115.tab \
+    | sort > noCoords.100115.tab
+    cut -f 1-4 ../snp130/snp130.bed \
+    | sort -k4,4 \
+    | join -t "	" -1 4 - noCoords.100115.tab \
+        -o 1.1,1.2,1.3,1.4,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12,2.13,2.14,2.15,2.16,2.17,2.18,2.19 \
+    | sort -k1,1 -k2n,2n \
+        > gwasCatalog.bed
+
+    hgLoadBed hg18 gwasCatalog gwasCatalog.bed \
+      -tab -sqlTable=$HOME/kent/src/hg/lib/gwasCatalog.sql -notItemRgb -allowStartEqualEnd
+
+
+#############################################################################