src/hg/makeDb/doc/hgdpGeo.txt 1.1

1.1 2009/02/10 17:59:18 angie
Per-SNP geographic map data and scripts from Human Genome Diversity Project.
Index: src/hg/makeDb/doc/hgdpGeo.txt
===================================================================
RCS file: src/hg/makeDb/doc/hgdpGeo.txt
diff -N src/hg/makeDb/doc/hgdpGeo.txt
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/makeDb/doc/hgdpGeo.txt	10 Feb 2009 17:59:18 -0000	1.1
@@ -0,0 +1,127 @@
+# for emacs: -*- mode: sh; -*-
+
+# SNP geographic data from the Human Genome Diversity Project --
+# ancestral allele frequencies for 53 populations worldwide.
+# I'm adapting scripts by John Novembre, jnovembre at ucla,
+# and Joe Pickrell, pickrell at uchicago.
+
+###########################################################################
+# (2/3/09 - 2/5/09 angie)
+    mkdir /hive/data/outside/hgdpGeo
+    cd /hive/data/outside/hgdpGeo
+    # Saved files emailed from Joe Pickrell, pickrell at uchicago:
+    # HGDP_HapMap.coords.txt, HGDP_HapMap.coords_adj.txt, HGDP_supp.pdf,
+    # illumina_snps.gz and plotFreqs.sh.  Download allele frequency files:
+    set i = 1
+    while ($i <= 22)
+      wget --timestamping \
+        http://hgdp.uchicago.edu/data/Alfreqs/chr$i.popfreqs.ancfrq.strat.gz
+      set i = `expr $i + 1`
+    end
+    wget --timestamping -O chrX.popfreqs.ancfrq.strat.gz \
+      http://hgdp.uchicago.edu/data/Alfreqs/chr23.popfreqs.ancfreq.strat.gz
+    # Broken down by chroms... confirm that rsID sections are unique:
+    zcat chr*.gz | cut -f 2 | uniq > rsIDs.lst
+    wc -l rsIDs.lst
+#657000 rsIDs.lst
+    sort -u rsIDs.lst | wc -l
+#657000
+
+    # The rsIDs in illumina_snps.gz were directly genotyped -- other rsIDs
+    # were imputed from HapMap data (not as high-confidence).  How many
+    # of the SNPs were imputed?
+    zcat illumina_snps.gz | grep -vFwf - rsIDs.lst | wc -l
+#0
+    # So there are no imputed SNPs in the chr*.strat files! -- confirmed w/Joe.
+
+    # How many populations, on average, does each rsID have data for?
+    zcat chr*.gz | wc -l
+#34821000
+    calc 34821000 / 657000
+#34821000 / 657000 = 53.000000
+    # Wow, that is an awfully round number.  Of the 59 populations for
+    # which we have coords, how many appear in the frequency files?
+    zcat chr*.gz | cut -f 3 | sort -u > popsWithData.txt
+    wc -l popsWithData.txt
+#53 popsWithData.txt
+    # OK, we can assume exactly 53.  Which populations don't we have data for?
+    grep -vFwf popsWithData.txt HGDP_HapMap.coords_adj.txt 
+#Population Latitude[DegreesNorth] Longitude[DegreesEast]
+#Herero -22 19
+#Ovambo -19 18
+#Pedi -29 30
+#Sotho -29 29
+#Tswana -28 24
+#Zulu -28 31
+
+    # Generate a perl hash of populations w/data => alphabetical index
+    # for storing frequencies in an array of floats:
+    grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
+    | perl -wpe 'chomp; @w=split; $index = ($. - 1); \
+                 $_ = "\t \"$w[0]\" => $index,\n"; \
+                 s/\t \"Adygei/\%pops = (\"Adygei/; s/(Yoruba.*),/$1 );/;' \
+      > popHash.pl
+    # Generate a C array of structs that encode population name, lat and long
+    # for plotting:
+    grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
+    | perl -wpe 'chomp; @w=split; \
+                 $_ = "    { \"$w[0]\", $w[1], $w[2] },\n"; \
+                 s/(    { \"Adygei)/struct hgdpPopInfo pops[] =\n{\n$1/; s/(Yoruba.*),/$1\n};/;' \
+      > popInfo.c
+    # Now distill the frequency data into a tab-sep text file sorted by rsId.
+    # This can be joined with snp12X.txt (sorted by rsId) to prepend chrom, 
+    # chromStart and chromEnd to make it a track table.
+    zcat chr*.gz \
+    | perl -we 'use vars "%pops"; eval `cat popHash.pl`; \
+      while (<>) { \
+        (undef, $rsId, $pop, $anc, $der, $freq, undef, undef) = split; \
+        if (! defined $lastRs || $lastRs ne $rsId) { \
+          if (defined $lastRs) { \
+            die "Wrong number of freqs for $rsId: " . scalar(@freqs) if (@freqs != 53); \
+            $freqs = join(",", @freqs) . ","; \
+            print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n"; \
+          } \
+          ($lastRs, $lastAnc, $lastDer, @freqs) = ($rsId, $anc, $der, ()); \
+        } \
+        $index = $pops{$pop}; \
+        die "No index for pop $pop" unless (defined $index); \
+        $freqs[$index] = sprintf("%.4f", $freq); \
+        $freqs[$index] =~ s/([0-9])0+$/$1/; \
+      } \
+      $freqs = join(",", @freqs) . ","; \
+      print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n";' \
+    | sort \
+      > hgdpGeoCoordless.txt
+    # Took ~5min.
+    wc -l hgdpGeoCoordless.txt
+#657000 hgdpGeoCoordless.txt
+
+    # plotFreqs.sh requires Generic Mapping Tools -- installed in
+    # /hive/data/outside/GMT4.3.1 using a GMTparam.txt generated by
+    # the install form on http://gmt.soest.hawaii.edu/ .
+    # Add /hive/data/outside/GMT4.3.1/bin to PATH
+    # Add /hive/data/outside/GMT4.3.1/man to MANPATH
+    # I split plotFreqs.sh into two parts: plotMapBackground.sh, to be
+    # run once, and plotFreqsOnly.sh, to be run on each rsID.
+    ./plotMapBackground.sh
+    foreach rsID (rs5939319 rs5982868 rs11152550)
+      zcat chrX.popfreqs.ancfrq.strat.gz | head -3000 | grep $rsID > $rsID
+      time plotFreqs_UCSC.sh $rsID
+    end
+    # plotFreqsOnly is still kind of slow.  Instead of running the ps-generating
+    # commands every time, run once and use as templates for C fprintf:
+    set RESETPOS = "-X-7.25i -Y-4.75i"
+    echo "0.25 6.25 14 0 1 LT SNP: %s" \
+    | pstext -R0/9/0/6.5 -Jx1i $RESETPOS -O -K \
+    | sed -e 's/^/"/; s/$/\\n"/;' > rs.template
+    echo "0.25 6.00 14 0 1 LT Ancestral Allele: %c" \
+    | pstext -R0/9/0/6.5 -Jx1i -O -K -Gblue \
+    | sed -e 's/^/"/; s/$/\\n"/;' > a1.template
+    echo "0.25 5.75 14 0 1 LT Derived Allele: %c" \
+    | pstext -R0/9/0/6.5 -Jx1i -O -K -Gorange \
+    | sed -e 's/^/"/; s/$/\\n"/;' > a2.template
+    # I was not up to reverse-engineering the geographic projection
+    # used in psxy so I think we will have to just run that in hgc.c, oh
+    # well.
+
+###########################################################################