src/hg/makeDb/doc/hgdpGeo.txt 1.2

1.2 2009/03/30 22:15:32 angie
Made a population key image (thanks b0b for suggestion).
Index: src/hg/makeDb/doc/hgdpGeo.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hgdpGeo.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/makeDb/doc/hgdpGeo.txt	10 Feb 2009 17:59:18 -0000	1.1
+++ src/hg/makeDb/doc/hgdpGeo.txt	30 Mar 2009 22:15:32 -0000	1.2
@@ -1,127 +1,134 @@
 # for emacs: -*- mode: sh; -*-
 
 # SNP geographic data from the Human Genome Diversity Project --
 # ancestral allele frequencies for 53 populations worldwide.
 # I'm adapting scripts by John Novembre, jnovembre at ucla,
 # and Joe Pickrell, pickrell at uchicago.
 
 ###########################################################################
 # (2/3/09 - 2/5/09 angie)
     mkdir /hive/data/outside/hgdpGeo
     cd /hive/data/outside/hgdpGeo
     # Saved files emailed from Joe Pickrell, pickrell at uchicago:
     # HGDP_HapMap.coords.txt, HGDP_HapMap.coords_adj.txt, HGDP_supp.pdf,
     # illumina_snps.gz and plotFreqs.sh.  Download allele frequency files:
     set i = 1
     while ($i <= 22)
       wget --timestamping \
         http://hgdp.uchicago.edu/data/Alfreqs/chr$i.popfreqs.ancfrq.strat.gz
       set i = `expr $i + 1`
     end
     wget --timestamping -O chrX.popfreqs.ancfrq.strat.gz \
       http://hgdp.uchicago.edu/data/Alfreqs/chr23.popfreqs.ancfreq.strat.gz
     # Broken down by chroms... confirm that rsID sections are unique:
     zcat chr*.gz | cut -f 2 | uniq > rsIDs.lst
     wc -l rsIDs.lst
 #657000 rsIDs.lst
     sort -u rsIDs.lst | wc -l
 #657000
 
     # The rsIDs in illumina_snps.gz were directly genotyped -- other rsIDs
     # were imputed from HapMap data (not as high-confidence).  How many
     # of the SNPs were imputed?
     zcat illumina_snps.gz | grep -vFwf - rsIDs.lst | wc -l
 #0
     # So there are no imputed SNPs in the chr*.strat files! -- confirmed w/Joe.
 
     # How many populations, on average, does each rsID have data for?
     zcat chr*.gz | wc -l
 #34821000
     calc 34821000 / 657000
 #34821000 / 657000 = 53.000000
     # Wow, that is an awfully round number.  Of the 59 populations for
     # which we have coords, how many appear in the frequency files?
     zcat chr*.gz | cut -f 3 | sort -u > popsWithData.txt
     wc -l popsWithData.txt
 #53 popsWithData.txt
     # OK, we can assume exactly 53.  Which populations don't we have data for?
     grep -vFwf popsWithData.txt HGDP_HapMap.coords_adj.txt 
 #Population Latitude[DegreesNorth] Longitude[DegreesEast]
 #Herero -22 19
 #Ovambo -19 18
 #Pedi -29 30
 #Sotho -29 29
 #Tswana -28 24
 #Zulu -28 31
 
     # Generate a perl hash of populations w/data => alphabetical index
     # for storing frequencies in an array of floats:
     grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
     | perl -wpe 'chomp; @w=split; $index = ($. - 1); \
                  $_ = "\t \"$w[0]\" => $index,\n"; \
                  s/\t \"Adygei/\%pops = (\"Adygei/; s/(Yoruba.*),/$1 );/;' \
       > popHash.pl
     # Generate a C array of structs that encode population name, lat and long
     # for plotting:
     grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
     | perl -wpe 'chomp; @w=split; \
                  $_ = "    { \"$w[0]\", $w[1], $w[2] },\n"; \
                  s/(    { \"Adygei)/struct hgdpPopInfo pops[] =\n{\n$1/; s/(Yoruba.*),/$1\n};/;' \
       > popInfo.c
     # Now distill the frequency data into a tab-sep text file sorted by rsId.
     # This can be joined with snp12X.txt (sorted by rsId) to prepend chrom, 
     # chromStart and chromEnd to make it a track table.
     zcat chr*.gz \
     | perl -we 'use vars "%pops"; eval `cat popHash.pl`; \
       while (<>) { \
         (undef, $rsId, $pop, $anc, $der, $freq, undef, undef) = split; \
         if (! defined $lastRs || $lastRs ne $rsId) { \
           if (defined $lastRs) { \
             die "Wrong number of freqs for $rsId: " . scalar(@freqs) if (@freqs != 53); \
             $freqs = join(",", @freqs) . ","; \
             print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n"; \
           } \
           ($lastRs, $lastAnc, $lastDer, @freqs) = ($rsId, $anc, $der, ()); \
         } \
         $index = $pops{$pop}; \
         die "No index for pop $pop" unless (defined $index); \
         $freqs[$index] = sprintf("%.4f", $freq); \
         $freqs[$index] =~ s/([0-9])0+$/$1/; \
       } \
       $freqs = join(",", @freqs) . ","; \
       print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n";' \
     | sort \
       > hgdpGeoCoordless.txt
     # Took ~5min.
     wc -l hgdpGeoCoordless.txt
 #657000 hgdpGeoCoordless.txt
 
     # plotFreqs.sh requires Generic Mapping Tools -- installed in
     # /hive/data/outside/GMT4.3.1 using a GMTparam.txt generated by
     # the install form on http://gmt.soest.hawaii.edu/ .
     # Add /hive/data/outside/GMT4.3.1/bin to PATH
     # Add /hive/data/outside/GMT4.3.1/man to MANPATH
     # I split plotFreqs.sh into two parts: plotMapBackground.sh, to be
     # run once, and plotFreqsOnly.sh, to be run on each rsID.
     ./plotMapBackground.sh
     foreach rsID (rs5939319 rs5982868 rs11152550)
       zcat chrX.popfreqs.ancfrq.strat.gz | head -3000 | grep $rsID > $rsID
       time plotFreqs_UCSC.sh $rsID
     end
     # plotFreqsOnly is still kind of slow.  Instead of running the ps-generating
     # commands every time, run once and use as templates for C fprintf:
     set RESETPOS = "-X-7.25i -Y-4.75i"
     echo "0.25 6.25 14 0 1 LT SNP: %s" \
     | pstext -R0/9/0/6.5 -Jx1i $RESETPOS -O -K \
     | sed -e 's/^/"/; s/$/\\n"/;' > rs.template
     echo "0.25 6.00 14 0 1 LT Ancestral Allele: %c" \
     | pstext -R0/9/0/6.5 -Jx1i -O -K -Gblue \
     | sed -e 's/^/"/; s/$/\\n"/;' > a1.template
     echo "0.25 5.75 14 0 1 LT Derived Allele: %c" \
     | pstext -R0/9/0/6.5 -Jx1i -O -K -Gorange \
     | sed -e 's/^/"/; s/$/\\n"/;' > a2.template
     # I was not up to reverse-engineering the geographic projection
     # used in psxy so I think we will have to just run that in hgc.c, oh
     # well.
 
+
+###########################################################################
+# Make population key (done 3/17/09 angie)
+    # Made script plotKey.sh that plots the map background (no lines),
+    # small circles at all of the un-tweaked locations in
+    # HGDP_HapMap.coords.txt, and population labels.
+
 ###########################################################################