src/hg/makeDb/doc/hgdpGeo.txt 1.2
1.2 2009/03/30 22:15:32 angie
Made a population key image (thanks b0b for suggestion).
Index: src/hg/makeDb/doc/hgdpGeo.txt
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/makeDb/doc/hgdpGeo.txt,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/makeDb/doc/hgdpGeo.txt 10 Feb 2009 17:59:18 -0000 1.1
+++ src/hg/makeDb/doc/hgdpGeo.txt 30 Mar 2009 22:15:32 -0000 1.2
@@ -1,127 +1,134 @@
# for emacs: -*- mode: sh; -*-
# SNP geographic data from the Human Genome Diversity Project --
# ancestral allele frequencies for 53 populations worldwide.
# I'm adapting scripts by John Novembre, jnovembre at ucla,
# and Joe Pickrell, pickrell at uchicago.
###########################################################################
# (2/3/09 - 2/5/09 angie)
mkdir /hive/data/outside/hgdpGeo
cd /hive/data/outside/hgdpGeo
# Saved files emailed from Joe Pickrell, pickrell at uchicago:
# HGDP_HapMap.coords.txt, HGDP_HapMap.coords_adj.txt, HGDP_supp.pdf,
# illumina_snps.gz and plotFreqs.sh. Download allele frequency files:
set i = 1
while ($i <= 22)
wget --timestamping \
http://hgdp.uchicago.edu/data/Alfreqs/chr$i.popfreqs.ancfrq.strat.gz
set i = `expr $i + 1`
end
wget --timestamping -O chrX.popfreqs.ancfrq.strat.gz \
http://hgdp.uchicago.edu/data/Alfreqs/chr23.popfreqs.ancfreq.strat.gz
# Broken down by chroms... confirm that rsID sections are unique:
zcat chr*.gz | cut -f 2 | uniq > rsIDs.lst
wc -l rsIDs.lst
#657000 rsIDs.lst
sort -u rsIDs.lst | wc -l
#657000
# The rsIDs in illumina_snps.gz were directly genotyped -- other rsIDs
# were imputed from HapMap data (not as high-confidence). How many
# of the SNPs were imputed?
zcat illumina_snps.gz | grep -vFwf - rsIDs.lst | wc -l
#0
# So there are no imputed SNPs in the chr*.strat files! -- confirmed w/Joe.
# How many populations, on average, does each rsID have data for?
zcat chr*.gz | wc -l
#34821000
calc 34821000 / 657000
#34821000 / 657000 = 53.000000
# Wow, that is an awfully round number. Of the 59 populations for
# which we have coords, how many appear in the frequency files?
zcat chr*.gz | cut -f 3 | sort -u > popsWithData.txt
wc -l popsWithData.txt
#53 popsWithData.txt
# OK, we can assume exactly 53. Which populations don't we have data for?
grep -vFwf popsWithData.txt HGDP_HapMap.coords_adj.txt
#Population Latitude[DegreesNorth] Longitude[DegreesEast]
#Herero -22 19
#Ovambo -19 18
#Pedi -29 30
#Sotho -29 29
#Tswana -28 24
#Zulu -28 31
# Generate a perl hash of populations w/data => alphabetical index
# for storing frequencies in an array of floats:
grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
| perl -wpe 'chomp; @w=split; $index = ($. - 1); \
$_ = "\t \"$w[0]\" => $index,\n"; \
s/\t \"Adygei/\%pops = (\"Adygei/; s/(Yoruba.*),/$1 );/;' \
> popHash.pl
# Generate a C array of structs that encode population name, lat and long
# for plotting:
grep -Fwf popsWithData.txt HGDP_HapMap.coords_adj.txt | sort \
| perl -wpe 'chomp; @w=split; \
$_ = " { \"$w[0]\", $w[1], $w[2] },\n"; \
s/( { \"Adygei)/struct hgdpPopInfo pops[] =\n{\n$1/; s/(Yoruba.*),/$1\n};/;' \
> popInfo.c
# Now distill the frequency data into a tab-sep text file sorted by rsId.
# This can be joined with snp12X.txt (sorted by rsId) to prepend chrom,
# chromStart and chromEnd to make it a track table.
zcat chr*.gz \
| perl -we 'use vars "%pops"; eval `cat popHash.pl`; \
while (<>) { \
(undef, $rsId, $pop, $anc, $der, $freq, undef, undef) = split; \
if (! defined $lastRs || $lastRs ne $rsId) { \
if (defined $lastRs) { \
die "Wrong number of freqs for $rsId: " . scalar(@freqs) if (@freqs != 53); \
$freqs = join(",", @freqs) . ","; \
print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n"; \
} \
($lastRs, $lastAnc, $lastDer, @freqs) = ($rsId, $anc, $der, ()); \
} \
$index = $pops{$pop}; \
die "No index for pop $pop" unless (defined $index); \
$freqs[$index] = sprintf("%.4f", $freq); \
$freqs[$index] =~ s/([0-9])0+$/$1/; \
} \
$freqs = join(",", @freqs) . ","; \
print "$lastRs\t$lastAnc\t$lastDer\t$freqs\n";' \
| sort \
> hgdpGeoCoordless.txt
# Took ~5min.
wc -l hgdpGeoCoordless.txt
#657000 hgdpGeoCoordless.txt
# plotFreqs.sh requires Generic Mapping Tools -- installed in
# /hive/data/outside/GMT4.3.1 using a GMTparam.txt generated by
# the install form on http://gmt.soest.hawaii.edu/ .
# Add /hive/data/outside/GMT4.3.1/bin to PATH
# Add /hive/data/outside/GMT4.3.1/man to MANPATH
# I split plotFreqs.sh into two parts: plotMapBackground.sh, to be
# run once, and plotFreqsOnly.sh, to be run on each rsID.
./plotMapBackground.sh
foreach rsID (rs5939319 rs5982868 rs11152550)
zcat chrX.popfreqs.ancfrq.strat.gz | head -3000 | grep $rsID > $rsID
time plotFreqs_UCSC.sh $rsID
end
# plotFreqsOnly is still kind of slow. Instead of running the ps-generating
# commands every time, run once and use as templates for C fprintf:
set RESETPOS = "-X-7.25i -Y-4.75i"
echo "0.25 6.25 14 0 1 LT SNP: %s" \
| pstext -R0/9/0/6.5 -Jx1i $RESETPOS -O -K \
| sed -e 's/^/"/; s/$/\\n"/;' > rs.template
echo "0.25 6.00 14 0 1 LT Ancestral Allele: %c" \
| pstext -R0/9/0/6.5 -Jx1i -O -K -Gblue \
| sed -e 's/^/"/; s/$/\\n"/;' > a1.template
echo "0.25 5.75 14 0 1 LT Derived Allele: %c" \
| pstext -R0/9/0/6.5 -Jx1i -O -K -Gorange \
| sed -e 's/^/"/; s/$/\\n"/;' > a2.template
# I was not up to reverse-engineering the geographic projection
# used in psxy so I think we will have to just run that in hgc.c, oh
# well.
+
+###########################################################################
+# Make population key (done 3/17/09 angie)
+ # Made script plotKey.sh that plots the map background (no lines),
+ # small circles at all of the un-tweaked locations in
+ # HGDP_HapMap.coords.txt, and population labels.
+
###########################################################################