src/hg/snp/snpLoad/hapmapPhaseIIISummary.c 1.1
1.1 2009/03/06 23:37:57 angie
Adding hapmapPhaseIIISummary, which distills hapmapSnps* and hapmapAlleles* columns into a loading file for the hapmapPhaseIIISummary table, which is used to filter hapmapSnps subtracks in hgTracks.
Index: src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
===================================================================
RCS file: src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
diff -N src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/snp/snpLoad/hapmapPhaseIIISummary.c 6 Mar 2009 23:37:57 -0000 1.1
@@ -0,0 +1,253 @@
+/* hapmapPhaseIIISummary -- read in .bed files for subtracks of
+ * hapmapSnps (rel27: phaseII+III merged data, also hapmapAlleles{Chimp,Macaque)
+ * and write out .bed for hapmapPhaseIIISummary, which hgTracks uses for
+ * filtering items in all of those subtracks. */
+
+/* This is inflexible about filenames (very rel27-specific). If we
+ * ever need flexibility, add args and options... */
+
+/* Some properties of rel27 genotype data (hapmapSnps*):
+ * - observed does not come from dbSNP. It is assayed allele1/allele2.
+ * That means that as far as the genotype files are concerned, there
+ * are no complex SNPs -- all are bi-allelic.
+ * - hapmapSnps strand is always +.
+ * - In filtering, there is no need for "none" as minor allele -- we use
+ * the monomorphicInPop flags instead. */
+
+#include "common.h"
+#include "options.h"
+#include "hash.h"
+#include "linefile.h"
+#include "hapmapSnps.h"
+#include "hapmapAllelesOrtho.h"
+#include "hapmapPhaseIIISummary.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+"hapmapPhaseIIISummary - Make hapmapPhaseIIISummary.bed from hapmap*.bed.\n"
+"usage:\n"
+" hapmapPhaseIIISummary .\n"
+/* could add options to provide support for different dbSNP versions.
+"options:\n"
+" -xxx=XXX\n"
+*/
+"\n"
+". is a placeholder arg to suppress this message. The current directory\n"
+"must have table-loading .bed files for SNPS from the 11 HapMap phaseIII\n"
+"populations, as well as orthologous alleles for chimp and macaque:\n"
+" hapmapSnps{ASW,CEU,CHB,CHG,GIH,JPT,LWK,MEX,MKK,TSI,YRI}.bed\n"
+" hapmapAlleles{Chimp,Macaque}.bed\n"
+"The data from those files is distilled into a form useful for filtering\n"
+"based on aggregate characteristics in hgTracks, and dumped out to this\n"
+"output file:\n"
+" hapmapPhaseIIISummary.bed\n"
+"\n");
+}
+
+static struct optionSpec options[] = {
+ {NULL, 0},
+};
+
+struct summary
+/* Accumulator for info from each subtrack of HapMap SNPs */
+{
+ struct summary *next;
+ struct hapmapPhaseIIISummary *finalSum;
+ char popMajorAlleles[11];
+ char popMinorAlleles[11];
+};
+
+float minorAlFreq(struct hapmapSnps *hs)
+/* Compute the minor allele frequency of hs. */
+{
+float mAF = ((2.0*hs->homoCount1 + hs->heteroCount) /
+ (2.0*(hs->homoCount1 + hs->homoCount2 + hs->heteroCount)));
+if (mAF > 0.5)
+ mAF = (1.0 - mAF);
+return mAF;
+}
+
+#define majorAllele(hs) ((hs->homoCount1 < hs->homoCount2) ? hs->allele2[0] : hs->allele1[0])
+#define minorAllele(hs) ((hs->homoCount1 < hs->homoCount2) ? hs->allele1[0] : hs->allele2[0])
+#define heterozygosity(hs) ((int)(1000.0 * hs->heteroCount / \
+ (hs->homoCount1 + hs->homoCount2 + hs->heteroCount) + 0.5))
+
+struct summary *summaryNew(struct hapmapSnps *hs, int popIndex)
+/* Return a summary to which hs's info has been added. */
+{
+struct summary *sum;
+AllocVar(sum);
+char *popCode = hapmapPhaseIIIPops[popIndex];
+char majorAl = majorAllele(hs);
+char minorAl = minorAllele(hs);
+AllocVar(sum->finalSum);
+struct hapmapPhaseIIISummary *fs = sum->finalSum;
+fs->chrom = cloneString(hs->chrom);
+fs->chromStart = hs->chromStart;
+fs->chromEnd = hs->chromEnd;
+fs->name = cloneString(hs->name);
+fs->score = heterozygosity(hs);
+fs->observed = cloneString(hs->observed);;
+fs->overallMajorAllele = majorAl;
+fs->overallMinorAllele = minorAl;
+fs->popCount = 1;
+if (sameString(popCode, "CEU") || sameString(popCode, "CHB") || sameString(popCode, "JPT") ||
+ sameString(popCode, "YRI"))
+ fs->phaseIIPopCount = 1;
+fs->foundInPop[popIndex] = TRUE;
+if (hs->heteroCount == 0 && (hs->homoCount1 == 0 || hs->homoCount2 == 0))
+ fs->monomorphicInPop[popIndex] = TRUE;
+fs->orthoCount = HAP_ORTHO_COUNT;
+fs->orthoAlleles = cloneString("NN");
+fs->orthoQuals = needMem(2 * sizeof(unsigned short));
+fs->minFreq = fs->maxFreq = minorAlFreq(hs);
+
+sum->popMajorAlleles[popIndex] = majorAl;
+sum->popMinorAlleles[popIndex] = minorAl;
+return sum;
+}
+
+void addSnpToSum(struct summary *sum, struct hapmapSnps *hs, int popIndex)
+/* Fold in hs's info into the accumulator sum. */
+{
+char *popCode = hapmapPhaseIIIPops[popIndex];
+char majorAl = majorAllele(hs);
+char minorAl = minorAllele(hs);
+struct hapmapPhaseIIISummary *fs = sum->finalSum;
+if ((!sameString(fs->chrom, hs->chrom) &&
+ !sameString(fs->chrom, "chrX") && !sameString(hs->chrom, "chrY")) || // PAR OK
+ fs->chromStart != hs->chromStart ||
+ fs->chromEnd != hs->chromEnd || !sameString(fs->name, hs->name))
+ errAbort("Incompat summary and hs?! %s|%d|%d|%s != %s|%d|%d|%s",
+ fs->chrom, fs->chromStart, fs->chromEnd, fs->name,
+ hs->chrom, hs->chromStart, hs->chromEnd, hs->name);
+fs->score += heterozygosity(hs);
+if (!sameString(fs->observed, hs->observed))
+ errAbort("Diff observed!? %s != %s's %s", fs->observed, popCode, hs->observed);
+if (fs->overallMajorAllele != majorAl)
+ fs->isMixed = TRUE;
+fs->popCount++;
+if (sameString(popCode, "CEU") || sameString(popCode, "CHB") || sameString(popCode, "JPT") ||
+ sameString(popCode, "YRI"))
+ fs->phaseIIPopCount++;
+fs->foundInPop[popIndex] = TRUE;
+if (hs->heteroCount == 0 && (hs->homoCount1 == 0 || hs->homoCount2 == 0))
+ fs->monomorphicInPop[popIndex] = TRUE;
+sum->popMajorAlleles[popIndex] = majorAl;
+sum->popMinorAlleles[popIndex] = minorAl;
+float mAF = minorAlFreq(hs);
+if (mAF < fs->minFreq)
+ fs->minFreq = mAF;
+if (mAF > fs->maxFreq)
+ fs->maxFreq = mAF;
+}
+
+void addOrthoToSum(struct summary *sum, struct hapmapAllelesOrtho *ho, int orthoIndex)
+/* Fold in orthologous allele & base quality info into the accumulator sum. */
+{
+struct hapmapPhaseIIISummary *fs = sum->finalSum;
+fs->orthoAlleles[orthoIndex] = ho->orthoAllele[0];
+fs->orthoQuals[orthoIndex] = ho->score;
+}
+
+void dumpHapmapPhaseIIISummary()
+/* Read .bed files, accumulate info, and aggregate into hapmapPhaseIIISummary file. */
+{
+int i;
+char inFile[256];
+struct lineFile *lf = NULL;
+int wordCount;
+char *words[13];
+char key[128];
+struct summary *sum, *sumList = NULL;
+struct hash *hash = hashNew(24);
+
+for (i = 0; i < HAP_PHASEIII_POPCOUNT; i++)
+ {
+ struct hapmapSnps hs;
+ safef(inFile, sizeof(inFile), "hapmapSnps%s.bed", hapmapPhaseIIIPops[i]);
+ lf = lineFileOpen(inFile, TRUE);
+ while ((wordCount = lineFileChopTab(lf, words)) > 0)
+ {
+ lineFileExpectWords(lf, 12, wordCount);
+ hapmapSnpsStaticLoad(words, &hs);
+ // Key by chrom as well as name because the pseudoautosomal regions (PAR)
+ // of chrX and chrY have independent (but identical) SNP items.
+ safef(key, sizeof(key), "%s:%s", hs.chrom, hs.name);
+ sum = hashFindVal(hash, key);
+ if (sum == NULL)
+ {
+ sum = summaryNew(&hs, i);
+ hashAdd(hash, key, sum);
+ slAddHead(&sumList, sum);
+ }
+ else
+ addSnpToSum(sum, &hs, i);
+ }
+ lineFileClose(&lf);
+ }
+for (i = 0; i < HAP_ORTHO_COUNT; i++)
+ {
+ struct hapmapAllelesOrtho ho;
+ safef(inFile, sizeof(inFile), "hapmapAlleles%s.bed", hapmapOrthoSpecies[i]);
+ lf = lineFileOpen(inFile, TRUE);
+ while ((wordCount = lineFileChopTab(lf, words)) > 0)
+ {
+ lineFileExpectWords(lf, 13, wordCount);
+ hapmapAllelesOrthoStaticLoad(words, &ho);
+ safef(key, sizeof(key), "%s:%s", ho.chrom, ho.name);
+ sum = hashFindVal(hash, key);
+ if (sum == NULL)
+ errAbort("Ortho SNP '%s' doesn't match any HapMap SNPs!", ho.name);
+ addOrthoToSum(sum, &ho, i);
+ }
+ lineFileClose(&lf);
+ }
+slReverse(&sumList);
+// That leaves it mostly sorted, but not all! Leave final sorting up to hgLoadBed.
+FILE *f = mustOpen("hapmapPhaseIIISummary.bed", "w");
+for (sum = sumList; sum != NULL; sum = sum->next)
+ {
+ struct hapmapPhaseIIISummary *fs = sum->finalSum;
+ // Convert fs->score (heterozygosity * 1000) from total into average:
+ fs->score = (int)((float)fs->score / fs->popCount + 0.5);
+ // Determine whether the overall{Major,Minor}Alleles are indeed the same
+ // as the first population encountered:
+ char firstPopMajorAl = fs->overallMajorAllele;
+ char firstPopMinorAl = fs->overallMinorAllele;
+ int firstPopYea = 0, firstPopNay = 0;
+ for (i = 0; i < HAP_PHASEIII_POPCOUNT; i++)
+ {
+ if (fs->foundInPop[i])
+ {
+ if (sum->popMajorAlleles[i] == firstPopMajorAl)
+ firstPopYea++;
+ else
+ firstPopNay++;
+ }
+ }
+ if (firstPopNay > firstPopYea)
+ {
+ fs->overallMajorAllele = firstPopMinorAl;
+ fs->overallMinorAllele = firstPopMajorAl;
+ }
+ hapmapPhaseIIISummaryTabOut(fs, f);
+ }
+carefulClose(&f);
+// All done -- no need to waste time freeing hash and sumList.
+}
+
+int main(int argc, char *argv[])
+/* Make hapmapPhaseIIISummary.bed */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+ usage();
+dumpHapmapPhaseIIISummary();
+return 0;
+}
+