4898794edd81be5285ea6e544acbedeaeb31bf78
max
  Tue Nov 23 08:10:57 2021 -0800
Fixing pointers to README file for license in all source code files. refs #27614

diff --git src/hg/snp/snpLoad/hapmapPhaseIIISummary.c src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
index 847501f..e97e0f8 100644
--- src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
+++ src/hg/snp/snpLoad/hapmapPhaseIIISummary.c
@@ -1,255 +1,255 @@
 /* hapmapPhaseIIISummary -- read in .bed files for subtracks of
  * hapmapSnps (rel27: phaseII+III merged data, also hapmapAlleles{Chimp,Macaque)
  * and write out .bed for hapmapPhaseIIISummary, which hgTracks uses for
  * filtering items in all of those subtracks. */
 
 /* Copyright (C) 2011 The Regents of the University of California 
- * See README in this or parent directory for licensing information. */
+ * See kent/LICENSE or http://genome.ucsc.edu/license/ for licensing information. */
 
 /* This is inflexible about filenames (very rel27-specific).  If we
  * ever need flexibility, add args and options... */
 
 /* Some properties of rel27 genotype data (hapmapSnps*):
  * - observed does not come from dbSNP.  It is assayed allele1/allele2.
  *   That means that as far as the genotype files are concerned, there 
  *   are no complex SNPs -- all are bi-allelic.
  * - hapmapSnps strand is always +.
  * - In filtering, there is no need for "none" as minor allele -- we use 
  *   the monomorphicInPop flags instead. */
 
 #include "common.h"
 #include "options.h"
 #include "hash.h"
 #include "linefile.h"
 #include "hapmapSnps.h"
 #include "hapmapAllelesOrtho.h"
 #include "hapmapPhaseIIISummary.h"
 
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
 "hapmapPhaseIIISummary - Make hapmapPhaseIIISummary.bed from hapmap*.bed.\n"
 "usage:\n"
 "   hapmapPhaseIIISummary .\n"
 /* could add options to provide support for different dbSNP versions.
 "options:\n"
 "   -xxx=XXX\n"
 */
 "\n"
 ". is a placeholder arg to suppress this message.  The current directory\n"
 "must have table-loading .bed files for SNPS from the 11 HapMap phaseIII\n"
 "populations, as well as orthologous alleles for chimp and macaque:\n"
 "    hapmapSnps{ASW,CEU,CHB,CHG,GIH,JPT,LWK,MEX,MKK,TSI,YRI}.bed\n"
 "    hapmapAlleles{Chimp,Macaque}.bed\n"
 "The data from those files is distilled into a form useful for filtering\n"
 "based on aggregate characteristics in hgTracks, and dumped out to this\n"
 "output file:\n"
 "    hapmapPhaseIIISummary.bed\n"
 "\n");
 }
 
 static struct optionSpec options[] = {
    {NULL, 0},
 };
 
 struct summary
 /* Accumulator for info from each subtrack of HapMap SNPs */
 {
     struct summary *next;
     struct hapmapPhaseIIISummary *finalSum;
     char popMajorAlleles[11];
     char popMinorAlleles[11];
 };
 
 float minorAlFreq(struct hapmapSnps *hs)
 /* Compute the minor allele frequency of hs. */
 {
 float mAF = ((2.0*hs->homoCount1 + hs->heteroCount) /
 	     (2.0*(hs->homoCount1 + hs->homoCount2 + hs->heteroCount)));
 if (mAF > 0.5)
     mAF = (1.0 - mAF);
 return mAF;
 }
 
 #define majorAllele(hs) ((hs->homoCount1 < hs->homoCount2) ? hs->allele2[0] : hs->allele1[0])
 #define minorAllele(hs) ((hs->homoCount1 < hs->homoCount2) ? hs->allele1[0] : hs->allele2[0])
 #define heterozygosity(hs) ((int)(1000.0 * hs->heteroCount / \
                                   (hs->homoCount1 + hs->homoCount2 + hs->heteroCount) + 0.5))
 
 struct summary *summaryNew(struct hapmapSnps *hs, int popIndex)
 /* Return a summary to which hs's info has been added. */
 {
 struct summary *sum;
 AllocVar(sum);
 char *popCode = hapmapPhaseIIIPops[popIndex];
 char majorAl = majorAllele(hs);
 char minorAl = minorAllele(hs);
 AllocVar(sum->finalSum);
 struct hapmapPhaseIIISummary *fs = sum->finalSum;
 fs->chrom = cloneString(hs->chrom);
 fs->chromStart = hs->chromStart;
 fs->chromEnd = hs->chromEnd;
 fs->name = cloneString(hs->name);
 fs->score = heterozygosity(hs);
 fs->observed = cloneString(hs->observed);;
 fs->overallMajorAllele = majorAl;
 fs->overallMinorAllele = minorAl;
 fs->popCount = 1;
 if (sameString(popCode, "CEU") || sameString(popCode, "CHB") || sameString(popCode, "JPT") ||
     sameString(popCode, "YRI"))
     fs->phaseIIPopCount = 1;
 fs->foundInPop[popIndex] = TRUE;
 if (hs->heteroCount == 0 && (hs->homoCount1 == 0 || hs->homoCount2 == 0))
     fs->monomorphicInPop[popIndex] = TRUE;
 fs->orthoCount = HAP_ORTHO_COUNT;
 fs->orthoAlleles = cloneString("NN");
 fs->orthoQuals = needMem(2 * sizeof(unsigned short));
 fs->minFreq = fs->maxFreq = minorAlFreq(hs);
 
 sum->popMajorAlleles[popIndex] = majorAl;
 sum->popMinorAlleles[popIndex] = minorAl;
 return sum;
 }
 
 void addSnpToSum(struct summary *sum, struct hapmapSnps *hs, int popIndex)
 /* Fold in hs's info into the accumulator sum. */
 {
 char *popCode = hapmapPhaseIIIPops[popIndex];
 char majorAl = majorAllele(hs);
 char minorAl = minorAllele(hs);
 struct hapmapPhaseIIISummary *fs = sum->finalSum;
 if ((!sameString(fs->chrom, hs->chrom) &&
      !sameString(fs->chrom, "chrX") && !sameString(hs->chrom, "chrY")) || // PAR OK
     fs->chromStart != hs->chromStart ||
     fs->chromEnd != hs->chromEnd || !sameString(fs->name, hs->name))
     errAbort("Incompat summary and hs?! %s|%d|%d|%s != %s|%d|%d|%s",
 	     fs->chrom, fs->chromStart, fs->chromEnd, fs->name,
 	     hs->chrom, hs->chromStart, hs->chromEnd, hs->name);
 fs->score += heterozygosity(hs);
 if (!sameString(fs->observed, hs->observed))
     errAbort("Diff observed!?  %s != %s's %s", fs->observed, popCode, hs->observed);
 if (fs->overallMajorAllele != majorAl)
     fs->isMixed = TRUE;
 fs->popCount++;
 if (sameString(popCode, "CEU") || sameString(popCode, "CHB") || sameString(popCode, "JPT") ||
     sameString(popCode, "YRI"))
     fs->phaseIIPopCount++;
 fs->foundInPop[popIndex] = TRUE;
 if (hs->heteroCount == 0 && (hs->homoCount1 == 0 || hs->homoCount2 == 0))
     fs->monomorphicInPop[popIndex] = TRUE;
 sum->popMajorAlleles[popIndex] = majorAl;
 sum->popMinorAlleles[popIndex] = minorAl;
 float mAF = minorAlFreq(hs);
 if (mAF < fs->minFreq)
     fs->minFreq = mAF;
 if (mAF > fs->maxFreq)
     fs->maxFreq = mAF;
 }
 
 void addOrthoToSum(struct summary *sum, struct hapmapAllelesOrtho *ho, int orthoIndex)
 /* Fold in orthologous allele & base quality info into the accumulator sum. */
 {
 struct hapmapPhaseIIISummary *fs = sum->finalSum;
 fs->orthoAlleles[orthoIndex] = ho->orthoAllele[0];
 fs->orthoQuals[orthoIndex] = ho->score;
 }
 
 void dumpHapmapPhaseIIISummary()
 /* Read .bed files, accumulate info, and aggregate into hapmapPhaseIIISummary file. */
 {
 int i;
 char inFile[256];
 struct lineFile *lf = NULL;
 int wordCount;
 char *words[13];
 char key[128];
 struct summary *sum, *sumList = NULL;
 struct hash *hash = hashNew(24);
 
 for (i = 0;  i < HAP_PHASEIII_POPCOUNT;  i++)
     {
     struct hapmapSnps hs;
     safef(inFile, sizeof(inFile), "hapmapSnps%s.bed", hapmapPhaseIIIPops[i]);
     lf = lineFileOpen(inFile, TRUE);
     while ((wordCount = lineFileChopTab(lf, words)) > 0)
 	{
 	lineFileExpectWords(lf, 12, wordCount);
 	hapmapSnpsStaticLoad(words, &hs);
 	// Key by chrom as well as name because the pseudoautosomal regions (PAR)
 	// of chrX and chrY have independent (but identical) SNP items.
 	safef(key, sizeof(key), "%s:%s", hs.chrom, hs.name);
 	sum = hashFindVal(hash, key);
 	if (sum == NULL)
 	    {
 	    sum = summaryNew(&hs, i);
 	    hashAdd(hash, key, sum);
 	    slAddHead(&sumList, sum);
 	    }
 	else
 	    addSnpToSum(sum, &hs, i);
 	}
     lineFileClose(&lf);
     }
 for (i = 0;  i < HAP_ORTHO_COUNT;  i++)
     {
     struct hapmapAllelesOrtho ho;
     safef(inFile, sizeof(inFile), "hapmapAlleles%s.bed", hapmapOrthoSpecies[i]);
     lf = lineFileOpen(inFile, TRUE);
     while ((wordCount = lineFileChopTab(lf, words)) > 0)
 	{
 	lineFileExpectWords(lf, 13, wordCount);
 	hapmapAllelesOrthoStaticLoad(words, &ho);
 	safef(key, sizeof(key), "%s:%s", ho.chrom, ho.name);
 	sum = hashFindVal(hash, key);
 	if (sum == NULL)
 	    errAbort("Ortho SNP '%s' doesn't match any HapMap SNPs!", ho.name);
 	addOrthoToSum(sum, &ho, i);
 	}
     lineFileClose(&lf);
     }
 slReverse(&sumList);
 // That leaves it mostly sorted, but not all!  Leave final sorting up to hgLoadBed.
 FILE *f = mustOpen("hapmapPhaseIIISummary.bed", "w");
 for (sum = sumList;  sum != NULL;  sum = sum->next)
     {
     struct hapmapPhaseIIISummary *fs = sum->finalSum;
     // Convert fs->score (heterozygosity * 1000) from total into average:
     fs->score = (int)((float)fs->score / fs->popCount + 0.5);
     // Determine whether the overall{Major,Minor}Alleles are indeed the same
     // as the first population encountered:
     char firstPopMajorAl = fs->overallMajorAllele;
     char firstPopMinorAl = fs->overallMinorAllele;
     int firstPopYea = 0, firstPopNay = 0;
     for (i = 0;  i < HAP_PHASEIII_POPCOUNT;  i++)
 	{
 	if (fs->foundInPop[i])
 	    {
 	    if (sum->popMajorAlleles[i] == firstPopMajorAl)
 		firstPopYea++;
 	    else
 		firstPopNay++;
 	    }
 	}
     if (firstPopNay > firstPopYea)
 	{
 	fs->overallMajorAllele = firstPopMinorAl;
 	fs->overallMinorAllele = firstPopMajorAl;
 	}
     hapmapPhaseIIISummaryTabOut(fs, f);
     }
 carefulClose(&f);
 // All done -- no need to waste time freeing hash and sumList.
 }
 
 int main(int argc, char *argv[])
 /* Make hapmapPhaseIIISummary.bed */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 dumpHapmapPhaseIIISummary();
 return 0;
 }