726a4c9c82592c662ae6b8c715e67b783c95beb3
angie
  Mon Nov 18 13:45:42 2019 -0800
Instead of dropping rs IDs that have incomplete frequency data, add two new ucscNotes: freqIncomplete and freqNotMapped.  refs #23283

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index e42d34c..cb46c1d 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -144,30 +144,32 @@
     struct alObs **freqSourceObs;     // Array of per-project observation lists
     char **freqSourceMajorAl;         // Array of per-project major allele
     char **freqSourceMinorAl;         // Array of per-project minor allele
     double *freqSourceMaf;            // Array of per-project minor allele frequencies
     int biggestSourceIx;              // index of freqSourceObs with highest totalCount
     boolean *freqIsRc;                // Per-project flag for sequence isRc relative to ptlp
     struct slInt *soTerms;            // Sequence Ontology functional effect terms
     struct slName *submitters;        // dbSNP submitter handles
     struct slInt *pubMedIds;          // PubMed article citations
     struct slName *clinVarAccs;       // Accessions of ClinVar annotations, i.e. ClinVar RCV*
     struct slName *clinVarSigs;       // Reported ClinVar significance
     enum bigDbSnpClass class;         // {snv, mnv, ins, del, delins, identity}
     enum soTerm maxFuncImpact;        // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic}
     int commonCount;                  // Number of projects reporting minor allele freq >= 1%
     int rareCount;                    // Number of projects reporting minor allele freq < 1%
+    boolean freqIncomplete;           // At least one project provided no alt allele (only del==ins)
+    boolean freqNotMapped;            // At least one project's report was not mapped to own asmbly
     };
 
 struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm)
 /* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm.
  * Checking validity of alleles is up to caller, but this does check whether del is a number
  * instead of sequence. */
 {
 struct spdiBed *spdi;
 lmAllocVar(lm, spdi);
 spdi->chrom = seq;
 spdi->chromStart = pos;
 spdi->del = del;
 spdi->ins = ins;
 if (isAllDigits(del))
     spdi->chromEnd = pos + atoi(del);
@@ -412,62 +414,78 @@
         lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount);
         lmAllocArray(lm, props->freqSourceObs, freqSourceCount);
         lmAllocArray(lm, props->freqIsRc, freqSourceCount);
         }
     else
         {
         props->freqSourceCount = 0;
         int allocSize = slCount(obsList);
         lmAllocArray(lm, props->freqSourceSpdis, allocSize);
         lmAllocArray(lm, props->freqSourceObs, allocSize);
         lmAllocArray(lm, props->freqIsRc, allocSize);
         }
     struct slName *unorderedSourceList = NULL;
     int maxSampleCount = 0;
     int biggestSourceIx = 0;
+    int validCount = 0;
     struct slRef *obsRef;
     for (obsRef = obsList;  obsRef != NULL;  obsRef = obsRef->next)
         {
         struct jsonElement *obsEl = obsRef->val;
         char *source = NULL;
         struct spdiBed *spdiB = NULL;
         struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm);
         int ix = freqSourceToIx(source, &unorderedSourceList, lm);
+        props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1);
+        if (props->freqIsRc[ix] >= 0)
+            {
             slAddHead(&props->freqSourceSpdis[ix], spdiB);
             slAddHead(&props->freqSourceObs[ix], obs);
-        props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1);
-        if (props->freqIsRc[ix] < 0)
-            errAbort("parseFrequencies: Unknown orientation for %s seq_id %s",
-                     rsId, spdiB->chrom);
             if (obs->totalCount > maxSampleCount)
                 {
                 maxSampleCount = obs->totalCount;
                 biggestSourceIx = ix;
                 }
+            validCount++;
             }
+        else
+            props->freqNotMapped = TRUE;
+        }
+    if (props->freqNotMapped)
+        warn("Frequency report not mapped to own assembly for %s", props->name);
+    if (validCount > 0)
+        {
         if (!freqSourceOrder)
             props->freqSourceCount = slCount(unorderedSourceList);
         int ix;
         for (ix = 0;  ix < props->freqSourceCount; ix++)
             {
             slReverse(&props->freqSourceSpdis[ix]);
             slReverse(&props->freqSourceObs[ix]);
             }
         addMissingRefAllele(props, rsId, lm);
         checkFreqSourceObs(props, rsId);
         props->biggestSourceIx = biggestSourceIx;
         }
+    else
+        {
+        props->freqSourceCount = 0;
+        props->freqSourceSpdis = NULL;
+        props->freqSourceObs = NULL;
+        props->freqIsRc = NULL;
+        }
+    }
 }
 
 static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm)
 /* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional
  * impact, discard duplicates and return as slInt list with highest impact first. */
 {
 struct slInt *intList = NULL;
 if (soTermNames)
     {
     int termCount = slCount(soTermNames);
     enum soTerm termArray[termCount];
     struct slName *term;
     int i;
     for (i = 0, term = soTermNames;  term != NULL;  i++, term = term->next)
         termArray[i] = soTermStringIdToId(term->name);
@@ -807,33 +825,44 @@
         minorAllele = majorAllele;
         minorAlleleCount = majorAlleleCount;
         majorAlleleCount = obs->obsCount;
         majorAllele = obs->allele;
         }
     else if (obs->obsCount > minorAlleleCount)
         {
         minorAllele = obs->allele;
         minorAlleleCount = obs->obsCount;
         }
     totalCount = obs->totalCount;
     }
 if (totalCount > 0)
     {
     if (majorAlleleCount < 0)
-        errAbort("no major allele observed despite nonzero totalCount...?");
+        errAbort("no allele observed despite nonzero totalCount...?");
     if (minorAlleleCount < 0)
-        errAbort("no minor allele observed... what should the minorAllele value be??");
+        {
+        // Allele and count are provided for only one allele -- inferring the other allele
+        // would be dicey, so just add a note and erase the freq data from this source and
+        // return NaN.
+        props->freqIncomplete = TRUE;
+        props->freqSourceSpdis[sourceIx] = NULL;
+        props->freqSourceObs[sourceIx] = NULL;
+        *retMajorAllele = NULL;
+        *retMinorAllele = NULL;
+        warn("Incomplete freq data from source %d for %s", sourceIx, props->name);
+        return NAN;
+        }
     }
 *retMajorAllele = lmCloneString(lm, majorAllele);
 *retMinorAllele = lmCloneString(lm, minorAllele);
 // If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test)
 return (double)minorAlleleCount / totalCount;
 }
 
 static void sortFrequencies(struct sharedProps *props, struct lm *lm)
 /* Determine major/minor allele and per-project MAF (if given).
  * If -freqOrder option was given, then fill in props->freqSource* arrays in that order.
  * Otherwise, choose the MAF from the project with the largest sample count as a fallback. */
 {
 if (props->freqSourceCount > 0)
     {
     if (freqSourceOrder != NULL)
@@ -862,30 +891,48 @@
         props->freqSourceCount = 1;
         lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount);
         lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount);
         lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount);
         props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx,
                                                 &props->freqSourceMajorAl[0],
                                                 &props->freqSourceMinorAl[0], lm);
         if (isfinite(props->freqSourceMaf[0]))
             {
             if (props->freqSourceMaf[0] >= 0.01)
                 props->commonCount++;
             else
                 props->rareCount++;
             }
         }
+    if (props->freqIncomplete)
+        {
+        // At least one freq project was erased due to incomplete data.  If it was the only
+        // project with any data, set props->freqSourceCount to 0 (no freq data).
+        int sIx;
+        boolean gotFreq = FALSE;
+        for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
+            {
+            if (isfinite(props->freqSourceMaf[sIx]))
+                gotFreq = TRUE;
+            }
+        if (!gotFreq)
+            {
+            props->freqSourceCount = 0;
+            props->freqSourceMajorAl = NULL;
+            props->freqSourceMinorAl = NULL;
+            }
+        }
     }
 }
 
 static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm)
 /* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */
 {
 struct sharedProps *props = NULL;
 lmAllocVar(lm, props);
 char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm);
 char rsName[strlen(rsNumber)+3];
 safef(rsName, sizeof rsName, "rs%s", rsNumber);
 props->name = lmCloneString(lm, rsName);
 char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm);
 props->class = varTypeToClass(varType);
 struct slRef *annotationsRef = jsonQueryElement(top, "top",
@@ -1198,30 +1245,34 @@
     dyStringAppend(dyUcscNotes, bdsClinvar ",");
 addClinVarSigs(dyUcscNotes, props);
 if (props->commonCount > 0)
     {
     dyStringAppend(dyUcscNotes, bdsCommonSome ",");
     if (props->rareCount == 0)
         dyStringAppend(dyUcscNotes, bdsCommonAll ",");
     else
         dyStringAppend(dyUcscNotes, bdsRareSome ",");
     }
 else if (props->rareCount > 0 || props->freqSourceCount == 0)
     {
     dyStringAppend(dyUcscNotes, bdsRareSome ",");
     dyStringAppend(dyUcscNotes, bdsRareAll ",");
     }
+if (props->freqIncomplete)
+    dyStringAppend(dyUcscNotes, bdsFreqIncomplete ",");
+if (props->freqNotMapped)
+    dyStringAppend(dyUcscNotes, bdsFreqNotMapped ",");
 if (isRc)
     dyStringAppend(dyUcscNotes, bdsRevStrand ",");
 if (isMultiMapper)
     dyStringAppend(dyUcscNotes, bdsMultiMap ",");
 checkRareRef(props, bds->ref, isRc, dyUcscNotes);
 if (delMismatchesGenome(bds, seqWin))
     dyStringAppend(dyUcscNotes, bdsRefMismatch ",");
 if (anyIupac(bds->ref))
     dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ",");
 int i;
 for (i = 0;  i < bds->altCount;  i++)
     if (anyIupac(bds->alts[i]))
         dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ",");
 if (hasAmbiguousFreqAllele(props))
     dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ",");