4eee2fe1d4677db4a6c43ab43fbff6ea72acd318
angie
  Wed Nov 6 15:03:00 2019 -0800
Need to consider all alleles together when trimming extra bases from freq alleles.  refs #23283

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index d37132b..9540b8e 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -615,72 +615,173 @@
     }
 }
 
 static struct spdiBed *trimSpdi(struct spdiBed *spdiB, struct lm *lm)
 /* Return a new spdiBed that is the minimal representation of spdiB. */
 {
 struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart,
                                          lmCloneString(lm, spdiB->del),
                                          lmCloneString(lm, spdiB->ins), lm);
 int refTrimLen = 0, altTrimLen = 0;
 trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd,
            &refTrimLen, &altTrimLen);
 return spdiBTrim;
 }
 
+static boolean spdiListTrimLeftBase(struct spdiBed *spdiList, struct lm *lm)
+/* If all alleles in spdiList begin with the same base at the same pos, then modify them
+ * all to start one base to the right and trim the left base from all alleles.
+ * errAbort if the don't all have the same pos and del. */
+{
+if (spdiList && spdiList->next)
+    {
+    boolean doTrim = TRUE;
+    char *del = spdiList->del;
+    char leftBase = del[0];
+    uint start = spdiList->chromStart;
+    struct spdiBed *spdi;
+    for (spdi = spdiList;  spdi != NULL;  spdi = spdi->next)
+        {
+        if (spdi->chromStart != start)
+            errAbort("spdiListTrimLeftBase: items have different starts (%s:%u, %s:%u)",
+                     spdiList->chrom, start, spdi->chrom, spdi->chromStart);
+        if (differentString(spdi->del, del))
+            errAbort("spdiListTrimLeftBase: items have different del (%s, %s)", del, spdi->del);
+        if (spdi->del[0] == 0 || spdi->ins[0] == 0 ||
+            spdi->ins[0] != leftBase)
+            {
+            doTrim = FALSE;
+            break;
+            }
+        }
+    if (doTrim)
+        {
+        for (spdi = spdiList;  spdi != NULL;  spdi = spdi->next)
+            {
+            spdi->chromStart++;
+            // Allocate new strings because the spdi->del pointer can be shared all over.
+            spdi->del = lmCloneString(lm, spdi->del+1);
+            spdi->ins = lmCloneString(lm, spdi->ins+1);
+            }
+        }
+    return doTrim;
+    }
+return FALSE;
+}
+
+static boolean spdiListTrimRightBase(struct spdiBed *spdiList, struct lm *lm)
+/* If all alleles in spdiList end with the same base, then trim the right base from all alleles.
+ * errAbort if not all at same pos. */
+{
+if (spdiList && spdiList->next)
+    {
+    boolean doTrim = TRUE;
+    char *del = spdiList->del;
+    char rightBase = lastChar(del);
+    uint start = spdiList->chromStart;
+    struct spdiBed *spdi;
+    for (spdi = spdiList;  spdi != NULL;  spdi = spdi->next)
+        {
+        if (spdi->chromStart != start)
+            errAbort("spdiListTrimRightBase: items have different starts (%s:%u, %s:%u)",
+                     spdiList->chrom, start, spdi->chrom, spdi->chromStart);
+        if (differentString(spdi->del, del))
+            errAbort("spdiListTrimRightBase: items have different del (%s, %s)", del, spdi->del);
+        if (spdi->del[0] == 0 || spdi->ins[0] == 0 ||
+            lastChar(spdi->ins) != rightBase)
+            {
+            doTrim = FALSE;
+            break;
+            }
+        }
+    if (doTrim)
+        {
+        for (spdi = spdiList;  spdi != NULL;  spdi = spdi->next)
+            {
+            spdi->chromEnd--;
+            // Modify clones because the spdi->del pointer can be shared all over.
+            spdi->del = lmCloneString(lm, spdi->del);
+            spdi->ins = lmCloneString(lm, spdi->ins);
+            trimLastChar(spdi->del);
+            trimLastChar(spdi->ins);
+            }
+        }
+    return doTrim;
+    }
+return FALSE;
+}
+
+static boolean trimVcfSpdiList(struct spdiBed *spdiList, struct alObs *obsList, struct lm *lm)
+/* SPDIs from VCF indel allele frequency submissions often still have the extra left base... and
+ * sometimes they have an extra right base.  Detect and trim those cases. */
+{
+boolean changed = FALSE;
+while (spdiListTrimLeftBase(spdiList, lm))
+    changed = TRUE;
+while (spdiListTrimRightBase(spdiList, lm))
+    changed = TRUE;
+if (changed)
+    {
+    struct spdiBed *spdi;
+    struct alObs *obs;
+    // Propagate changes to obsList
+    for (spdi = spdiList, obs = obsList;
+         spdi != NULL;
+         spdi = spdi->next, obs = obs->next)
+        {
+        obs->allele = spdi->ins;
+        }
+    }
+return changed;
+}
+
 static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom,
                               struct hash *ncToSeqWin, struct lm *lm)
 /* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do
  * not cover the entire repetitive range.  Expand variant alleles on the sequence on which they
  * were originally reported; if there is an indel difference between the reporting assembly and
  * the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt
  * may be different for different assemblies). */
 {
 int sIx;
 for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
     {
     struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx];
     struct alObs *obs, *obsList = props->freqSourceObs[sIx];
+    // Sanity check that spdiBList and obsList are consistent before we start tweaking:
     assert(slCount(spdiBList) == slCount(obsList));
-    boolean changed = FALSE;
+    for (spdiB = spdiBList, obs = obsList;
+         spdiB != NULL;
+         spdiB = spdiB->next, obs = obs->next)
+        {
+        assert(sameString(obs->allele, spdiB->ins));
+        }
+    boolean changed = trimVcfSpdiList(spdiBList, obsList, lm);
     // For each allele, see if it can be slid around.
     for (spdiB = spdiBList, obs = obsList;
          spdiB != NULL;
          spdiB = spdiB->next, obs = obs->next)
         {
         assert(sameString(obs->allele, spdiB->ins));
         // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly.
         if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) ||
             stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del))
             {
             // First trim ref and alt to their minimal version for indelShift to work with.
             // There may be nothing to trim because they're usually minimal already and
             // stringIn catches false positive like A/TAAC, and that's fine.
             struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm);
-            if (differentString(spdiB->del, spdiB->ins) &&
-                differentString(spdiB->del, spdiBTrim->del))
-                {
-                // Sometimes freq alleles are neither minimal nor shiftable.  If not minimal,
-                // make it minimal in case it was neither.  Don't do this to ref allele SPDI
-                // because it will always trim to ""/"".  It is handled in checkSpdisAfterExpansion.
-                changed = TRUE;
-                spdiB->chromStart = spdiBTrim->chromStart;
-                spdiB->chromEnd = spdiBTrim->chromEnd;
-                spdiB->del = spdiBTrim->del;
-                spdiB->ins = spdiBTrim->ins;
-                obs->allele = spdiB->ins;
-                }
             // If possible, expand the minimal representation to full SPDI range.
             struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom);
             if (seqWin == NULL)
                 errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom);
             changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm);
             }
         }
     if (changed)
         checkSpdisAfterExpansion(spdiBList, obsList);
     }
 }
 
 static double mafFromSource(struct sharedProps *props, int sourceIx,
                             char **retMajorAllele, char **retMinorAllele, struct lm *lm)
 /* Compare allele counts from source and return minor allele frequency (2nd-highest).