aba80d9c6d410fb7d9caf9a4b58fe1aefe2fe6bb
angie
  Tue Nov 5 14:41:46 2019 -0800
If alleles from VCF have not been minimalized but can't be expanded either, use the minimal version to get rid of extra left base.  refs #23283

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index c10f156..d37132b 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -603,31 +603,31 @@
                 spdiBRef->ins = spdiBRef->del;
                 obsRef->allele = spdiBRef->ins;
                 alreadyExpandedRefAllele = TRUE;
                 }
             else
                 errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s",
                          spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd,
                          spdiBList->del, spdiBList->ins,
                          spdiB->chrom, spdiB->chromStart, spdiB->chromEnd,
                          spdiB->del, spdiB->ins);
             }
         }
     }
 }
 
-static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm)
+static struct spdiBed *trimSpdi(struct spdiBed *spdiB, struct lm *lm)
 /* Return a new spdiBed that is the minimal representation of spdiB. */
 {
 struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart,
                                          lmCloneString(lm, spdiB->del),
                                          lmCloneString(lm, spdiB->ins), lm);
 int refTrimLen = 0, altTrimLen = 0;
 trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd,
            &refTrimLen, &altTrimLen);
 return spdiBTrim;
 }
 
 static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom,
                               struct hash *ncToSeqWin, struct lm *lm)
 /* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do
  * not cover the entire repetitive range.  Expand variant alleles on the sequence on which they
@@ -643,31 +643,44 @@
     assert(slCount(spdiBList) == slCount(obsList));
     boolean changed = FALSE;
     // For each allele, see if it can be slid around.
     for (spdiB = spdiBList, obs = obsList;
          spdiB != NULL;
          spdiB = spdiB->next, obs = obs->next)
         {
         assert(sameString(obs->allele, spdiB->ins));
         // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly.
         if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) ||
             stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del))
             {
             // First trim ref and alt to their minimal version for indelShift to work with.
             // There may be nothing to trim because they're usually minimal already and
             // stringIn catches false positive like A/TAAC, and that's fine.
-            struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
+            struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm);
+            if (differentString(spdiB->del, spdiB->ins) &&
+                differentString(spdiB->del, spdiBTrim->del))
+                {
+                // Sometimes freq alleles are neither minimal nor shiftable.  If not minimal,
+                // make it minimal in case it was neither.  Don't do this to ref allele SPDI
+                // because it will always trim to ""/"".  It is handled in checkSpdisAfterExpansion.
+                changed = TRUE;
+                spdiB->chromStart = spdiBTrim->chromStart;
+                spdiB->chromEnd = spdiBTrim->chromEnd;
+                spdiB->del = spdiBTrim->del;
+                spdiB->ins = spdiBTrim->ins;
+                obs->allele = spdiB->ins;
+                }
             // If possible, expand the minimal representation to full SPDI range.
             struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom);
             if (seqWin == NULL)
                 errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom);
             changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm);
             }
         }
     if (changed)
         checkSpdisAfterExpansion(spdiBList, obsList);
     }
 }
 
 static double mafFromSource(struct sharedProps *props, int sourceIx,
                             char **retMajorAllele, char **retMinorAllele, struct lm *lm)
 /* Compare allele counts from source and return minor allele frequency (2nd-highest).
@@ -934,31 +947,31 @@
                                 struct lm *lm)
 /* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that
  * bds's SPDI-based coords cover the whole repetitive region as expected. */
 {
 int i;
 for (i = 0;  i < bds->altCount;  i++)
     {
     char *alt = bds->alts[i];
     if (isEmpty(bds->ref) || isEmpty(alt) ||
         stringIn(bds->ref, alt) || stringIn(alt, bds->ref))
         {
         // First trim ref and alt to their minimal version for indelShift to work with.
         // There may be nothing to trim because they're usually minimal already and
         // stringIn catches false positive like A/TAAC, and that's fine.
         struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm);
-        struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
+        struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm);
 
         maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm);
         if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd)
             errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') "
                      "could be expanded to %s|%d|%d\n",
                      bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt,
                      bds->chrom, spdiB->chromStart, spdiB->chromEnd);
         }
     }
 }
 
 static void addClinVarSigs(struct dyString *dyUcscNotes, struct sharedProps *props)
 /* If clinVarSigs indicate benign, pathogenic, or both (conflicting), add ucscNote. */
 {
 boolean isBenign = FALSE, isPathogenic = FALSE;