4eee2fe1d4677db4a6c43ab43fbff6ea72acd318 angie Wed Nov 6 15:03:00 2019 -0800 Need to consider all alleles together when trimming extra bases from freq alleles. refs #23283 diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c index d37132b..9540b8e 100644 --- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c +++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c @@ -615,72 +615,173 @@ } } static struct spdiBed *trimSpdi(struct spdiBed *spdiB, struct lm *lm) /* Return a new spdiBed that is the minimal representation of spdiB. */ { struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart, lmCloneString(lm, spdiB->del), lmCloneString(lm, spdiB->ins), lm); int refTrimLen = 0, altTrimLen = 0; trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd, &refTrimLen, &altTrimLen); return spdiBTrim; } +static boolean spdiListTrimLeftBase(struct spdiBed *spdiList, struct lm *lm) +/* If all alleles in spdiList begin with the same base at the same pos, then modify them + * all to start one base to the right and trim the left base from all alleles. + * errAbort if the don't all have the same pos and del. */ +{ +if (spdiList && spdiList->next) + { + boolean doTrim = TRUE; + char *del = spdiList->del; + char leftBase = del[0]; + uint start = spdiList->chromStart; + struct spdiBed *spdi; + for (spdi = spdiList; spdi != NULL; spdi = spdi->next) + { + if (spdi->chromStart != start) + errAbort("spdiListTrimLeftBase: items have different starts (%s:%u, %s:%u)", + spdiList->chrom, start, spdi->chrom, spdi->chromStart); + if (differentString(spdi->del, del)) + errAbort("spdiListTrimLeftBase: items have different del (%s, %s)", del, spdi->del); + if (spdi->del[0] == 0 || spdi->ins[0] == 0 || + spdi->ins[0] != leftBase) + { + doTrim = FALSE; + break; + } + } + if (doTrim) + { + for (spdi = spdiList; spdi != NULL; spdi = spdi->next) + { + spdi->chromStart++; + // Allocate new strings because the spdi->del pointer can be shared all over. + spdi->del = lmCloneString(lm, spdi->del+1); + spdi->ins = lmCloneString(lm, spdi->ins+1); + } + } + return doTrim; + } +return FALSE; +} + +static boolean spdiListTrimRightBase(struct spdiBed *spdiList, struct lm *lm) +/* If all alleles in spdiList end with the same base, then trim the right base from all alleles. + * errAbort if not all at same pos. */ +{ +if (spdiList && spdiList->next) + { + boolean doTrim = TRUE; + char *del = spdiList->del; + char rightBase = lastChar(del); + uint start = spdiList->chromStart; + struct spdiBed *spdi; + for (spdi = spdiList; spdi != NULL; spdi = spdi->next) + { + if (spdi->chromStart != start) + errAbort("spdiListTrimRightBase: items have different starts (%s:%u, %s:%u)", + spdiList->chrom, start, spdi->chrom, spdi->chromStart); + if (differentString(spdi->del, del)) + errAbort("spdiListTrimRightBase: items have different del (%s, %s)", del, spdi->del); + if (spdi->del[0] == 0 || spdi->ins[0] == 0 || + lastChar(spdi->ins) != rightBase) + { + doTrim = FALSE; + break; + } + } + if (doTrim) + { + for (spdi = spdiList; spdi != NULL; spdi = spdi->next) + { + spdi->chromEnd--; + // Modify clones because the spdi->del pointer can be shared all over. + spdi->del = lmCloneString(lm, spdi->del); + spdi->ins = lmCloneString(lm, spdi->ins); + trimLastChar(spdi->del); + trimLastChar(spdi->ins); + } + } + return doTrim; + } +return FALSE; +} + +static boolean trimVcfSpdiList(struct spdiBed *spdiList, struct alObs *obsList, struct lm *lm) +/* SPDIs from VCF indel allele frequency submissions often still have the extra left base... and + * sometimes they have an extra right base. Detect and trim those cases. */ +{ +boolean changed = FALSE; +while (spdiListTrimLeftBase(spdiList, lm)) + changed = TRUE; +while (spdiListTrimRightBase(spdiList, lm)) + changed = TRUE; +if (changed) + { + struct spdiBed *spdi; + struct alObs *obs; + // Propagate changes to obsList + for (spdi = spdiList, obs = obsList; + spdi != NULL; + spdi = spdi->next, obs = obs->next) + { + obs->allele = spdi->ins; + } + } +return changed; +} + static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom, struct hash *ncToSeqWin, struct lm *lm) /* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do * not cover the entire repetitive range. Expand variant alleles on the sequence on which they * were originally reported; if there is an indel difference between the reporting assembly and * the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt * may be different for different assemblies). */ { int sIx; for (sIx = 0; sIx < props->freqSourceCount; sIx++) { struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx]; struct alObs *obs, *obsList = props->freqSourceObs[sIx]; + // Sanity check that spdiBList and obsList are consistent before we start tweaking: assert(slCount(spdiBList) == slCount(obsList)); - boolean changed = FALSE; + for (spdiB = spdiBList, obs = obsList; + spdiB != NULL; + spdiB = spdiB->next, obs = obs->next) + { + assert(sameString(obs->allele, spdiB->ins)); + } + boolean changed = trimVcfSpdiList(spdiBList, obsList, lm); // For each allele, see if it can be slid around. for (spdiB = spdiBList, obs = obsList; spdiB != NULL; spdiB = spdiB->next, obs = obs->next) { assert(sameString(obs->allele, spdiB->ins)); // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly. if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) || stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del)) { // First trim ref and alt to their minimal version for indelShift to work with. // There may be nothing to trim because they're usually minimal already and // stringIn catches false positive like A/TAAC, and that's fine. struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm); - if (differentString(spdiB->del, spdiB->ins) && - differentString(spdiB->del, spdiBTrim->del)) - { - // Sometimes freq alleles are neither minimal nor shiftable. If not minimal, - // make it minimal in case it was neither. Don't do this to ref allele SPDI - // because it will always trim to ""/"". It is handled in checkSpdisAfterExpansion. - changed = TRUE; - spdiB->chromStart = spdiBTrim->chromStart; - spdiB->chromEnd = spdiBTrim->chromEnd; - spdiB->del = spdiBTrim->del; - spdiB->ins = spdiBTrim->ins; - obs->allele = spdiB->ins; - } // If possible, expand the minimal representation to full SPDI range. struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom); if (seqWin == NULL) errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom); changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm); } } if (changed) checkSpdisAfterExpansion(spdiBList, obsList); } } static double mafFromSource(struct sharedProps *props, int sourceIx, char **retMajorAllele, char **retMinorAllele, struct lm *lm) /* Compare allele counts from source and return minor allele frequency (2nd-highest).