aba80d9c6d410fb7d9caf9a4b58fe1aefe2fe6bb angie Tue Nov 5 14:41:46 2019 -0800 If alleles from VCF have not been minimalized but can't be expanded either, use the minimal version to get rid of extra left base. refs #23283 diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c index c10f156..d37132b 100644 --- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c +++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c @@ -603,31 +603,31 @@ spdiBRef->ins = spdiBRef->del; obsRef->allele = spdiBRef->ins; alreadyExpandedRefAllele = TRUE; } else errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s", spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd, spdiBList->del, spdiBList->ins, spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, spdiB->del, spdiB->ins); } } } } -static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm) +static struct spdiBed *trimSpdi(struct spdiBed *spdiB, struct lm *lm) /* Return a new spdiBed that is the minimal representation of spdiB. */ { struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart, lmCloneString(lm, spdiB->del), lmCloneString(lm, spdiB->ins), lm); int refTrimLen = 0, altTrimLen = 0; trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd, &refTrimLen, &altTrimLen); return spdiBTrim; } static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom, struct hash *ncToSeqWin, struct lm *lm) /* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do * not cover the entire repetitive range. Expand variant alleles on the sequence on which they @@ -643,31 +643,44 @@ assert(slCount(spdiBList) == slCount(obsList)); boolean changed = FALSE; // For each allele, see if it can be slid around. for (spdiB = spdiBList, obs = obsList; spdiB != NULL; spdiB = spdiB->next, obs = obs->next) { assert(sameString(obs->allele, spdiB->ins)); // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly. if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) || stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del)) { // First trim ref and alt to their minimal version for indelShift to work with. // There may be nothing to trim because they're usually minimal already and // stringIn catches false positive like A/TAAC, and that's fine. - struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm); + struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm); + if (differentString(spdiB->del, spdiB->ins) && + differentString(spdiB->del, spdiBTrim->del)) + { + // Sometimes freq alleles are neither minimal nor shiftable. If not minimal, + // make it minimal in case it was neither. Don't do this to ref allele SPDI + // because it will always trim to ""/"". It is handled in checkSpdisAfterExpansion. + changed = TRUE; + spdiB->chromStart = spdiBTrim->chromStart; + spdiB->chromEnd = spdiBTrim->chromEnd; + spdiB->del = spdiBTrim->del; + spdiB->ins = spdiBTrim->ins; + obs->allele = spdiB->ins; + } // If possible, expand the minimal representation to full SPDI range. struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom); if (seqWin == NULL) errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom); changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm); } } if (changed) checkSpdisAfterExpansion(spdiBList, obsList); } } static double mafFromSource(struct sharedProps *props, int sourceIx, char **retMajorAllele, char **retMinorAllele, struct lm *lm) /* Compare allele counts from source and return minor allele frequency (2nd-highest). @@ -934,31 +947,31 @@ struct lm *lm) /* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that * bds's SPDI-based coords cover the whole repetitive region as expected. */ { int i; for (i = 0; i < bds->altCount; i++) { char *alt = bds->alts[i]; if (isEmpty(bds->ref) || isEmpty(alt) || stringIn(bds->ref, alt) || stringIn(alt, bds->ref)) { // First trim ref and alt to their minimal version for indelShift to work with. // There may be nothing to trim because they're usually minimal already and // stringIn catches false positive like A/TAAC, and that's fine. struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm); - struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm); + struct spdiBed *spdiBTrim = trimSpdi(spdiB, lm); maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm); if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd) errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') " "could be expanded to %s|%d|%d\n", bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt, bds->chrom, spdiB->chromStart, spdiB->chromEnd); } } } static void addClinVarSigs(struct dyString *dyUcscNotes, struct sharedProps *props) /* If clinVarSigs indicate benign, pathogenic, or both (conflicting), add ucscNote. */ { boolean isBenign = FALSE, isPathogenic = FALSE;