26d6cd9597d40b55635b1fbacc8dd64978887474 angie Mon Jun 17 10:51:53 2013 -0700 As Jonathan pointed out in note 46, Ensembl VEP always shows a/the alternate allele,even when the allele isn't used to make the functional prediction -- follow suit. refs #6152 diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index a75cd3c..ed25514 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -268,41 +268,42 @@ int exonNum, int cDnaPosition, struct lm *lm) /* check for effects in UTR of coding gene -- caller ensures it's in exon, pred is coding * and exonNum has been strand-adjusted */ { struct gpFx *gpFx = NULL; enum soTerm term = 0; struct variant *variant = allele->variant; if (variant->chromStart < pred->cdsStart && variant->chromEnd > pred->txStart) // we're in left UTR term = (*pred->strand == '-') ? _3_prime_UTR_variant : _5_prime_UTR_variant; else if (variant->chromStart < pred->txEnd && variant->chromEnd > pred->cdsEnd) // we're in right UTR term = (*pred->strand == '-') ? _5_prime_UTR_variant : _3_prime_UTR_variant; if (term != 0) { - gpFx = gpFxNew("", pred->name, term, nonCodingExon, lm); + gpFx = gpFxNew(allele->sequence, pred->name, term, nonCodingExon, lm); setNCExonVals(gpFx, exonNum, cDnaPosition); } return gpFx; } struct gpFx *gpFxChangedNoncodingExon(struct allele *allele, struct genePred *pred, int exonNum, int cDnaPosition, struct lm *lm) /* generate an effect for a variant in a non-coding transcript */ { -struct gpFx *gpFx = gpFxNew("", pred->name, non_coding_exon_variant, nonCodingExon, lm); +struct gpFx *gpFx = gpFxNew(allele->sequence, pred->name, non_coding_exon_variant, nonCodingExon, + lm); if (*pred->strand == '-') exonNum = pred->exonCount - exonNum - 1; setNCExonVals(gpFx, exonNum, cDnaPosition); return gpFx; } boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred) /* Return TRUE if variant in strand-corrected exonNum is in the region * of pred that would make it safe from Nonsense-Mediated Decay (NMD). * NMD is triggered by the presence of a stop codon that appears * before ~50bp before the end of the last exon. In other words, if * there's a stop codon with a detectable downstream splice junction, * translation is prevented -- see * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */ { @@ -402,31 +403,31 @@ int cdsPosition, int cdsBasesAdded, struct lm *lm) /* calculate effect of allele change on coding transcript */ { struct gpFx *effectsList = NULL; if (*pred->strand == '-') exonNum = pred->exonCount - exonNum - 1; // first find effects of allele in UTR, if any effectsList = gpFxCheckUtr(allele, pred, exonNum, cDnaPosition, lm); // calculate original and variant coding DNA and AA's char *oldCodingSequence, *newCodingSequence, *oldaa, *newaa; getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa, lm); getSequences(newPred, newSequence, &newCodingSequence, &newaa, lm); -// if coding sequence hasn't changed, we're done (allele == reference allele) +// if coding sequence hasn't changed (just UTR), we're done if (sameString(oldCodingSequence, newCodingSequence)) return effectsList; // allocate the effect structure - fill in soNumber and details below struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange, lm); struct codingChange *cc = &effect->details.codingChange; cc->cDnaPosition = cDnaPosition; cc->cdsPosition = cdsPosition; cc->exonNumber = exonNum; int pepPos = cdsPosition / 3; cc->pepPosition = pepPos; if (cdsBasesAdded % 3 == 0) { // Common case: substitution, same number of old/new codons/peps: @@ -455,195 +456,232 @@ cc->codonNew = lmCloneString(lm, newCodingSequence + pepPos*3); cc->aaOld = lmCloneString(lm, oldaa + pepPos); cc->aaNew = lmCloneString(lm, newaa + pepPos); } // show specific coding changes by making before-and-after subsequences: boolean safeFromNMD = isSafeFromNMD(exonNum, allele->variant, pred); setSpecificCodingSoTerm(effect, oldaa, newaa, cdsBasesAdded, safeFromNMD); slAddHead(&effectsList, effect); return effectsList; } -struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum, +struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, char *refAllele, int exonNum, int exonQStart, struct dnaSeq *transcriptSequence, struct lm *lm) /* generate an effect from a different allele in an exon */ { struct genePred *newPred = pred; int cDnaPosition = 0, basesAdded = 0, cdsPosition = 0, cdsBasesAdded = 0; char *newSequence = gpFxModifySequence(allele, pred, exonNum, exonQStart, transcriptSequence, &newPred, &cDnaPosition, &basesAdded, &cdsPosition, &cdsBasesAdded, lm); if (sameString(newSequence, transcriptSequence->dna)) - return NULL; // no change in transcript + { + struct variant *variant = allele->variant; + errAbort("gpFxInExon: %s:%d-%d, allele='%s', refAllele='%s', no change, shouldn't be here", + variant->chrom, variant->chromStart+1, variant->chromEnd, allele->sequence, + refAllele); + return NULL; // no change in transcript -- but allele should always be non-reference + } struct gpFx *effectsList; if (pred->cdsStart != pred->cdsEnd) effectsList = gpFxChangedCodingExon(allele, pred, exonNum, transcriptSequence, newSequence, newPred, cDnaPosition, basesAdded, cdsPosition, cdsBasesAdded, lm); else { effectsList = gpFxChangedNoncodingExon(allele, pred, exonNum, cDnaPosition, lm); } return effectsList; } +static boolean hasAltAllele(struct allele *alleles, char *refAllele) +/* Make sure there's something to work on here... */ +{ +while (alleles != NULL && sameString(alleles->sequence, refAllele)) + alleles = alleles->next; +return (alleles != NULL); +} + +static char *firstAltAllele(struct allele *alleles, char *refAllele) +/* Ensembl always reports an alternate allele, even if that allele is not being used + * to calculate any consequence. When allele doesn't really matter, just use the + * first alternate allele that is given. */ +{ +while (alleles != NULL && sameString(alleles->sequence, refAllele)) + alleles = alleles->next; +if (alleles == NULL) + errAbort("firstAltAllele: no alt allele in list"); +return alleles->sequence; +} + static struct gpFx *gpFxCheckExons(struct variant *variantList, struct genePred *pred, - struct dnaSeq *transcriptSequence, struct lm *lm) + char *refAllele, struct dnaSeq *transcriptSequence, + struct lm *lm) // check to see if the variant is in an exon { struct gpFx *effectsList = NULL; int exonQStart = 0; int ii; for (ii=0; ii < pred->exonCount; ii++) { struct variant *variant = variantList; for(; variant ; variant = variant->next) { // check if in an exon int end = variant->chromEnd; if (variant->chromStart == end) end++; int size; if ((size = positiveRangeIntersection(pred->exonStarts[ii], pred->exonEnds[ii], variant->chromStart, end)) > 0) { if (size != end - variant->chromStart) { // variant is not completely in exon - struct gpFx *effect = gpFxNew("", pred->name, - complex_transcript_variant, none, lm); + char *altAllele = firstAltAllele(variant->alleles, refAllele); + struct gpFx *effect = gpFxNew(altAllele, pred->name, complex_transcript_variant, + none, lm); effectsList = slCat(effectsList, effect); } struct allele *allele = variant->alleles; for(; allele ; allele = allele->next) { - effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii, exonQStart, + if (differentString(allele->sequence, refAllele)) + effectsList = slCat(effectsList, + gpFxInExon(allele, pred, refAllele, ii, exonQStart, transcriptSequence, lm)); } } } exonQStart += (pred->exonEnds[ii] - pred->exonStarts[ii]); } return effectsList; } -static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred, struct lm *lm) +static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred, + char *altAllele, struct lm *lm) // check to see if a single variant overlaps an intron (and possibly splice region) //#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome! { struct gpFx *effectsList = NULL; boolean minusStrand = (pred->strand[0] == '-'); int ii; for (ii=0; ii < pred->exonCount - 1; ii++) { int intronStart = pred->exonEnds[ii]; int intronEnd = pred->exonStarts[ii+1]; if (variant->chromEnd > intronStart && variant->chromStart < intronEnd) { enum soTerm soNumber = intron_variant; if (variant->chromEnd > intronStart && variant->chromStart < intronStart+2) // Within 2 bases of intron start(/end for '-'): soNumber = minusStrand ? splice_acceptor_variant : splice_donor_variant; if (variant->chromEnd > intronEnd-2 && variant->chromStart < intronEnd) // Within 2 bases of intron end(/start for '-'): soNumber = minusStrand ? splice_donor_variant : splice_acceptor_variant; else if ((variant->chromEnd > intronStart+3 && variant->chromStart < intronStart+8) || (variant->chromEnd > intronEnd-8 && variant->chromStart < intronEnd+3)) // Within 3 to 8 bases of intron start or end: soNumber = splice_region_variant; - struct gpFx *effects = gpFxNew("", pred->name, soNumber, intron, lm); + struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, intron, lm); effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii; slAddHead(&effectsList, effects); } else if ((variant->chromEnd > intronStart-3 && variant->chromStart < intronStart) || (variant->chromEnd > intronEnd && variant->chromStart < intronEnd+3)) { // if variant is in exon *but* within 3 bases of splice site, // it also qualifies as splice_region_variant: - struct gpFx *effects = gpFxNew("", pred->name, splice_region_variant, intron, lm); + struct gpFx *effects = gpFxNew(altAllele, pred->name, + splice_region_variant, intron, lm); effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii; slAddHead(&effectsList, effects); } } return effectsList; } static struct gpFx *gpFxCheckBackground(struct variant *variant, struct genePred *pred, - struct lm *lm) + char *refAllele, struct lm *lm) // check to see if the variant is up or downstream or in intron of the gene { struct gpFx *effectsList = NULL; +char *altAllele = firstAltAllele(variant->alleles, refAllele); for(; variant ; variant = variant->next) { // is this variant in an intron - effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred, lm)); + effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred, altAllele, lm)); // Is this variant to the left or right of transcript? enum soTerm soNumber = 0; if (variant->chromStart < pred->txStart && variant->chromEnd > pred->txStart - GPRANGE) { if (*pred->strand == '+') soNumber = upstream_gene_variant; else soNumber = downstream_gene_variant; } else if (variant->chromEnd > pred->txEnd && variant->chromStart < pred->txEnd + GPRANGE) { if (*pred->strand == '+') soNumber = downstream_gene_variant; else soNumber = upstream_gene_variant; } if (soNumber != 0) { - struct gpFx *effects = gpFxNew("", pred->name, soNumber, none, lm); + struct gpFx *effects = gpFxNew(altAllele, pred->name, soNumber, none, lm); effectsList = slCat(effectsList, effects); } } return effectsList; } static void checkVariantList(struct variant *variant) // check to see that we either have one variant (possibly with multiple // alleles) or that if we have a list of variants, they only have // one allele a piece. { if (variant->next == NULL) // just one variant return; for(; variant; variant = variant->next) if (variant->numAlleles != 1) errAbort("gpFxPredEffect needs either 1 variant, or only 1 allele in all variants"); } -struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred, +struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred, char *refAllele, struct dnaSeq *transcriptSequence, struct lm *lm) // return the predicted effect(s) of a variation list on a genePred { struct gpFx *effectsList = NULL; // make sure we can deal with the variants that are coming in checkVariantList(variant); -// check to see if SNP is up or downstream in intron -effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred, lm)); +// If only the reference allele has been observed, skip it: +if (! hasAltAllele(variant->alleles, refAllele)) + return NULL; + +// check to see if SNP is up or downstream or in intron +effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred, refAllele, lm)); // check to see if SNP is in the transcript -effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, transcriptSequence, lm)); +effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, refAllele, transcriptSequence, lm)); -//#*** sort by position +//#*** sort by transcript name? transcript start? return effectsList; }