23e3f7b1a4c78acd2198da73f4a4392290c51736
angie
Fri Apr 19 16:12:24 2013 -0700
Added detection of many new SO terms to gpFx: {5,3} UTR, splice_{donor,acceptor},non_coding_exon, and very detailed coding change terms to match Ensembl e.g.
stop_retained_variant, NMD_transcript_variant). Also tweaks to annoFormatVep to
represent insertion positions the way they do.
TODO: lots more testing! and convert gpFx to use localmem.
refs #6152
diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c
index 325148e..573c5fb 100644
--- src/hg/lib/gpFx.c
+++ src/hg/lib/gpFx.c
@@ -1,559 +1,671 @@
/* gpFx --- routines to calculate the effect of variation on a genePred */
#include "common.h"
#include "genePred.h"
#include "gpFx.h"
static char *collapseDashes(char *str)
// Trim extra hyphen characters at end of str.
{
int len = strlen(str);
if (len > 1)
{
char *p = str + len - 1;
while (*p == '-' && p > str)
*p-- = '\0';
}
return str;
}
-struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber)
+struct gpFx *gpFxNew(char *allele, char *transcript, enum soTerm soNumber,
+ enum detailType detailType)
/* Fill in the common members of gpFx; leave soTerm-specific members for caller to fill in. */
{
struct gpFx *effect;
AllocVar(effect);
effect->allele = collapseDashes(strUpper(cloneString(allele)));
-effect->so.transcript = cloneString(transcript);
-effect->so.soNumber = soNumber;
+effect->transcript = cloneString(transcript);
+effect->soNumber = soNumber;
+effect->detailType = detailType;
return effect;
}
static unsigned countDashes(char *string)
/* count the number of dashes in a string */
{
int count = 0;
while(*string)
{
if (*string == '-')
count++;
string++;
}
return count;
}
static char * mergeAllele(char *transcript, int offset, int variantWidth,
char *newAlleleSeq, int alleleLength)
/* merge a variant into an allele */
{
char *newTranscript = transcript;
if (variantWidth == alleleLength)
{
// for the moment, we're sticking the dashes into the transcripts
if (1) //noDashes(newAlleleSeq))
{
memcpy(&transcript[offset], newAlleleSeq, alleleLength);
}
#ifdef DOESNTKNOWABOUTTHEOFFSETARGUMENT
// if we decide not to put the dashes into the resulting transcripts
// this will have to be re-written to grok the offset argument
else
{
char *transcriptSource = transcript;
int ii;
for(ii=0; ii < variantWidth; ii++)
{
if (*newAlleleSeq == '-')
{
transcriptSource++;
}
else
{
*transcript++ = *newAlleleSeq++;
transcriptSource++;
}
}
while(*transcriptSource)
*transcript++ = *transcriptSource++;
*transcript = 0;
}
#endif
}
else if (variantWidth > alleleLength)
errAbort("allele not padded");
else
{
int insertionSize = alleleLength - variantWidth;
int newLength = strlen(transcript) + insertionSize;
newTranscript = needMem(newLength + 1);
char *restOfTranscript = &transcript[offset + variantWidth];
// copy over the part before the variant
memcpy(newTranscript, transcript, offset);
// copy in the new allele
memcpy(&newTranscript[offset], newAlleleSeq, alleleLength);
// copy in the new allele
memcpy(&newTranscript[offset + alleleLength], restOfTranscript,
strlen(restOfTranscript) + 1);
}
return newTranscript;
}
char *gpFxModifySequence(struct allele *allele, struct genePred *pred,
int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
struct genePred **retNewPred)
/* modify a transcript to what it'd be if the alternate allele were present;
* if transcript length changes, make retNewPred a shallow copy of pred w/tweaked coords. */
{
// clip allele to exon
struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum],
transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]);
// change transcript at variant point
int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum];
int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset;
int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart;
if (variantWidth != clipAllele->length && retNewPred != NULL)
{
// OK, now we have to make a new genePred that matches the changed sequence
int basesAdded = clipAllele->length - variantWidth;
struct genePred *newPred = CloneVar(pred);
int alEnd = clipAllele->variant->chromEnd;
if (newPred->cdsStart > alEnd)
newPred->cdsStart += basesAdded;
if (newPred->cdsEnd > alEnd)
newPred->cdsEnd += basesAdded;
newPred->exonEnds[exonNum] += basesAdded;
int ii;
for (ii = exonNum+1; ii < pred->exonCount; ii++)
{
newPred->exonStarts[ii] += basesAdded;
newPred->exonEnds[ii] += basesAdded;
}
newPred->txEnd = newPred->exonEnds[newPred->exonCount - 1];
*retNewPred = newPred;
}
char *retSequence = cloneString(transcriptSequence->dna);
char *newAlleleSeq = cloneString(clipAllele->sequence);
if (*pred->strand == '-')
{
transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq));
reverseComplement(newAlleleSeq, strlen(newAlleleSeq));
}
// make the change in the sequence
retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq,
clipAllele->length);
// clean up
freeMem(newAlleleSeq);
return retSequence;
}
-static char *getCodingSequence(struct genePred *pred, char *transcriptSequence)
-/* extract the CDS from a transcript */
+static int getCodingOffsetInTx(struct genePred *pred, char strand)
+/* Skip past UTR (portions of) exons to get offset of CDS relative to transcript start.
+ * The strand arg is used instead of pred->strand. */
+{
+int offset = 0;
+int iStart = 0, iIncr = 1;
+if (strand == '-')
{
+ // Work our way left from the last exon.
+ iStart = pred->exonCount - 1;
+ iIncr = -1;
+ }
int ii;
-
-if (*pred->strand == '-')
- reverseComplement(transcriptSequence, strlen(transcriptSequence));
-
-// trim off the 5' UTR ( or 3' if on the minus strand)
-char *ptr = transcriptSequence;
-for(ii=0; ii < pred->exonCount; ii++)
+// trim off the 5' UTR (or 3' UTR if strand is '-')
+for (ii = iStart; ii >= 0 && ii < pred->exonCount; ii += iIncr)
{
- int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
if ((pred->cdsStart >= pred->exonStarts[ii]) &&
(pred->cdsStart < pred->exonEnds[ii]))
break;
-
- ptr += exonSize;
+ int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii];
+ offset += exonSize;
}
-assert (ii != pred->exonCount);
-
+assert( !(strand == '+' && ii >= pred->exonCount) );
+assert( !(strand == '-' && ii < 0) );
// clip off part of UTR in exon that has CDS in it too
int exonOffset = pred->cdsStart - pred->exonStarts[ii];
-ptr += exonOffset;
+offset += exonOffset;
+return offset;
+}
+
+static char *getCodingSequence(struct genePred *pred, char *transcriptSequence)
+/* extract the CDS from a transcript */
+{
+if (*pred->strand == '-')
+ reverseComplement(transcriptSequence, strlen(transcriptSequence));
-char *newString = cloneString(ptr);
+// Get leftmost CDS boundary to trim off 5' (or 3' if on minus strand):
+int cdsOffset = getCodingOffsetInTx(pred, '+');
+char *newString = cloneString(transcriptSequence + cdsOffset);
// trim off 3' (or 5' if on minus strand)
newString[genePredCdsSize(pred)] = 0;
// correct for strand
if (*pred->strand == '-')
{
reverseComplement(transcriptSequence, strlen(transcriptSequence));
reverseComplement(newString, strlen(newString));
}
return newString;
}
static int firstChange(char *string1, char *string2, int *numDifferent)
/* return the position of the first difference between the two sequences */
/* if numDifferent is non-NULL, return number of characters between first
* difference, and the last difference */
{
int count = 0;
-for(; *string1 == *string2; count++, string1++, string2++)
+for(; *string1 == *string2 && (*string1); count++, string1++, string2++)
;
int firstChange = count;
int lastChange = firstChange;
if (numDifferent != NULL)
{
for (; (*string1) && (*string2); string1++, string2++, count++)
{
if (*string1 != *string2)
lastChange = count;
}
*numDifferent = lastChange - firstChange + 1;
}
return firstChange;
}
static void getSequences(struct genePred *pred, char *transcriptSequence,
- char **codingSequence, aaSeq **aaSeq)
+ char **retCodingSequence, char **retAaSeq)
/* get coding sequences for a transcript and a variant transcript */
{
-*codingSequence = getCodingSequence(pred, transcriptSequence);
-struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL);
-*aaSeq = translateSeq(codingDna, 0, FALSE); // TRUE truncates at stop, but doesn't include the Z
-char *stop = strchr((*aaSeq)->dna, 'Z');
+char *codingSequence = getCodingSequence(pred, transcriptSequence);
+struct dnaSeq *codingDna = newDnaSeq(codingSequence, strlen(codingSequence), NULL);
+// TRUE here truncates at stop, but doesn't include the Z:
+aaSeq *aaSeq = translateSeq(codingDna, 0, FALSE);
+char *stop = strchr(aaSeq->dna, 'Z');
if (stop != NULL)
{
*(stop+1) = '\0';
// If aaSeq was truncated by a stop codon, truncate codingSequence as well:
- int codingLength = 3*strlen((*aaSeq)->dna);
- (*codingSequence)[codingLength] = '\0';
+ int codingLength = 3*strlen(aaSeq->dna);
+ codingSequence[codingLength] = '\0';
}
+*retCodingSequence = codingSequence;
+*retAaSeq = cloneString(aaSeq->dna);
+freez(aaSeq);
freez(codingDna);
}
+static void setNCExonVals(struct gpFx *gpFx, int exonNum, int cDnaPosition)
+/* This gpFx is for a variant in exon of non-coding gene or UTR exon of coding gene;
+ * set details.nonCodingExon values. */
+{
+gpFx->details.nonCodingExon.exonNumber = exonNum;
+gpFx->details.nonCodingExon.cDnaPosition = cDnaPosition;
+}
+
struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred,
- int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
- char *newSequence)
-/* check for effects in UTR of coding gene */
+ int exonNum, int cDnaPosition)
+/* check for effects in UTR of coding gene -- caller ensures it's in exon, pred is coding
+ * and exonNum has been strand-adjusted */
{
-//#*** positiveRangeInt doesn't work for 0-length insertion variants
-if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
- allele->variant->chromStart, allele->variant->chromEnd))
+struct gpFx *gpFx = NULL;
+enum soTerm term = 0;
+struct variant *variant = allele->variant;
+if (variant->chromStart < pred->cdsStart && variant->chromEnd > pred->txStart)
+ // we're in left UTR
+ term = (*pred->strand == '-') ? _3_prime_UTR_variant : _5_prime_UTR_variant;
+else if (variant->chromStart < pred->txEnd && variant->chromEnd > pred->cdsEnd)
+ // we're in right UTR
+ term = (*pred->strand == '-') ? _5_prime_UTR_variant : _3_prime_UTR_variant;
+if (term != 0)
{
- // we're in 5' UTR ( or UTR intron )
- //errAbort("don't support variants in 5' UTR");
+ gpFx = gpFxNew(allele->sequence, pred->name, term, nonCodingExon);
+ setNCExonVals(gpFx, exonNum, cDnaPosition);
+ }
+return gpFx;
}
-if (positiveRangeIntersection(pred->txStart, pred->cdsStart,
- allele->variant->chromStart, allele->variant->chromEnd))
+struct gpFx *gpFxChangedNoncodingExon(struct allele *allele, struct genePred *pred,
+ int exonNum, int cDnaPosition)
+/* generate an effect for a variant in a non-coding transcript */
{
- // we're in 3' UTR
- //errAbort("don't support variants in 3' UTR");
+struct gpFx *gpFx = gpFxNew(allele->sequence, pred->name, non_coding_exon_variant, nonCodingExon);
+if (*pred->strand == '-')
+ exonNum = pred->exonCount - exonNum - 1;
+setNCExonVals(gpFx, exonNum, cDnaPosition);
+return gpFx;
+}
+
+boolean isSafeFromNMD(int exonNum, struct variant *variant, struct genePred *pred)
+/* Return TRUE if variant in strand-corrected exonNum is in the region
+ * of pred that would make it safe from Nonsense-Mediated Decay (NMD).
+ * NMD is triggered by the presence of a stop codon that appears
+ * before ~50bp before the end of the last exon. In other words, if
+ * there's a stop codon with a detectable downstream splice junction,
+ * translation is prevented -- see
+ * http://en.wikipedia.org/wiki/MRNA_surveillance#Mechanism_in_mammals */
+{
+boolean lastExonNum = pred->exonCount - 1;
+if (exonNum == lastExonNum)
+ return TRUE;
+int nextToLastExonNum = pred->exonCount - 2;
+if (exonNum == nextToLastExonNum)
+ {
+ // Is it within 50bp of 3' end of exon?
+ if (pred->strand[0] == '-')
+ return (variant->chromEnd < pred->exonStarts[1]);
+ else
+ return (variant->chromStart > pred->exonEnds[nextToLastExonNum] - 50);
}
-
-return NULL;
+return FALSE;
}
-struct gpFx *gpFxChangedNoncodingTranscript( struct allele *allele, struct genePred *pred,
- int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
- char *newSequence, struct genePred *newPred)
-/* generate an effect for a variant in a non-coding transcript */
+void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa, int numDashes,
+ int cdsChangeLength, boolean safeFromNMD)
+/* Assuming that deletions are marked with dashes in newCodingSequence,
+ * and that effect fields aside from soNumber are already populated, use the
+ * pep seqs and number of dashes to determine the appropriate SO term.
+ * Probably the majority of cases will be synonymous_variant or missense_variant,
+ * but we have to check for several other special cases esp. indels. */
+{
+struct codingChange *cc = &effect->details.codingChange;
+int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa);
+if (sameString(newAa, oldAa))
+ {
+ if (cc->pepPosition == oldAaSize-1 && cc->aaOld[0] == 'Z')
+ effect->soNumber = stop_retained_variant;
+ else
+ effect->soNumber = synonymous_variant;
+ }
+else
+ {
+ if (numDashes != 0)
+ {
+ // Got a deletion variant -- check frame:
+ //#*** if numDashes == cdsChangeLength, then we don't need numDashes...
+ if ((numDashes % 3) == 0)
+ {
+ if ((cdsChangeLength % 3) != 0)
+ errAbort("gpFx: assumption about cdsChangeLength vs. numDashes not holding; "
+ "cdsChangeLength=%d, numDashes=%d", cdsChangeLength, numDashes);
+ effect->soNumber = inframe_deletion;
+ }
+ else
+ effect->soNumber = frameshift_variant;
+ }
+ else
+ {
+ // Not a deletion; could be single-base (including early stop) or insertion
+ if (newAaSize < oldAaSize)
+ {
+ // Not a deletion but protein got smaller; must have been an early stop codon,
+ // possibly following a frameshift caused by an insertion.
+ if (cc->aaNew[0] != 'Z')
{
-return NULL;
-// errAbort("found a change in non-coding gene. we don't support non-coding genes at the moment");
+ if (newAa[newAaSize-1] != 'Z')
+ errAbort("gpFx: new protein is smaller but last base in new sequence "
+ "is '%c' not 'Z'", newAa[newAaSize-1]);
+ effect->soNumber = frameshift_variant;
+ }
+ else if (safeFromNMD)
+ effect->soNumber = stop_gained;
+ else
+ effect->soNumber = NMD_transcript_variant;
+ }
+ else if (newAaSize > oldAaSize)
+ {
+ // protein got bigger; insertion or lost stop codon
+ if (cc->aaOld[0] == 'Z')
+ effect->soNumber = stop_lost;
+ else if ((cdsChangeLength % 3) == 0)
+ effect->soNumber = inframe_insertion;
+ else
+ effect->soNumber = frameshift_variant;
+ }
+ else
+ {
+ // Single aa change
+ if (cc->pepPosition == 0 && cc->aaOld[0] == 'M')
+ effect->soNumber = initiator_codon_variant;
+ else if (cc->pepPosition == oldAaSize-1 && oldAa[oldAaSize-1] != 'Z')
+ effect->soNumber = incomplete_terminal_codon_variant;
+ else
+ effect->soNumber = missense_variant;
+ }
+ }
+ }
}
-struct gpFx *gpFxChangedCodingTranscript( struct allele *allele, struct genePred *pred,
- int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence,
+struct gpFx *gpFxChangedCodingExon(struct allele *allele, struct genePred *pred,
+ int exonNum, struct psl *transcriptPsl, char *transcriptSequence,
char *newSequence, struct genePred *newPred)
/* calculate effect of allele change on coding transcript */
{
struct gpFx *effectsList = NULL;
+if (*pred->strand == '-')
+ exonNum = pred->exonCount - exonNum - 1;
-// first find effects of allele in UTR
-effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl,
- transcriptSequence, newSequence);
+// first find effects of allele in UTR, if any
+int dnaChangeLength;
+int cDnaPosition = firstChange(newSequence, transcriptSequence, &dnaChangeLength);
+effectsList = gpFxCheckUtr(allele, pred, exonNum, cDnaPosition);
-// check to see if coding sequence is changed
// calculate original and variant coding AA's
-char *oldCodingSequence, *newCodingSequence;
-aaSeq *oldaa, *newaa;
-
-getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa);
+char *oldCodingSequence, *newCodingSequence, *oldaa, *newaa;
+getSequences(pred, transcriptSequence, &oldCodingSequence, &oldaa);
getSequences(newPred, newSequence, &newCodingSequence, &newaa);
-// if coding region hasn't changed, we're done
+// if coding sequence hasn't changed, we're done (allele == reference allele)
if (sameString(oldCodingSequence, newCodingSequence))
return effectsList;
-// start allocating the effect structure - fill in soNumber below
-struct gpFx *effects = gpFxNew(allele->sequence, pred->name, 0);
-slAddHead(&effectsList, effects);
-
-struct codingChange *cc = &effects->so.sub.codingChange;
-int dnaChangeLength;
-cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength);
+// allocate the effect structure - fill in soNumber and details below
+struct gpFx *effect = gpFxNew(allele->sequence, pred->name, coding_sequence_variant, codingChange);
+struct codingChange *cc = &effect->details.codingChange;
+cc->cDnaPosition = cDnaPosition;
int cdsChangeLength;
cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength);
-if (*pred->strand == '-')
- cc->exonNumber = pred->exonCount - exonNum;
-else
cc->exonNumber = exonNum;
// show specific coding changes by making before-and-after subsequences:
int codonPosStart = (cc->cdsPosition / 3) ;
int codonPosEnd = ((cdsChangeLength - 1 + cc->cdsPosition) / 3) ;
int numCodons = (codonPosEnd - codonPosStart + 1);
cc->codonOld = cloneStringZ(oldCodingSequence + codonPosStart*3, numCodons*3);
cc->codonNew = cloneStringZ(newCodingSequence + codonPosStart*3, numCodons*3);
cc->pepPosition = codonPosStart;
-cc->aaOld = cloneStringZ(oldaa->dna + cc->pepPosition, numCodons);
-cc->aaNew = cloneStringZ(newaa->dna + cc->pepPosition, numCodons);
+cc->aaOld = cloneStringZ(oldaa + cc->pepPosition, numCodons);
+cc->aaNew = cloneStringZ(newaa + cc->pepPosition, numCodons);
-int numDashes;
-if (sameString(newaa->dna, oldaa->dna))
- {
- // synonymous change
- effects->so.soNumber = synonymous_variant;
- }
-else
- {
- // this is assuming that deletions are marked with dashes
- // in newCodingSequence
- if ((numDashes = countDashes(newCodingSequence)) != 0)
- {
- if ((numDashes % 3) == 0)
- effects->so.soNumber = inframe_deletion;
- else
- effects->so.soNumber = frameshift_variant;
- }
- else if (newaa->size < oldaa->size)
- {
- effects->so.soNumber = stop_gained;
- }
- else if (newaa->size != oldaa->size)
- {
- effects->so.soNumber = inframe_insertion;
- }
- else
- {
- char *stopPos = strchr(newaa->dna, 'Z');
- // non-synonymous change: nonsense (early stop codon) or missense change?
- if (stopPos != NULL && stopPos < newaa->dna + newaa->size - 1)
- effects->so.soNumber = stop_gained;
- else
- effects->so.soNumber = missense_variant;
- }
- }
+boolean safeFromNMD = isSafeFromNMD(exonNum, allele->variant, pred);
+setSpecificCodingSoTerm(effect, oldaa, newaa, countDashes(newCodingSequence), cdsChangeLength,
+ safeFromNMD);
+slAddHead(&effectsList, effect);
return effectsList;
}
struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred,
int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence)
/* generate an effect from a different allele in an exon */
{
struct genePred *newPred = pred;
char *newSequence = gpFxModifySequence(allele, pred, exonNum,
transcriptPsl, transcriptSequence, &newPred);
if (sameString(newSequence, transcriptSequence->dna))
return NULL; // no change in transcript
struct gpFx *effectsList;
if (pred->cdsStart != pred->cdsEnd)
- effectsList = gpFxChangedCodingTranscript( allele, pred, exonNum, transcriptPsl,
- transcriptSequence, newSequence, newPred);
+ effectsList = gpFxChangedCodingExon(allele, pred, exonNum, transcriptPsl,
+ transcriptSequence->dna, newSequence, newPred);
else
- effectsList = gpFxChangedNoncodingTranscript( allele, pred, exonNum, transcriptPsl,
- transcriptSequence, newSequence, newPred);
+ {
+ int cDnaPosition = firstChange(newSequence, transcriptSequence->dna, NULL);
+ effectsList = gpFxChangedNoncodingExon(allele, pred, exonNum, cDnaPosition);
+ }
if (newPred != pred)
freeMem(newPred);
return effectsList;
}
static struct psl *genePredToPsl(struct genePred *pred)
/* generate a PSL alignment to a transcript from a genePred */
{
int qSize = genePredBases(pred);
#define BOGUS_CHROM_SIZE 0
struct psl *psl = pslNew(pred->name, qSize, 0, qSize,
pred->chrom, BOGUS_CHROM_SIZE,
pred->txStart, pred->txEnd,
pred->strand, pred->exonCount, 0);
psl->match = psl->qSize;
int i, qNext = 0;
for (i = 0; i < pred->exonCount; i++)
{
int exonSize = pred->exonEnds[i] - pred->exonStarts[i];
psl->qStarts[i] = qNext;
//psl->tStarts[i] = pred->exonStarts[i] + pred->txStart;
psl->tStarts[i] = pred->exonStarts[i];
psl->blockSizes[i] = exonSize;
/* notnow
if (i > 0)
{
psl->tNumInsert += 1;
psl->tBaseInsert += psl->tStarts[i] - pslTEnd(psl, i-1);
}
*/
psl->blockCount++;
qNext += psl->blockSizes[i];
}
return psl;
}
static struct gpFx *gpFxCheckExons(struct variant *variant,
struct genePred *pred, struct dnaSeq *transcriptSequence)
// check to see if the variant is in an exon
{
int ii;
struct gpFx *effectsList = NULL;
struct psl *transcriptPsl = genePredToPsl(pred);
// should copy transcriptSequence if we have more than one variant!!
for(; variant ; variant = variant->next)
{
struct allele *allele = variant->alleles;
for(; allele ; allele = allele->next)
{
for(ii=0; ii < pred->exonCount; ii++)
{
// check if in an exon
int size;
int end = variant->chromEnd;
if (variant->chromStart == end)
end++;
if ((size = positiveRangeIntersection(pred->exonStarts[ii],
pred->exonEnds[ii],
variant->chromStart, end)) > 0)
{
if (size != end - variant->chromStart)
{
struct gpFx *effect = gpFxNew(allele->sequence, pred->name,
- complex_transcript_variant);
+ complex_transcript_variant, none);
effectsList = slCat(effectsList, effect);
}
effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii,
transcriptPsl, transcriptSequence));
}
}
}
}
return effectsList;
}
-static struct gpFx *gpFxCheckIntrons(struct variant *variant,
- struct genePred *pred)
-// check to see if a single variant is in an exon or an intron
+static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred)
+// check to see if a single variant overlaps an intron (and possibly splice region)
+//#*** TODO: watch out for "introns" that are actually indels between tx seq and ref genome!
{
-int ii;
struct gpFx *effectsList = NULL;
-
+boolean minusStrand = (pred->strand[0] == '-');
+int ii;
for(ii=0; ii < pred->exonCount - 1; ii++)
{
- // check if in intron
- if (variant->chromEnd > pred->exonEnds[ii] &&
- variant->chromStart < pred->exonStarts[ii+1])
+ int intronStart = pred->exonEnds[ii];
+ int intronEnd = pred->exonStarts[ii+1];
+ if (variant->chromEnd > intronStart && variant->chromStart < intronEnd)
+ {
+ enum soTerm soNumber = intron_variant;
+ if (variant->chromEnd > intronStart && variant->chromStart < intronStart+2)
+ // Within 2 bases of intron start(/end for '-'):
+ soNumber = minusStrand ? splice_acceptor_variant : splice_donor_variant;
+ if (variant->chromEnd > intronEnd-2 && variant->chromStart < intronEnd)
+ // Within 2 bases of intron end(/start for '-'):
+ soNumber = minusStrand ? splice_donor_variant : splice_acceptor_variant;
+ else if ((variant->chromEnd > intronStart+3 && variant->chromStart < intronStart+8) ||
+ (variant->chromEnd > intronEnd-8 && variant->chromStart < intronEnd+3))
+ // Within 3 to 8 bases of intron start or end:
+ soNumber = splice_region_variant;
+ struct gpFx *effects = gpFxNew("", pred->name, soNumber, intron);
+ effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii;
+ slAddHead(&effectsList, effects);
+ }
+ else if ((variant->chromEnd > intronStart-3 && variant->chromStart < intronStart) ||
+ (variant->chromEnd > intronEnd && variant->chromStart < intronEnd+3))
{
- struct gpFx *effects = gpFxNew("", pred->name, intron_variant);
- effects->so.sub.intron.intronNumber = ii;
+ // if variant is in exon *but* within 3 bases of splice site,
+ // it also qualifies as splice_region_variant:
+ struct gpFx *effects = gpFxNew("", pred->name, splice_region_variant, intron);
+ effects->details.intron.intronNumber = minusStrand ? (pred->exonCount - ii - 1) : ii;
slAddHead(&effectsList, effects);
}
}
-
return effectsList;
}
static struct gpFx *gpFxCheckBackground(struct variant *variant,
struct genePred *pred)
// check to see if the variant is up or downstream or in intron of the gene
{
struct gpFx *effectsList = NULL;
for(; variant ; variant = variant->next)
{
// is this variant in an intron
effectsList = slCat(effectsList, gpFxCheckIntrons(variant, pred));
// Is this variant to the left or right of transcript?
enum soTerm soNumber = 0;
if (variant->chromStart < pred->txStart &&
variant->chromEnd > pred->txStart - GPRANGE)
{
if (*pred->strand == '+')
soNumber = upstream_gene_variant;
else
soNumber = downstream_gene_variant;
}
else if (variant->chromEnd > pred->txEnd &&
variant->chromStart < pred->txEnd + GPRANGE)
{
if (*pred->strand == '+')
soNumber = downstream_gene_variant;
else
soNumber = upstream_gene_variant;
}
if (soNumber != 0)
{
- struct gpFx *effects = gpFxNew("", pred->name, soNumber);
+ struct gpFx *effects = gpFxNew("", pred->name, soNumber, none);
effectsList = slCat(effectsList, effects);
}
}
return effectsList;
}
static void checkVariantList(struct variant *variant)
// check to see that we either have one variant (possibly with multiple
// alleles) or that if we have a list of variants, they only have
// one allele a piece.
{
if (variant->next == NULL) // just one variant
return;
for(; variant; variant = variant->next)
if (variant->numAlleles != 1)
errAbort("gpFxPredEffect needs either 1 variant, or only 1 allele in all variants");
}
struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred,
struct dnaSeq *transcriptSequence)
// return the predicted effect(s) of a variation list on a genePred
{
struct gpFx *effectsList = NULL;
// make sure we can deal with the variants that are coming in
checkVariantList(variant);
// check to see if SNP is up or downstream in intron
effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred));
// check to see if SNP is in the transcript
effectsList = slCat(effectsList, gpFxCheckExons(variant, pred,
transcriptSequence));
return effectsList;
}