8bbed14087acb42d639fb045bc09c11394a0b82b angie Wed Jul 2 16:49:07 2014 -0700 Fixed two bugs in the way cdsBasesAdded was computed -- that causedsome strange functional assignments for "" (large deletion). Also fixed a miscall of frameshift instead of stop_gained (from insertion) brought to my attention by GoldenHelix's comparison of VEP, Annovar and SnpEff: http://blog.goldenhelix.com/?p=2486 diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index 1d92811..d4e365b 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -527,33 +527,32 @@ newAlleleSeq = ""; newAlLen = 0; } if (isRc && newAlLen > 0) { newAlleleSeq = lmCloneString(lm, newAlleleSeq); reverseComplement(newAlleleSeq, newAlLen); } int variantSizeOnCds = endInCds - startInCds; if (variantSizeOnCds < 0) errAbort("gpFx: endInCds (%d) < startInCds (%d)", endInCds, startInCds); char *newCodingSeq = mergeAllele(oldCodingSeq, startInCds, variantSizeOnCds, newAlleleSeq, newAlLen, lm); // If newCodingSequence has an early stop, truncate there: truncateAtStopCodon(newCodingSeq); -int variantSizeOnRef = allele->variant->chromEnd - allele->variant->chromStart; if (retCdsBasesAdded) - *retCdsBasesAdded = allele->length - variantSizeOnRef; + *retCdsBasesAdded = newAlLen - variantSizeOnCds; return newCodingSeq; } static void setSpecificCodingSoTerm(struct gpFx *effect, char *oldAa, char *newAa, int cdsBasesAdded) /* Assuming that deletions are marked with dashes in newCodingSequence, * and that effect fields aside from soNumber are already populated, use the * pep seqs and number of dashes to determine the appropriate SO term. * Probably the majority of cases will be synonymous_variant or missense_variant, * but we have to check for several other special cases esp. indels. */ { struct codingChange *cc = &effect->details.codingChange; int oldAaSize = strlen(oldAa), newAaSize = strlen(newAa); if (sameString(newAa, oldAa)) { @@ -571,42 +570,54 @@ { if (strchr(cc->aaOld, 'Z') && !strchr(cc->aaNew, 'Z')) effect->soNumber = stop_lost; else effect->soNumber = inframe_deletion; } else effect->soNumber = frameshift_variant; } else { // Not a deletion; could be single-base (including early stop) or insertion if (newAaSize < oldAaSize) { // Not a deletion but protein got smaller; must have been an early stop codon, - // possibly following a frameshift caused by an insertion. - if (cc->aaNew[0] != 'Z') + // possibly inserted or following a frameshift caused by an insertion. + int frame = cc->cdsPosition % 3; + int alleleLength = strlen(effect->allele); + if (! isAllNt(effect->allele, alleleLength)) + // symbolic -- may be deletion or insertion, but we can't tell. :( + alleleLength = 0; + int i, affectedCodons = (frame + alleleLength + 2) / 3; + boolean stopGain = FALSE; + for (i = 0; i < affectedCodons; i++) + if (cc->aaNew[i] == 'Z') + { + effect->soNumber = stop_gained; + stopGain = TRUE; + break; + } + if (! stopGain) { if (newAa[newAaSize-1] != 'Z') errAbort("gpFx: new protein is smaller but last base in new sequence " "is '%c' not 'Z'.\n" "oldAa (%daa): %s\nnewAa (%daa): %s\n" , newAa[newAaSize-1], oldAaSize, oldAa, newAaSize, newAa); effect->soNumber = frameshift_variant; } - else - effect->soNumber = stop_gained; } else if (newAaSize > oldAaSize) { // protein got bigger; insertion or lost stop codon if (cc->aaOld[0] == 'Z') effect->soNumber = stop_lost; else if ((cdsBasesAdded % 3) == 0) effect->soNumber = inframe_insertion; else effect->soNumber = frameshift_variant; } else { // Single aa change if (cc->pepPosition == 0 && cc->aaOld[0] == 'M')