2f19b4081ef0f0d93daf8f7ff1c26cf8879a4d14 braney Thu Apr 26 17:26:10 2012 -0700 more work on #6152. Now with in-frame deletions! diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index 5af7432..1689539 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -1,48 +1,94 @@ /* gpFx --- routines to calculate the effect of variation on a genePred */ #include "common.h" #include "genePred.h" #include "gpFx.h" +unsigned countDashes(char *string) +{ +int count = 0; + +while(*string) + { + if (*string == '-') + count++; + string++; + } + +return count; +} + +static void mergeAllele(char *transcript, int variantWidth, + char *newAlleleSeq, int alleleLength) +{ +if (variantWidth == alleleLength) + { + // for the moment, we're sticking the dashes into the transcripts + if (1) //noDashes(newAlleleSeq)) + memcpy(transcript, newAlleleSeq, alleleLength); + else + { + char *transcriptSource = transcript; + int ii; + for(ii=0; ii < variantWidth; ii++) + { + if (*newAlleleSeq == '-') + { + transcriptSource++; + } + else + { + *transcript++ = *newAlleleSeq++; + transcriptSource++; + } + } + while(*transcriptSource) + *transcript++ = *transcriptSource++; + *transcript = 0; + } + } +} + char *gpFxModifySequence(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* modify a transcript to what it'd be if the alternate allele were present */ { // clip allele to exon struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]); // change transcript at variant point int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum]; int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset; -if (clipAllele->length != clipAllele->variant->chromEnd - clipAllele->variant->chromStart) +int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart; +if (clipAllele->length != variantWidth) errAbort("only support alleles the same length as the reference"); char *retSequence = cloneString(transcriptSequence->dna); char *newAlleleSeq = cloneString(clipAllele->sequence); if (*pred->strand == '-') { - transcriptOffset = transcriptSequence->size - (transcriptOffset + 1); + transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq)); reverseComplement(newAlleleSeq, strlen(newAlleleSeq)); } // make the change in the sequence -memcpy(&retSequence[transcriptOffset], newAlleleSeq, allele->length); +mergeAllele( &retSequence[transcriptOffset], variantWidth, newAlleleSeq, allele->length); // clean up freeMem(newAlleleSeq); return retSequence; } static char *getCodingSequence(struct genePred *pred, char *transcriptSequence) /* extract the CDS from a transcript */ { int ii; if (*pred->strand == '-') reverseComplement(transcriptSequence, strlen(transcriptSequence)); @@ -67,44 +113,52 @@ // trim off 3' (or 5' if on minus strand) newString[genePredCdsSize(pred)] = 0; // correct for strand if (*pred->strand == '-') { reverseComplement(transcriptSequence, strlen(transcriptSequence)); reverseComplement(newString, strlen(newString)); } return newString; } static int firstChange(char *string1, char *string2, int *numDifferent) /* return the position of the first difference between the two sequences */ +/* if numDifferent is non-NULL, return number of characters between first + * difference, and the last difference */ { int count = 0; -while (*string1++ == *string2++) - count++; +for(; *string1 == *string2; count++, string1++, string2++) + ; + +int firstChange = count; +int lastChange = firstChange; if (numDifferent != NULL) { - *numDifferent = 1; - while ((*string1) && (*string2) && (*string1++ != *string2++)) - (*numDifferent)++; + for (; (*string1) && (*string2); string1++, string2++, count++) + { + if (*string1 != *string2) + lastChange = count; + } + *numDifferent = lastChange - firstChange + 1; } -return count; +return firstChange; } static void getSequences(struct genePred *pred, char *transcriptSequence, char **codingSequence, aaSeq **aaSeq) /* get coding sequences for a transcript and a variant transcript */ { *codingSequence = getCodingSequence(pred, transcriptSequence); struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL); *aaSeq = translateSeq(codingDna, 0, FALSE); freez(codingDna); } static char * codonChangeString(int codonPos, int numCodons, char *oldCodingSequence, char *newCodingSequence) /* generate string that describes a codon change */ @@ -135,30 +189,39 @@ { struct dyString *dy = newDyString(100); int ii; for(ii=0; ii < numaa; ii++) { dyStringPrintf(dy, "%c > %c", toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii])); if (ii < numaa - 1) dyStringPrintf(dy, ","); } return dy->string; } +struct gpFx *gpFxInCdsDeletion( struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, + char *newSequence) +{ +struct gpFx *effects; +AllocVar(effects); +return effects; +} + struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, char *newSequence) /* check for effects in UTR of coding gene */ { if (positiveRangeIntersection(pred->txStart, pred->cdsStart, allele->variant->chromStart, allele->variant->chromEnd)) { // we're in 5' UTR ( or UTR intron ) errAbort("don't support variants in 5' UTR"); } if (positiveRangeIntersection(pred->txStart, pred->cdsStart, allele->variant->chromStart, allele->variant->chromEnd)) { @@ -189,70 +252,81 @@ effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, transcriptSequence, newSequence); // check to see if coding sequence is changed // calculate original and variant coding AA's char *oldCodingSequence, *newCodingSequence; aaSeq *oldaa, *newaa; getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa); getSequences(pred, newSequence, &newCodingSequence, &newaa); // if coding region hasn't changed, we're done if (sameString(oldCodingSequence, newCodingSequence)) return effectsList; + // start allocating the effect structure struct gpFx *effects; AllocVar(effects); slAddHead(&effectsList, effects); struct codingChange *cc = &effects->so.sub.codingChange; cc->transcript = cloneString(pred->name); int dnaChangeLength; cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength); int cdsChangeLength; cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength); if (*pred->strand == '-') cc->exonNumber = pred->exonCount - exonNum; else cc->exonNumber = exonNum; // calc codon change -int codonPosStart = (cc->cdsPosition / 3) * 3; -int codonPosEnd = ((cdsChangeLength + cc->cdsPosition) / 3) * 3; -int numCodons = (codonPosEnd - codonPosStart) / 3 + 1; -cc->codonChanges = codonChangeString( codonPosStart, numCodons, oldCodingSequence, newCodingSequence); +int codonPosStart = (cc->cdsPosition / 3) ; +int codonPosEnd = ((cdsChangeLength - 1 + cc->cdsPosition) / 3) ; +int numCodons = (codonPosEnd - codonPosStart + 1); +cc->codonChanges = codonChangeString( codonPosStart*3, numCodons, oldCodingSequence, newCodingSequence); +// by convention we zero out these fields if they aren't used +cc->pepPosition = 0; +cc->aaChanges = ""; +int numDashes; if (sameString(newaa->dna, oldaa->dna)) { // synonymous change effects->so.soNumber = synonymous_variant; - - // by convention we zero out these fields since they aren't used - cc->pepPosition = 0; - cc->aaChanges = ""; } else { - // non-synonymous change - effects->so.soNumber = non_synonymous_variant; - int numDifferent; cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent); cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, oldaa->dna, newaa->dna); + + if ((numDashes = countDashes(newCodingSequence)) != 0) + { + if ((numDashes % 3) == 0) + effects->so.soNumber = inframe_deletion; + else + effects->so.soNumber = frameshift_variant; + } + else + { + // non-synonymous change + effects->so.soNumber = non_synonymous_variant; + } } return effectsList; } struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* generate an effect from a different allele in an exon */ { char *newSequence = gpFxModifySequence(allele, pred, exonNum, transcriptPsl, transcriptSequence); if (sameString(newSequence, transcriptSequence->dna)) return NULL; // no change in transcript