c72b4b4412c574336911c4eb7135a9677c2773ef braney Wed Apr 25 14:07:45 2012 -0700 ongoing #6152. Added support for variants longer than 1 in CDS. diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index e8c3b82..5af7432 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -1,133 +1,162 @@ /* gpFx --- routines to calculate the effect of variation on a genePred */ #include "common.h" #include "genePred.h" #include "gpFx.h" + char *gpFxModifySequence(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* modify a transcript to what it'd be if the alternate allele were present */ { +// clip allele to exon +struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], + transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]); + // change transcript at variant point -int exonOffset = allele->variant->chromStart - transcriptPsl->tStarts[exonNum]; +int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum]; int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset; -if (allele->length != allele->variant->chromEnd - allele->variant->chromStart) +if (clipAllele->length != clipAllele->variant->chromEnd - clipAllele->variant->chromStart) errAbort("only support alleles the same length as the reference"); char *retSequence = cloneString(transcriptSequence->dna); -char *newAllele = cloneString(allele->sequence); +char *newAlleleSeq = cloneString(clipAllele->sequence); if (*pred->strand == '-') { transcriptOffset = transcriptSequence->size - (transcriptOffset + 1); - reverseComplement(newAllele, strlen(newAllele)); + reverseComplement(newAlleleSeq, strlen(newAlleleSeq)); } // make the change in the sequence -memcpy(&retSequence[transcriptOffset], newAllele, allele->length); +memcpy(&retSequence[transcriptOffset], newAlleleSeq, allele->length); // clean up -freeMem(newAllele); +freeMem(newAlleleSeq); return retSequence; } static char *getCodingSequence(struct genePred *pred, char *transcriptSequence) /* extract the CDS from a transcript */ { int ii; if (*pred->strand == '-') reverseComplement(transcriptSequence, strlen(transcriptSequence)); // trim off the 5' UTR ( or 3' if on the minus strand) char *ptr = transcriptSequence; for(ii=0; ii < pred->exonCount; ii++) { int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii]; - if (pred->cdsStart > pred->exonStarts[ii]) + if ((pred->cdsStart > pred->exonStarts[ii]) && + (pred->cdsStart < pred->exonEnds[ii])) break; ptr += exonSize; } // clip off part of UTR in exon that has CDS in it too int exonOffset = pred->cdsStart - pred->exonStarts[ii]; ptr += exonOffset; char *newString = cloneString(ptr); // trim off 3' (or 5' if on minus strand) newString[genePredCdsSize(pred)] = 0; // correct for strand if (*pred->strand == '-') { reverseComplement(transcriptSequence, strlen(transcriptSequence)); reverseComplement(newString, strlen(newString)); } return newString; } -static int firstChange(char *string1, char *string2) +static int firstChange(char *string1, char *string2, int *numDifferent) /* return the position of the first difference between the two sequences */ { int count = 0; while (*string1++ == *string2++) count++; +if (numDifferent != NULL) + { + *numDifferent = 1; + while ((*string1) && (*string2) && (*string1++ != *string2++)) + (*numDifferent)++; + } + return count; } static void getSequences(struct genePred *pred, char *transcriptSequence, char **codingSequence, aaSeq **aaSeq) /* get coding sequences for a transcript and a variant transcript */ { *codingSequence = getCodingSequence(pred, transcriptSequence); struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL); *aaSeq = translateSeq(codingDna, 0, FALSE); freez(codingDna); } static char * -codonChangeString(int codonPos, char *oldCodingSequence, char *newCodingSequence) +codonChangeString(int codonPos, int numCodons, char *oldCodingSequence, char *newCodingSequence) /* generate string that describes a codon change */ { -char buffer[100]; -safef(buffer, sizeof buffer, "%c%c%c > %c%c%c", +struct dyString *dy = newDyString(100); +int ii; + +for(ii=0; ii < numCodons; ii++) + { + dyStringPrintf(dy, "%c%c%c > %c%c%c", toupper(oldCodingSequence[codonPos + 0]), toupper(oldCodingSequence[codonPos + 1]), toupper(oldCodingSequence[codonPos + 2]), toupper(newCodingSequence[codonPos + 0]), toupper(newCodingSequence[codonPos + 1]), toupper(newCodingSequence[codonPos + 2])); + if (ii < numCodons - 1) + dyStringPrintf(dy, ","); + codonPos += 3; + } -return cloneString(buffer); +return dy->string; } static char * -aaChangeString(int pepPosition, char *oldaa, char *newaa) +aaChangeString(int pepPosition, int numaa, char *oldaa, char *newaa) /* generate string that describes an amino acid change */ { -char buffer[100]; -safef(buffer, sizeof buffer, "%c > %c", - toupper(oldaa[pepPosition]), toupper(newaa[pepPosition])); -return cloneString(buffer); +struct dyString *dy = newDyString(100); +int ii; + +for(ii=0; ii < numaa; ii++) + { + dyStringPrintf(dy, "%c > %c", + toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii])); + if (ii < numaa - 1) + dyStringPrintf(dy, ","); + } + +return dy->string; } struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, char *newSequence) /* check for effects in UTR of coding gene */ { if (positiveRangeIntersection(pred->txStart, pred->cdsStart, allele->variant->chromStart, allele->variant->chromEnd)) { // we're in 5' UTR ( or UTR intron ) errAbort("don't support variants in 5' UTR"); } if (positiveRangeIntersection(pred->txStart, pred->cdsStart, @@ -167,57 +196,63 @@ getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa); getSequences(pred, newSequence, &newCodingSequence, &newaa); // if coding region hasn't changed, we're done if (sameString(oldCodingSequence, newCodingSequence)) return effectsList; // start allocating the effect structure struct gpFx *effects; AllocVar(effects); slAddHead(&effectsList, effects); struct codingChange *cc = &effects->so.sub.codingChange; cc->transcript = cloneString(pred->name); -cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna); -cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence); +int dnaChangeLength; +cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength); +int cdsChangeLength; +cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength); if (*pred->strand == '-') cc->exonNumber = pred->exonCount - exonNum; else cc->exonNumber = exonNum; // calc codon change -int codonPos = (cc->cdsPosition / 3) * 3; -cc->codonChanges = codonChangeString( codonPos, oldCodingSequence, newCodingSequence); +int codonPosStart = (cc->cdsPosition / 3) * 3; +int codonPosEnd = ((cdsChangeLength + cc->cdsPosition) / 3) * 3; +int numCodons = (codonPosEnd - codonPosStart) / 3 + 1; +cc->codonChanges = codonChangeString( codonPosStart, numCodons, oldCodingSequence, newCodingSequence); if (sameString(newaa->dna, oldaa->dna)) { // synonymous change effects->so.soNumber = synonymous_variant; // by convention we zero out these fields since they aren't used cc->pepPosition = 0; cc->aaChanges = ""; } else { // non-synonymous change effects->so.soNumber = non_synonymous_variant; - cc->pepPosition = firstChange( newaa->dna, oldaa->dna); - cc->aaChanges = aaChangeString( cc->pepPosition, oldaa->dna, newaa->dna); + int numDifferent; + cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent); + cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, + oldaa->dna, newaa->dna); } return effectsList; } struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* generate an effect from a different allele in an exon */ { char *newSequence = gpFxModifySequence(allele, pred, exonNum, transcriptPsl, transcriptSequence); if (sameString(newSequence, transcriptSequence->dna)) return NULL; // no change in transcript