5429d29a0c4cc56674c446265b4775ce6f6de349 braney Mon Apr 23 15:02:55 2012 -0700 some clean up for #6152 diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index 1341d24..e8c3b82 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -1,200 +1,252 @@ +/* gpFx --- routines to calculate the effect of variation on a genePred */ + #include "common.h" #include "genePred.h" #include "gpFx.h" char *gpFxModifySequence(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* modify a transcript to what it'd be if the alternate allele were present */ { -//if ((pred->optFields & genePredExonFramesFld) == 0) - // genePredAddExonFrames(pred); - // change transcript at variant point int exonOffset = allele->variant->chromStart - transcriptPsl->tStarts[exonNum]; int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset; if (allele->length != allele->variant->chromEnd - allele->variant->chromStart) errAbort("only support alleles the same length as the reference"); char *retSequence = cloneString(transcriptSequence->dna); char *newAllele = cloneString(allele->sequence); if (*pred->strand == '-') { transcriptOffset = transcriptSequence->size - (transcriptOffset + 1); reverseComplement(newAllele, strlen(newAllele)); } // make the change in the sequence memcpy(&retSequence[transcriptOffset], newAllele, allele->length); // clean up freeMem(newAllele); return retSequence; } -char *getCodingSequence(struct genePred *pred, char *transcriptSequence) +static char *getCodingSequence(struct genePred *pred, char *transcriptSequence) /* extract the CDS from a transcript */ { int ii; if (*pred->strand == '-') reverseComplement(transcriptSequence, strlen(transcriptSequence)); -// trim off the 5' +// trim off the 5' UTR ( or 3' if on the minus strand) char *ptr = transcriptSequence; for(ii=0; ii < pred->exonCount; ii++) { int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii]; if (pred->cdsStart > pred->exonStarts[ii]) break; ptr += exonSize; } + +// clip off part of UTR in exon that has CDS in it too int exonOffset = pred->cdsStart - pred->exonStarts[ii]; ptr += exonOffset; char *newString = cloneString(ptr); -// trim off 3' +// trim off 3' (or 5' if on minus strand) newString[genePredCdsSize(pred)] = 0; +// correct for strand if (*pred->strand == '-') { reverseComplement(transcriptSequence, strlen(transcriptSequence)); reverseComplement(newString, strlen(newString)); } return newString; } static int firstChange(char *string1, char *string2) /* return the position of the first difference between the two sequences */ { int count = 0; while (*string1++ == *string2++) count++; return count; } -struct gpFx *gpFxInCodingExon(struct allele *allele, struct genePred *pred, - int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) -/* generate an effect from a different allele in a coding exon */ +static void getSequences(struct genePred *pred, char *transcriptSequence, + char **codingSequence, aaSeq **aaSeq) +/* get coding sequences for a transcript and a variant transcript */ +{ +*codingSequence = getCodingSequence(pred, transcriptSequence); +struct dnaSeq *codingDna = newDnaSeq(*codingSequence, strlen(*codingSequence), NULL); +*aaSeq = translateSeq(codingDna, 0, FALSE); +freez(codingDna); +} + +static char * +codonChangeString(int codonPos, char *oldCodingSequence, char *newCodingSequence) +/* generate string that describes a codon change */ +{ +char buffer[100]; +safef(buffer, sizeof buffer, "%c%c%c > %c%c%c", + toupper(oldCodingSequence[codonPos + 0]), + toupper(oldCodingSequence[codonPos + 1]), + toupper(oldCodingSequence[codonPos + 2]), + toupper(newCodingSequence[codonPos + 0]), + toupper(newCodingSequence[codonPos + 1]), + toupper(newCodingSequence[codonPos + 2])); + +return cloneString(buffer); +} + +static char * +aaChangeString(int pepPosition, char *oldaa, char *newaa) +/* generate string that describes an amino acid change */ +{ +char buffer[100]; +safef(buffer, sizeof buffer, "%c > %c", + toupper(oldaa[pepPosition]), toupper(newaa[pepPosition])); +return cloneString(buffer); +} + +struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, + char *newSequence) +/* check for effects in UTR of coding gene */ +{ +if (positiveRangeIntersection(pred->txStart, pred->cdsStart, + allele->variant->chromStart, allele->variant->chromEnd)) + { + // we're in 5' UTR ( or UTR intron ) + errAbort("don't support variants in 5' UTR"); + } + +if (positiveRangeIntersection(pred->txStart, pred->cdsStart, + allele->variant->chromStart, allele->variant->chromEnd)) + { + // we're in 3' UTR + errAbort("don't support variants in 3' UTR"); + } + +return NULL; +} + +struct gpFx *gpFxChangedNoncodingTranscript( struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, + char *newSequence) +/* generate an effect for a variant in a non-coding transcript */ +{ +return NULL; +// errAbort("found a change in non-coding gene. we don't support non-coding genes at the moment"); +} + +struct gpFx *gpFxChangedCodingTranscript( struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, + char *newSequence) +/* calculate effect of allele change on coding transcript */ { struct gpFx *effectsList = NULL; -char *newSequence = gpFxModifySequence(allele, pred, exonNum, - transcriptPsl, transcriptSequence); -if (sameString(newSequence, transcriptSequence->dna)) - return effectsList; // no change in transcript +// first find effects of allele in UTR +effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, + transcriptSequence, newSequence); // check to see if coding sequence is changed -// calculate original coding AA's -char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna); -struct dnaSeq *oldCodingDna = newDnaSeq(oldCodingSequence, - strlen(oldCodingSequence), pred->name); -aaSeq *oldaa = translateSeq(oldCodingDna, 0, FALSE); - -// calculate variant coding AA's -char *newCodingSequence = getCodingSequence(pred, newSequence); -struct dnaSeq *newCodingDna = newDnaSeq(newCodingSequence, - strlen(newCodingSequence), pred->name); -aaSeq *newaa = translateSeq(newCodingDna, 0, FALSE); +// calculate original and variant coding AA's +char *oldCodingSequence, *newCodingSequence; +aaSeq *oldaa, *newaa; + +getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa); +getSequences(pred, newSequence, &newCodingSequence, &newaa); +// if coding region hasn't changed, we're done +if (sameString(oldCodingSequence, newCodingSequence)) + return effectsList; + +// start allocating the effect structure struct gpFx *effects; AllocVar(effects); slAddHead(&effectsList, effects); -effects->so.sub.codingChange.transcript = cloneString(pred->name); -effects->so.sub.codingChange.cDnaPosition = firstChange( newSequence, - transcriptSequence->dna); -effects->so.sub.codingChange.cdsPosition = firstChange( newCodingSequence, - oldCodingSequence); + +struct codingChange *cc = &effects->so.sub.codingChange; +cc->transcript = cloneString(pred->name); +cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna); +cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence); if (*pred->strand == '-') - effects->so.sub.codingChange.exonNumber = pred->exonCount - exonNum; + cc->exonNumber = pred->exonCount - exonNum; else - effects->so.sub.codingChange.exonNumber = exonNum; - -int codonPos = (effects->so.sub.codingChange.cdsPosition / 3) * 3; - -char buffer[100]; -safef(buffer, sizeof buffer, "%c%c%c > %c%c%c", - toupper(oldCodingSequence[codonPos + 0]), - toupper(oldCodingSequence[codonPos + 1]), - toupper(oldCodingSequence[codonPos + 2]), - toupper(newCodingSequence[codonPos + 0]), - toupper(newCodingSequence[codonPos + 1]), - toupper(newCodingSequence[codonPos + 2])); + cc->exonNumber = exonNum; -effects->so.sub.codingChange.codonChanges = cloneString(buffer); +// calc codon change +int codonPos = (cc->cdsPosition / 3) * 3; +cc->codonChanges = codonChangeString( codonPos, oldCodingSequence, newCodingSequence); if (sameString(newaa->dna, oldaa->dna)) { // synonymous change effects->so.soNumber = synonymous_variant; - effects->so.sub.codingChange.pepPosition = 0; - effects->so.sub.codingChange.aaChanges = ""; + + // by convention we zero out these fields since they aren't used + cc->pepPosition = 0; + cc->aaChanges = ""; } else { // non-synonymous change effects->so.soNumber = non_synonymous_variant; - effects->so.sub.codingChange.pepPosition = firstChange( newaa->dna, - oldaa->dna); - safef(buffer, sizeof buffer, "%c > %c", - toupper(oldaa->dna[effects->so.sub.codingChange.pepPosition]), - toupper(newaa->dna[effects->so.sub.codingChange.pepPosition])); - effects->so.sub.codingChange.aaChanges = cloneString(buffer); + cc->pepPosition = firstChange( newaa->dna, oldaa->dna); + cc->aaChanges = aaChangeString( cc->pepPosition, oldaa->dna, newaa->dna); } return effectsList; } struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) +/* generate an effect from a different allele in an exon */ { -struct gpFx *effectsList = NULL; - -if (positiveRangeIntersection(pred->cdsStart, pred->cdsEnd, - allele->variant->chromStart, allele->variant->chromEnd)) - { - // we're in CDS - effectsList = slCat(effectsList, gpFxInCodingExon(allele, pred, exonNum, - transcriptPsl, transcriptSequence)); - } +char *newSequence = gpFxModifySequence(allele, pred, exonNum, + transcriptPsl, transcriptSequence); -if (positiveRangeIntersection(pred->txStart, pred->cdsStart, - allele->variant->chromStart, allele->variant->chromEnd)) - { - // we're in 5' UTR ( or UTR intron ) - } +if (sameString(newSequence, transcriptSequence->dna)) + return NULL; // no change in transcript -if (positiveRangeIntersection(pred->txStart, pred->cdsStart, - allele->variant->chromStart, allele->variant->chromEnd)) - { - // we're in 3' UTR - } +struct gpFx *effectsList; +if (pred->cdsStart != pred->cdsEnd) + effectsList = gpFxChangedCodingTranscript( allele, pred, exonNum, transcriptPsl, + transcriptSequence, newSequence); +else + effectsList = gpFxChangedNoncodingTranscript( allele, pred, exonNum, transcriptPsl, + transcriptSequence, newSequence); return effectsList; } -struct psl *genePredToPsl(struct genePred *pred) +static struct psl *genePredToPsl(struct genePred *pred) +/* generate a PSL alignment to a transcript from a genePred */ { int qSize = genePredBases(pred); #define BOGUS_CHROM_SIZE 0 struct psl *psl = pslNew(pred->name, qSize, 0, qSize, pred->chrom, BOGUS_CHROM_SIZE, pred->txStart, pred->txEnd, pred->strand, pred->exonCount, 0); psl->match = psl->qSize; int i, qNext = 0; for (i = 0; i < pred->exonCount; i++) { int exonSize = pred->exonEnds[i] - pred->exonStarts[i]; psl->qStarts[i] = qNext;