3ec9dec14cfbbe94f51eb94dd0cc861cad375f97 braney Tue Apr 17 18:58:18 2012 -0700 ongoing work on #6152. Now with non-synonymous changes! (well, at least with respect to the genomic DNA) diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index dc93d33..c935389 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -1,77 +1,223 @@ #include "common.h" #include "genePred.h" #include "gpFx.h" -struct gpFx *gpFxInCodingExon(struct variant *variant, struct genePred *pred, - int exonNum, char **returnTranscript, char **returnCoding) +char *gpFxModifySequence(struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) { //if ((pred->optFields & genePredExonFramesFld) == 0) // genePredAddExonFrames(pred); -return NULL; +// change transcript at variant point +int exonOffset = allele->variant->chromStart - transcriptPsl->tStarts[exonNum]; +int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset; + +if (allele->length != allele->variant->chromEnd - allele->variant->chromStart) + errAbort("only support alleles the same length as the reference"); + +char *retSequence = cloneString(transcriptSequence->dna); +memcpy(&transcriptSequence->dna[transcriptOffset], allele->sequence, + allele->length); + +return retSequence; +} + +char *getCodingSequence(struct genePred *pred, char *transcriptSequence) +{ +int ii; + +// trim off the 5' +char *ptr = transcriptSequence; +for(ii=0; ii < pred->exonCount; ii++) + { + int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii]; + if (pred->cdsStart > pred->exonStarts[ii]) + break; + + ptr += exonSize; + } +int exonOffset = pred->cdsStart - pred->exonStarts[ii]; +ptr += exonOffset; + +char *newString = cloneString(ptr); + +// trim off 3' +newString[genePredCdsSize(pred)] = 0; + +return newString; } -struct gpFx *gpFxInExon(struct variant *variant, struct genePred *pred, - int exonNum, char **returnTranscript, char **returnCoding) +int firstChange(char *string1, char *string2) +/* return the position of the first difference between the two sequences */ +{ +int count = 0; + +while (*string1++ == *string2++) + count++; + +return count; +} + +struct gpFx *gpFxInCodingExon(struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) +/* generate an effect from a different allele in a coding exon */ +{ +struct gpFx *effectsList = NULL; +char *newSequence = gpFxModifySequence(allele, pred, exonNum, + transcriptPsl, transcriptSequence); + +if (sameString(newSequence, transcriptSequence->dna)) + return effectsList; // no change in transcript + +// check to see if coding sequence is changed +// calculate original coding AA's +char *oldCodingSequence = getCodingSequence(pred, transcriptSequence->dna); +struct dnaSeq *oldCodingDna = newDnaSeq(oldCodingSequence, + strlen(oldCodingSequence), pred->name); +aaSeq *oldaa = translateSeq(oldCodingDna, 0, FALSE); + +// calculate variant coding AA's +char *newCodingSequence = getCodingSequence(pred, newSequence); +struct dnaSeq *newCodingDna = newDnaSeq(newCodingSequence, + strlen(newCodingSequence), pred->name); +aaSeq *newaa = translateSeq(newCodingDna, 0, FALSE); + +//transcript ID, exon number(s), cDNA position, CDS position, peptide position, alternate amino acids, alternate codons" 9 + +if (sameString(newaa->dna, oldaa->dna)) + { + // synonymous change + } +else + { + // non-synonymous change + struct gpFx *effects; + AllocVar(effects); + effects->so.soNumber = non_synonymous_variant; + effects->so.sub.codingChange.transcript = cloneString(pred->name); + effects->so.sub.codingChange.exonNumber = exonNum; + effects->so.sub.codingChange.cDnaPosition = firstChange( newSequence, + transcriptSequence->dna); + effects->so.sub.codingChange.cdsPosition = firstChange( newCodingSequence, + oldCodingSequence); + effects->so.sub.codingChange.pepPosition = firstChange( newaa->dna, + oldaa->dna); + /* + char *aaChanges; + char *codonChanges; + */ + slAddHead(&effectsList, effects); + } + +return effectsList; +} + + +struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, + int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) { struct gpFx *effectsList = NULL; if (positiveRangeIntersection(pred->cdsStart, pred->cdsEnd, - variant->chromStart, variant->chromEnd)) + allele->variant->chromStart, allele->variant->chromEnd)) { // we're in CDS - effectsList = slCat(effectsList, gpFxInCodingExon(variant, pred, exonNum, - returnTranscript, returnCoding)); + effectsList = slCat(effectsList, gpFxInCodingExon(allele, pred, exonNum, + transcriptPsl, transcriptSequence)); } if (positiveRangeIntersection(pred->txStart, pred->cdsStart, - variant->chromStart, variant->chromEnd)) + allele->variant->chromStart, allele->variant->chromEnd)) { // we're in 5' UTR ( or UTR intron ) } if (positiveRangeIntersection(pred->txStart, pred->cdsStart, - variant->chromStart, variant->chromEnd)) + allele->variant->chromStart, allele->variant->chromEnd)) { // we're in 3' UTR } return effectsList; } +struct psl *genePredToPsl(struct genePred *pred) +{ +int qSize = genePredBases(pred); +#define BOGUS_CHROM_SIZE 0 +struct psl *psl = pslNew(pred->name, qSize, 0, qSize, + pred->chrom, BOGUS_CHROM_SIZE, + pred->txStart, pred->txEnd, + pred->strand, pred->exonCount, 0); +psl->match = psl->qSize; + +int i, qNext = 0; +for (i = 0; i < pred->exonCount; i++) + { + int exonSize = pred->exonEnds[i] - pred->exonStarts[i]; + + psl->qStarts[i] = qNext; + //psl->tStarts[i] = pred->exonStarts[i] + pred->txStart; + psl->tStarts[i] = pred->exonStarts[i]; + psl->blockSizes[i] = exonSize; + +/* notnow + if (i > 0) + { + psl->tNumInsert += 1; + psl->tBaseInsert += psl->tStarts[i] - pslTEnd(psl, i-1); + } +*/ + + psl->blockCount++; + qNext += psl->blockSizes[i]; + } + +return psl; +} static struct gpFx *gpFxCheckExons(struct variant *variant, - struct genePred *pred, char **returnTranscript, char **returnCoding) + struct genePred *pred, struct dnaSeq *transcriptSequence) // check to see if the variant is in an exon { int ii; struct gpFx *effectsList = NULL; +struct psl *transcriptPsl = genePredToPsl(pred); +// should copy transcriptSequence if we have more than one variant!! for(; variant ; variant = variant->next) { + struct allele *allele = variant->alleles; + for(; allele ; allele = allele->next) + { for(ii=0; ii < pred->exonCount; ii++) { // check if in an exon - if (positiveRangeIntersection(pred->exonStarts[ii], pred->exonEnds[ii], - variant->chromStart, variant->chromEnd)) + int size; + if ((size = positiveRangeIntersection(pred->exonStarts[ii], + pred->exonEnds[ii], + variant->chromStart, variant->chromEnd)) > 0) { - // we know we've changed the transcript, start to track it - effectsList = slCat(effectsList, gpFxInExon(variant, pred, ii, - returnTranscript, returnCoding)); + if (size != variant->chromEnd - variant->chromStart) + errAbort("variant crosses exon boundary"); + + effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii, + transcriptPsl, transcriptSequence)); + } } } } return effectsList; } static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred) // check to see if a single variant is in an exon or an intron { int ii; struct gpFx *effectsList = NULL; for(ii=0; ii < pred->exonCount - 1; ii++) @@ -133,42 +279,42 @@ static void checkVariantList(struct variant *variant) // check to see that we either have one variant (possibly with multiple // alleles) or that if we have a list of variants, they only have // one allele a piece. { if (variant->next == NULL) // just one variant return; for(; variant; variant = variant->next) if (variant->numAlleles != 1) errAbort("gpFxPredEffect needs either 1 variant, or only 1 allele in all variants"); } struct gpFx *gpFxPredEffect(struct variant *variant, struct genePred *pred, - char **returnTranscript, char **returnCoding) + struct dnaSeq *transcriptSequence) // return the predicted effect(s) of a variation list on a genePred { struct gpFx *effectsList = NULL; // make sure we can deal with the variants that are coming in checkVariantList(variant); // check to see if SNP is up or downstream in intron effectsList = slCat(effectsList, gpFxCheckBackground(variant, pred)); // check to see if SNP is in the transcript effectsList = slCat(effectsList, gpFxCheckExons(variant, pred, - returnTranscript, returnCoding)); + transcriptSequence)); if (effectsList != NULL) return effectsList; // default is no effect struct gpFx *noEffect; AllocVar(noEffect); noEffect->next = NULL; ;//oEffect->gpFxType = gpFxNone; return noEffect; }