5c583c55ff0181c5f12ee21e6aa73bda6a5e7079 braney Mon May 21 14:47:12 2012 -0700 support for additional indel types in gpFx (#6152) diff --git src/hg/lib/gpFx.c src/hg/lib/gpFx.c index 1689539..d059d01 100644 --- src/hg/lib/gpFx.c +++ src/hg/lib/gpFx.c @@ -1,120 +1,149 @@ /* gpFx --- routines to calculate the effect of variation on a genePred */ #include "common.h" #include "genePred.h" #include "gpFx.h" - -unsigned countDashes(char *string) +static unsigned countDashes(char *string) +/* count the number of dashes in a string */ { int count = 0; while(*string) { if (*string == '-') count++; string++; } return count; } -static void mergeAllele(char *transcript, int variantWidth, +static char * mergeAllele(char *transcript, int offset, int variantWidth, char *newAlleleSeq, int alleleLength) +/* merge a variant into an allele */ { +char *newTranscript = transcript; + if (variantWidth == alleleLength) { // for the moment, we're sticking the dashes into the transcripts if (1) //noDashes(newAlleleSeq)) - memcpy(transcript, newAlleleSeq, alleleLength); + { + memcpy(&transcript[offset], newAlleleSeq, alleleLength); + } +#ifdef DOESNTKNOWABOUTTHEOFFSETARGUMENT + // if we decide not to put the dashes into the resulting transcripts + // this will have to be re-written to grok the offset argument else { char *transcriptSource = transcript; int ii; for(ii=0; ii < variantWidth; ii++) { if (*newAlleleSeq == '-') { transcriptSource++; } else { *transcript++ = *newAlleleSeq++; transcriptSource++; } } while(*transcriptSource) *transcript++ = *transcriptSource++; *transcript = 0; } +#endif } +else if (variantWidth > alleleLength) + errAbort("allele not padded"); +else + { + int insertionSize = alleleLength - variantWidth; + int newLength = strlen(transcript) + insertionSize; + newTranscript = needMem(newLength + 1); + char *restOfTranscript = &transcript[offset + variantWidth]; + + // copy over the part before the variant + memcpy(newTranscript, transcript, offset); + + // copy in the new allele + memcpy(&newTranscript[offset], newAlleleSeq, alleleLength); + + // copy in the new allele + memcpy(&newTranscript[offset + alleleLength], restOfTranscript, + strlen(restOfTranscript) + 1); + } + +return newTranscript; } char *gpFxModifySequence(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* modify a transcript to what it'd be if the alternate allele were present */ { // clip allele to exon struct allele *clipAllele = alleleClip(allele, transcriptPsl->tStarts[exonNum], transcriptPsl->tStarts[exonNum] + transcriptPsl->blockSizes[exonNum]); // change transcript at variant point int exonOffset = clipAllele->variant->chromStart - transcriptPsl->tStarts[exonNum]; int transcriptOffset = transcriptPsl->qStarts[exonNum] + exonOffset; int variantWidth = clipAllele->variant->chromEnd - clipAllele->variant->chromStart; -if (clipAllele->length != variantWidth) - errAbort("only support alleles the same length as the reference"); char *retSequence = cloneString(transcriptSequence->dna); char *newAlleleSeq = cloneString(clipAllele->sequence); if (*pred->strand == '-') { transcriptOffset = transcriptSequence->size - (transcriptOffset + strlen(newAlleleSeq)); reverseComplement(newAlleleSeq, strlen(newAlleleSeq)); } // make the change in the sequence -mergeAllele( &retSequence[transcriptOffset], variantWidth, newAlleleSeq, allele->length); +retSequence = mergeAllele( retSequence, transcriptOffset, variantWidth, newAlleleSeq, allele->length); // clean up freeMem(newAlleleSeq); return retSequence; } static char *getCodingSequence(struct genePred *pred, char *transcriptSequence) /* extract the CDS from a transcript */ { int ii; if (*pred->strand == '-') reverseComplement(transcriptSequence, strlen(transcriptSequence)); // trim off the 5' UTR ( or 3' if on the minus strand) char *ptr = transcriptSequence; for(ii=0; ii < pred->exonCount; ii++) { int exonSize = pred->exonEnds[ii] - pred->exonStarts[ii]; - if ((pred->cdsStart > pred->exonStarts[ii]) && + if ((pred->cdsStart >= pred->exonStarts[ii]) && (pred->cdsStart < pred->exonEnds[ii])) break; ptr += exonSize; } +assert (ii != pred->exonCount); // clip off part of UTR in exon that has CDS in it too int exonOffset = pred->cdsStart - pred->exonStarts[ii]; ptr += exonOffset; char *newString = cloneString(ptr); // trim off 3' (or 5' if on minus strand) newString[genePredCdsSize(pred)] = 0; // correct for strand if (*pred->strand == '-') { reverseComplement(transcriptSequence, strlen(transcriptSequence)); reverseComplement(newString, strlen(newString)); @@ -189,39 +218,30 @@ { struct dyString *dy = newDyString(100); int ii; for(ii=0; ii < numaa; ii++) { dyStringPrintf(dy, "%c > %c", toupper(oldaa[pepPosition+ii]), toupper(newaa[pepPosition+ii])); if (ii < numaa - 1) dyStringPrintf(dy, ","); } return dy->string; } -struct gpFx *gpFxInCdsDeletion( struct allele *allele, struct genePred *pred, - int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, - char *newSequence) -{ -struct gpFx *effects; -AllocVar(effects); -return effects; -} - struct gpFx *gpFxCheckUtr( struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence, char *newSequence) /* check for effects in UTR of coding gene */ { if (positiveRangeIntersection(pred->txStart, pred->cdsStart, allele->variant->chromStart, allele->variant->chromEnd)) { // we're in 5' UTR ( or UTR intron ) errAbort("don't support variants in 5' UTR"); } if (positiveRangeIntersection(pred->txStart, pred->cdsStart, allele->variant->chromStart, allele->variant->chromEnd)) { @@ -252,31 +272,30 @@ effectsList = gpFxCheckUtr(allele, pred, exonNum, transcriptPsl, transcriptSequence, newSequence); // check to see if coding sequence is changed // calculate original and variant coding AA's char *oldCodingSequence, *newCodingSequence; aaSeq *oldaa, *newaa; getSequences(pred, transcriptSequence->dna, &oldCodingSequence, &oldaa); getSequences(pred, newSequence, &newCodingSequence, &newaa); // if coding region hasn't changed, we're done if (sameString(oldCodingSequence, newCodingSequence)) return effectsList; - // start allocating the effect structure struct gpFx *effects; AllocVar(effects); slAddHead(&effectsList, effects); struct codingChange *cc = &effects->so.sub.codingChange; cc->transcript = cloneString(pred->name); int dnaChangeLength; cc->cDnaPosition = firstChange( newSequence, transcriptSequence->dna, &dnaChangeLength); int cdsChangeLength; cc->cdsPosition = firstChange( newCodingSequence, oldCodingSequence, &cdsChangeLength); if (*pred->strand == '-') cc->exonNumber = pred->exonCount - exonNum; else cc->exonNumber = exonNum; @@ -291,37 +310,43 @@ cc->aaChanges = ""; int numDashes; if (sameString(newaa->dna, oldaa->dna)) { // synonymous change effects->so.soNumber = synonymous_variant; } else { int numDifferent; cc->pepPosition = firstChange( newaa->dna, oldaa->dna, &numDifferent); cc->aaChanges = aaChangeString( cc->pepPosition, numDifferent, oldaa->dna, newaa->dna); + // this is assuming that deletions are marked with dashes + // in newCodingSequence if ((numDashes = countDashes(newCodingSequence)) != 0) { if ((numDashes % 3) == 0) effects->so.soNumber = inframe_deletion; else effects->so.soNumber = frameshift_variant; } + else if (strlen(newaa->dna) != strlen(oldaa->dna)) + { + effects->so.soNumber = inframe_insertion; + } else { // non-synonymous change effects->so.soNumber = non_synonymous_variant; } } return effectsList; } struct gpFx *gpFxInExon(struct allele *allele, struct genePred *pred, int exonNum, struct psl *transcriptPsl, struct dnaSeq *transcriptSequence) /* generate an effect from a different allele in an exon */ { @@ -384,36 +409,39 @@ { int ii; struct gpFx *effectsList = NULL; struct psl *transcriptPsl = genePredToPsl(pred); // should copy transcriptSequence if we have more than one variant!! for(; variant ; variant = variant->next) { struct allele *allele = variant->alleles; for(; allele ; allele = allele->next) { for(ii=0; ii < pred->exonCount; ii++) { // check if in an exon int size; + int end = variant->chromEnd; + if (variant->chromStart == end) + end++; if ((size = positiveRangeIntersection(pred->exonStarts[ii], pred->exonEnds[ii], - variant->chromStart, variant->chromEnd)) > 0) + variant->chromStart, end)) > 0) { - if (size != variant->chromEnd - variant->chromStart) + if (size != end - variant->chromStart) errAbort("variant crosses exon boundary"); effectsList = slCat(effectsList, gpFxInExon(allele, pred, ii, transcriptPsl, transcriptSequence)); } } } } return effectsList; } static struct gpFx *gpFxCheckIntrons(struct variant *variant, struct genePred *pred) // check to see if a single variant is in an exon or an intron