e6cd18ccbe6e255de8cb54bc70da0e42fe9a73b5 markd Mon Aug 17 19:17:10 2020 -0700 add report of location of frameshifts for use by CAT diff --git src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c index eb949c5..9773bff 100644 --- src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c +++ src/hg/utils/transMapPslToGenePred/transMapPslToGenePred.c @@ -40,38 +40,41 @@ "This is an alternative to mrnaToGene which determines CDS and frame from\n" "the original annotation, which may have been imported from GFF/GTF. This\n" "was created because the genbankCds structure use by mrnaToGene doesn't\n" "handle partial start/stop codon or programmed frame shifts. This requires\n" "handling the list of CDS regions and the /codon_start attribute, At some\n" "point, this program may be extended to do handle genbank alignments correctly.\n" "\n" "Options:\n" " -nonCodingGapFillMax=0 - fill gaps in non-coding regions up to this many bases\n" " in length.\n" " -codingGapFillMax=0 - fill gaps in coding regions up to this many bases\n" " in length. Only coding gaps that are a multiple of three will be fill,\n" " with the max rounded down.\n" " -noBlockMerge - don't do any block merging of genePred, even of adjacent blocks.\n" " This is mainly for debugging.\n" + " -frameShifts=tsv - Write TSV with locations of frame-shifting indels. The coordinates\n" + " give a context for the shift for browsing, not an exact location.\n" "\n"); } /* command line option specifications */ static struct optionSpec optionSpecs[] = { {"nonCodingGapFillMax", OPTION_INT}, {"codingGapFillMax", OPTION_INT}, {"noBlockMerge", OPTION_BOOLEAN}, + {"frameShifts", OPTION_STRING}, {NULL, 0} }; static int nonCodingGapFillMax = 0; static int codingGapFillMax = 0; static boolean noBlockMerge = FALSE; static void swapBoolean(boolean *a, boolean *b) /* swap two booleans */ { boolean hold = *a; *a = *b; *b = hold; } static int frameIncr(int frame, int amt) @@ -83,39 +86,30 @@ else if (amt >= 0) return (frame + amt) % 3; else { int amt3 = (-amt) % 3; return (frame - (amt - amt3)) % 3; } } static int genePredExonSize(struct genePred* gp, int iExon) /* calculate size of an exon in a genePred */ { return gp->exonEnds[iExon] - gp->exonStarts[iExon]; } -static int genePredSize(struct genePred* gp) -/* calculate size of an all of the exons in a genePred */ -{ -int iExon, size = 0; -for (iExon = 0; iExon < gp->exonCount; iExon++) - size += genePredExonSize(gp, iExon); -return size; -} - static int pslBlockQueryToTarget(struct psl *psl, int iBlock, int pos) /* project a position from query to target. Position maybe a start * or end */ { assert((pslQStart(psl, iBlock) <= pos) && (pos <= pslQEnd(psl, iBlock))); return pslTStart(psl, iBlock) + pos - pslQStart(psl, iBlock); } static boolean srcGenePredCheckSameStruct(struct genePred *gp, struct genePred *gp0) /* check that duplicate genePred frames and exon sizes are the same (PAR * case). */ { int iExon; if (gp->exonCount != gp0->exonCount) @@ -150,42 +144,68 @@ } static struct hash* srcGenePredLoad(char* srcGenePredFile) /* load genepreds into a hash by name */ { struct hash* srcGenePredMap = hashNew(0); struct genePred *gp, *gps = genePredReaderLoadFile(srcGenePredFile, NULL); while ((gp = slPopHead(&gps)) != NULL) srcGenePredAdd(srcGenePredMap, gp); return srcGenePredMap; } static struct genePred* srcGenePredFind(struct hash* srcGenePredMap, char* qName, int qSize) { -// ignore unique suffix everything after the last -char qNameBase[512]; -safecpy(qNameBase, sizeof(qNameBase), qName); -char *dash = strrchr(qNameBase, '-'); +// try looking up with or without the unique suffix, which are strings +// appended like '-1' or '-1.2' +boolean haveSuffix = strrchr(qName, '-') != NULL; + +struct genePred* gp = NULL; +if (!haveSuffix) + { + gp = hashFindVal(srcGenePredMap, qName); + if (gp == NULL) + errAbort("can't find source genePred for %s", qName); + } +else + { + // try removing single `.N' which would be added during mapping + char qNameBase1[512]; + safecpy(qNameBase1, sizeof(qNameBase1), qName); + char *dot = strrchr(qNameBase1, '.'); + if (dot != NULL) + { + *dot = '\0'; + gp = hashFindVal(srcGenePredMap, qNameBase1); + } + if (gp == NULL) + { + // try removing full unique extension + char qNameBase2[512]; + safecpy(qNameBase2, sizeof(qNameBase2), qName); + char *dash = strrchr(qNameBase2, '-'); if (dash != NULL) *dash = '\0'; -struct genePred* gp = hashFindVal(srcGenePredMap, qNameBase); + gp = hashFindVal(srcGenePredMap, qNameBase2); if (gp == NULL) - errAbort("can't find source genePred for %s with name %s", qName, qNameBase); -if (genePredSize(gp) != qSize) + errAbort("can't find source genePred for %s with name %s or %s", qName, qNameBase1, qNameBase2); + } + } +if (genePredBases(gp) != qSize) errAbort("size of query computed from genePred %s (%d), doesn't match expected %s (%d)", - qNameBase, genePredSize(gp), qName, qSize); + gp->name, genePredBases(gp), qName, qSize); return gp; } struct range /* A range for function returns */ { int start; int end; }; struct srcQueryExon /* source query exon ranges, cds, and frame. These are all in to positive * strand coordinates. */ { int qStart; // computed query range of exon @@ -260,31 +280,31 @@ assert(srcQueryExon->qCdsStart < srcQueryExon->qCdsEnd); assert(srcQueryExon->qStart <= srcQueryExon->qCdsStart); assert(srcQueryExon->qCdsEnd <= srcQueryExon->qEnd); assert(srcQueryExon->frame >= 0); } return qEnd; } static void srcQueryExonBuild(struct genePred* srcGp, struct srcQueryExon* srcQueryExons) /* create a list of query positions to frames. Array should hold all * be the length of the src */ { // building in strand-order generates correct coordinate int qStart = 0; -int qSize = genePredSize(srcGp); +int qSize = genePredBases(srcGp); int iExon; if (srcGp->strand[0] == '+') { for (iExon = 0; iExon < srcGp->exonCount; iExon++) qStart = srcQueryExonMake(srcGp, qStart, iExon, qSize, &(srcQueryExons[iExon])); } else { for (iExon = srcGp->exonCount-1; iExon >= 0; iExon--) qStart = srcQueryExonMake(srcGp, qStart, iExon, qSize, &(srcQueryExons[iExon])); } } struct srcQueryExon srcQueryExonReverse(struct srcQueryExon* srcQueryExon) /* reverse a srcQueryExon structure */ @@ -477,63 +497,103 @@ genePredAllFlds, currentExonSpace); mappedGp->score = srcGp->score; mappedGp->name2 = cloneString(srcGp->name2); int iBlock; for (iBlock = 0; iBlock < mappedPsl->blockCount; iBlock++) convertPslBlock(mappedPsl, iBlock, srcGp, srcQueryExons, mappedGp, ¤tExonSpace, &mappedCdsBounds); if (mappedCdsBounds.cdsStart < 0) finishWithOutCds(mappedGp); else finishWithCds(srcGp, mappedGp, &mappedCdsBounds); return mappedGp; } -static boolean haveAdjacentMergedFrames(struct genePred *mappedGp, int iExon) -/* Do two block have adjacent frames and will merging them keep frames contiguous? */ +static int getExonCdsStart(struct genePred *gp, int iExon) +/* get start of CDS in an exon */ +{ +return max(gp->exonStarts[iExon], gp->cdsStart); +} + +static int getExonCdsEnd(struct genePred *gp, int iExon) +/* get end of CDS in an exon */ +{ +return min(gp->exonEnds[iExon], gp->cdsEnd); +} + +static int getExonCdsLen(struct genePred *gp, int iExon) +/* get the length of the the CDS in an exon */ +{ +int start, end; +genePredCdsExon(gp, iExon, &start, &end); +return end - start; +} + + +static int getExonEndFrame(struct genePred *gp, int iExon) +/* get the end frame of exon (half-open) */ +{ +if (gp->exonFrames[iExon] == -1) + return -1; +else + return frameIncr(gp->exonFrames[iExon], getExonCdsLen(gp, iExon)); +} + +static int codonBaseRoundDown(int bases) +/* round down to an even multiple of three */ +{ +return 3 * (bases / 3); +} + +static int wholeCodonBases(struct genePred *gp, int iExon) +/* the number of whole codon bases in an exon */ +{ +return codonBaseRoundDown(getExonCdsLen(gp, iExon) - gp->exonFrames[iExon]); +} + +static boolean haveMergableFrames(struct genePred *mappedGp, int iExon) +/* Do two block have frames that can be merged keeping frames contiguous? */ { assert(mappedGp->exonFrames[iExon] >= 0); assert(mappedGp->exonFrames[iExon+1] >= 0); int gapSize = (mappedGp->exonStarts[iExon+1] - mappedGp->exonEnds[iExon]); if (mappedGp->strand[0] == '+') { // how much of CDS is in this exon - int cdsOff = max(mappedGp->exonStarts[iExon], mappedGp->cdsStart) - mappedGp->exonStarts[iExon]; - int cdsIncr = (genePredExonSize(mappedGp, iExon) - cdsOff) + gapSize; + int cdsIncr = getExonCdsLen(mappedGp, iExon) + gapSize; return (frameIncr(mappedGp->exonFrames[iExon], cdsIncr) == mappedGp->exonFrames[iExon+1]); } else { // how much of CDS is in next exon - int cdsOff = mappedGp->exonEnds[iExon+1] - min(mappedGp->exonEnds[iExon+1], mappedGp->cdsEnd); - int cdsIncr = (genePredExonSize(mappedGp, iExon+1) - cdsOff) + gapSize; + int cdsIncr = getExonCdsLen(mappedGp, iExon+1) + gapSize; return (frameIncr(mappedGp->exonFrames[iExon+1], cdsIncr) == mappedGp->exonFrames[iExon]); } } static boolean hasAdjacentNonCoding(struct genePred *mappedGp, int iExon) /* are one or both adjacent blocks non-coding? */ { return (mappedGp->exonFrames[iExon] == -1) || (mappedGp->exonFrames[iExon+1] == -1); } static boolean canMergeFrames(struct genePred *mappedGp, int iExon) /* check if frames can be merged, -1 frames merge with with any adjacent * frame */ { -return hasAdjacentNonCoding(mappedGp, iExon) || haveAdjacentMergedFrames(mappedGp, iExon); +return hasAdjacentNonCoding(mappedGp, iExon) || haveMergableFrames(mappedGp, iExon); } static boolean canMergeBlocks(struct genePred *mappedGp, int iExon) /* check if blocks are adjacent and can be merged with consistent frame */ { int maxGapFill = hasAdjacentNonCoding(mappedGp, iExon) ? nonCodingGapFillMax : codingGapFillMax; return ((mappedGp->exonStarts[iExon+1] - mappedGp->exonEnds[iExon]) <= maxGapFill) && canMergeFrames(mappedGp, iExon); } static void mergeAdjacentFrames(struct genePred *mappedGp, int iExon) /* merge frames for two adjacent blocks that are being merged */ { // update frame, handling case were one block has CDS and the other doesn't @@ -574,74 +634,202 @@ } static void mergeGenePredBlocks(struct genePred *mappedGp) /* merge adjacent genePred `exons' as long as they have consistent frame */ { int iExon = 0; while (iExon < mappedGp->exonCount-1) { if (canMergeBlocks(mappedGp, iExon)) mergeAdjacentBlocks(mappedGp, iExon); else iExon++; } } -static void convertPsl(struct hash* srcGenePredMap, struct psl *mappedPsl, FILE* genePredOutFh) +static void prFrameShift(struct genePred *mappedGp, int chromStart, int chromEnd, + int shift, char *location, FILE *frameShiftsFh) +/* print a frame description to the TSV */ +{ +fprintf(frameShiftsFh, "%s\t%s\t%d\t%d\t%s\t%d\t%s\n", mappedGp->name, mappedGp->chrom, + chromStart, chromEnd, mappedGp->strand, shift, location); +} + +static boolean haveConsistentFrames(struct genePred *mappedGp, int iExon) +/* is frame between iExon and iExon+1 consistent (no frame shift)? */ +{ +if (mappedGp->strand[0] == '+') + return getExonEndFrame(mappedGp, iExon) == mappedGp->exonFrames[iExon+1]; +else + return getExonEndFrame(mappedGp, iExon+1) == mappedGp->exonFrames[iExon]; +} + +static void checkStartFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh) +/* check if first codon is truncated at the start */ +{ +if (mappedGp->exonFrames[iExon] != 0) + { + int start, end; + if (mappedGp->strand[0] == '+') + { + start = getExonCdsStart(mappedGp, iExon); + end = min(start + (3 - mappedGp->exonFrames[iExon]), + mappedGp->exonEnds[iExon]); + } + else + { + end = getExonCdsEnd(mappedGp, iExon); + start = max(end - (3 - mappedGp->exonFrames[iExon]), + mappedGp->exonStarts[iExon]); + } + prFrameShift(mappedGp, start, end, mappedGp->exonFrames[iExon], "first", frameShiftsFh); + } +} + +static void checkEndFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh) +/* check if last codon is truncated, causing a frameshift */ +{ +if (getExonEndFrame(mappedGp, iExon) != 0) + { + int start, end; + if (mappedGp->strand[0] == '+') + { + end = getExonCdsEnd(mappedGp, iExon); + start = max(end - (3 - mappedGp->exonFrames[iExon]), + mappedGp->exonStarts[iExon]); + } + else + { + start = getExonCdsStart(mappedGp, iExon); + end = min(start + (3 - mappedGp->exonFrames[iExon]), + mappedGp->exonEnds[iExon]); + } + int shift = getExonCdsLen(mappedGp, iExon) - mappedGp->exonFrames[iExon] - wholeCodonBases(mappedGp, iExon); + prFrameShift(mappedGp, start, end, shift, "last", frameShiftsFh); + } +} + +static void checkInternalFrameShift(struct genePred *mappedGp, int iExon, FILE* frameShiftsFh) +/* check if there is a frame shift between iExon and iExon+1 */ +{ +if (!haveConsistentFrames(mappedGp, iExon)) + { + // get full codons before and after as bounds + int start, end, shift; + if (mappedGp->strand[0] == '+') + { + start = max(getExonCdsStart(mappedGp, iExon) + mappedGp->exonFrames[iExon] + wholeCodonBases(mappedGp, iExon) - 3, + mappedGp->exonStarts[iExon]); + end = min(mappedGp->exonStarts[iExon+1] + mappedGp->exonFrames[iExon+1] + 3, + mappedGp->exonEnds[iExon+1]); + shift = abs(frameIncr(mappedGp->exonFrames[iExon], getExonCdsLen(mappedGp, iExon)) - mappedGp->exonFrames[iExon+1]); + } + else + { + start = max(getExonCdsStart(mappedGp, iExon) + getExonEndFrame(mappedGp, iExon) + wholeCodonBases(mappedGp, iExon) - 3, + mappedGp->exonStarts[iExon]); + end = min(mappedGp->exonStarts[iExon+1] + (3 - mappedGp->exonFrames[iExon+1]), + mappedGp->exonEnds[iExon+1]); + shift = abs(frameIncr(mappedGp->exonFrames[iExon+1], getExonCdsLen(mappedGp, iExon+1)) - mappedGp->exonFrames[iExon]); + } + prFrameShift(mappedGp, start, end, shift, "internal", frameShiftsFh); + } +} + +static void reportFrameShifts(struct genePred *mappedGp, FILE* frameShiftsFh) +/* report frame shifts found it a transcript, including truncated start and end */ +{ +assert(mappedGp->cdsStart < mappedGp->cdsEnd); + +int iExon = 0; + +// find first with frame +for (; mappedGp->exonFrames[iExon] == -1; iExon++) + continue; + +// first codon +if (mappedGp->strand[0] == '+') + checkStartFrameShift(mappedGp, iExon, frameShiftsFh); +else + checkEndFrameShift(mappedGp, iExon, frameShiftsFh); + +// internal exons +for (; (iExon < mappedGp->exonCount - 1) && (mappedGp->exonFrames[iExon + 1] > -1); iExon++) + checkInternalFrameShift(mappedGp, iExon, frameShiftsFh); + +// last codon +if (mappedGp->strand[0] == '+') + checkEndFrameShift(mappedGp, iExon, frameShiftsFh); +else + checkStartFrameShift(mappedGp, iExon, frameShiftsFh); +} + +static void convertPsl(struct hash* srcGenePredMap, struct psl *mappedPsl, FILE* genePredOutFh, + FILE* frameShiftsFh) /* convert a single PSL */ { if (pslTStrand(mappedPsl) == '-') pslRc(mappedPsl); // target must be `+' struct genePred* srcGp = srcGenePredFind(srcGenePredMap, mappedPsl->qName, mappedPsl->qSize); if (genePredBases(srcGp) != mappedPsl->qSize) errAbort("srcGenePred %s exon size %d does not match mappedPsl %s qSize %d", srcGp->name, genePredBases(srcGp), mappedPsl->qName, mappedPsl->qSize); struct srcQueryExon srcQueryExons[srcGp->exonCount]; srcQueryExonBuild(srcGp, srcQueryExons); struct genePred *mappedGp = createGenePred(srcGp, srcQueryExons, mappedPsl); if (!noBlockMerge) mergeGenePredBlocks(mappedGp); +if ((frameShiftsFh != NULL) && (mappedGp->cdsStart < mappedGp->cdsEnd)) + reportFrameShifts(mappedGp, frameShiftsFh); if (genePredCheck("mappedGenePred", stderr, -1, mappedGp)) errAbort("invalid genePred created"); genePredTabOut(mappedGp, genePredOutFh); genePredFree(&mappedGp); } static void transMapPslToGenePred(char* srcGenePredFile, char* mappedPslFile, - char* mappedGenePredFile) + char* mappedGenePredFile, char* frameShiftsTsv) /* convert mapped psl to genePreds */ { struct hash* srcGenePredMap = srcGenePredLoad(srcGenePredFile); struct lineFile *pslFh = pslFileOpen(mappedPslFile); FILE* genePredOutFh = mustOpen(mappedGenePredFile, "w"); +FILE* frameShiftsFh = NULL; +if (frameShiftsTsv != NULL) + { + frameShiftsFh = mustOpen(frameShiftsTsv, "w"); + fprintf(frameShiftsFh, "name\tchrom\tchromStart\tchromEnd\tstrand\tshift\tlocation\n"); + } struct psl *mappedPsl; while ((mappedPsl = pslNext(pslFh)) != NULL) { - convertPsl(srcGenePredMap, mappedPsl, genePredOutFh); + convertPsl(srcGenePredMap, mappedPsl, genePredOutFh, frameShiftsFh); pslFree(&mappedPsl); } +carefulClose(&frameShiftsFh); carefulClose(&genePredOutFh); lineFileClose(&pslFh); hashFreeWithVals(&srcGenePredMap, genePredFree); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); if (argc != 4) usage(); nonCodingGapFillMax = optionInt("nonCodingGapFillMax", nonCodingGapFillMax); codingGapFillMax = optionInt("codingGapFillMax", codingGapFillMax); codingGapFillMax = (codingGapFillMax/3)*3; // round noBlockMerge = optionExists("noBlockMerge"); char *srcGenePredFile = argv[1]; char *mappedPslFile = argv[2]; char *mappedGenePredFile = argv[3]; -transMapPslToGenePred(srcGenePredFile, mappedPslFile, mappedGenePredFile); +transMapPslToGenePred(srcGenePredFile, mappedPslFile, mappedGenePredFile, + optionVal("frameShifts", NULL)); return 0; }