f7792e45d8c710b375ed418f55f759a13c545421 markd Thu Jun 1 23:11:14 2017 -0700 Moved guts of genePredToProt to library to allow Brian to use it in CGIs. Removed unnecessary PSL creation from genePredToProt. diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 26724d2..98fec64 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -2243,15 +2243,170 @@ gp->exonStarts[ii] += gp->txStart; gp->exonEnds[ii] += gp->exonStarts[ii]; } gp->name2 = cloneString(row[12]); gp->cdsStartStat = parseCdsStat(row[13]); gp->cdsEndStat = parseCdsStat(row[14]); gp->optFields |= genePredCdsStatFld; sqlSignedDynamicArray(row[15], &gp->exonFrames, &numBlocks); assert (numBlocks == gp->exonCount); gp->optFields |= genePredExonFramesFld; gp->type = cloneString(row[16]); gp->geneName = cloneString(row[17]); gp->geneName2 = cloneString(row[18]); return gp; } + +struct cds +/* CDS sequence being assembled */ +{ + char *bases; // CDS string being built + int iCds; // Index into CDS + int nextFrame; // next required frame +}; + +static struct dnaSeq* getGeneRegion(struct nibTwoCache* genomeSeqs, struct genePred *gp, int start, int end, + int *chromSize) +/* Get the genome sequence covering the a region of a gene, in transcription order */ +{ +struct dnaSeq *dna = nibTwoCacheSeqPart(genomeSeqs, gp->chrom, start, end-start, chromSize); +if (gp->strand[0] == '-') + reverseComplement(dna->dna, dna->size); +return dna; +} + +static void removePartialCodon(struct cds* cds) +/* remove partial codon that has already been copied to CDS */ +{ +int iCdsNew = cds->iCds - cds->nextFrame; +if (iCdsNew < 0) + iCdsNew = 0; +zeroBytes(cds->bases+iCdsNew, (cds->iCds - iCdsNew)); +cds->iCds = iCdsNew; +cds->nextFrame = 0; +} + +static void addCdsExonBases(struct nibTwoCache* genomeSeqs, struct genePred *genePred, + int exonCdsStart, int exonCdsEnd, struct cds* cds, int *chromSize) +{ +struct dnaSeq *dna = getGeneRegion(genomeSeqs, genePred, exonCdsStart, exonCdsEnd, chromSize); +int iDna = 0; +int iCdsStart = cds->iCds; +while (iDna < dna->size) + cds->bases[cds->iCds++] = dna->dna[iDna++]; +cds->nextFrame = ((cds->nextFrame) + (cds->iCds - iCdsStart)) % 3; +dnaSeqFree(&dna); +} + +static void addCdsExon(struct nibTwoCache* genomeSeqs, struct genePred *gp, + int exonCdsStart, int exonCdsEnd, int frame, + struct cds* cds) +/* get CDS from one exon */ +{ +// adjust for frame shift, dropping partial codon +if (frame != cds->nextFrame) + { + removePartialCodon(cds); + if (gp->strand[0] == '+') + exonCdsStart += frame; + else + exonCdsEnd -= frame; + } +if (exonCdsStart < exonCdsEnd) + { + int chromSize = 0; + addCdsExonBases(genomeSeqs, gp, exonCdsStart, exonCdsEnd, cds, &chromSize); + } +} + +static char* getCdsCodons(struct genePred *gp, struct nibTwoCache* genomeSeqs) +/* get the CDS sequence, dropping partial codons */ +{ +struct cds cds; +cds.bases = needMem(genePredCdsSize(gp) + 1); +cds.iCds = 0; +cds.nextFrame = 0; + +// walk in direction of transcription +int dir = (gp->strand[0] == '+') ? 1 : -1; +int iExon = (dir > 0) ? 0 : gp->exonCount-1; +int iStop = (dir > 0) ? gp->exonCount : -1; +int exonCdsStart, exonCdsEnd; +for (; iExon != iStop; iExon += dir) + { + if (genePredCdsExon(gp, iExon, &exonCdsStart, &exonCdsEnd)) + addCdsExon(genomeSeqs, gp, exonCdsStart, exonCdsEnd, gp->exonFrames[iExon], &cds); + } +if (cds.nextFrame != 0) + removePartialCodon(&cds); +assert((strlen(cds.bases) % 3) == 0); // ;-) +return cds.bases; +} + +static char translateCodon(boolean isChrM, char* codon, bool lastCodon, unsigned options) +/* translate the first three bases starting at codon, handling weird + * biology as requested giving */ +{ +char aa = isChrM ? lookupMitoCodon(codon) : lookupCodon(codon); +if (aa == '\0') + { + // stop, contains `N' or selenocysteine + boolean isStopOrSelno = isStopCodon(codon); + boolean isRealStop = isReallyStopCodon(codon, !lastCodon); // internal could be selenocysteine + if (lastCodon) + { + if ((options & GENEPRED_TRANSLATE_INCLUDE_STOP) != 0) + aa = '*'; + else if (!isRealStop) + aa = 'X'; + // others, \0' will terminate + } + else if (((options & GENEPRED_TRANSLATE_SELENO) != 0) + && isStopOrSelno && !isRealStop) + aa = 'U'; + else if (isRealStop && ((options & GENEPRED_TRANSLATE_STAR_INFRAME_STOPS) != 0)) + aa = '*'; + else + aa = 'X'; + } +return aa; +} + +static char* translateCds(char* chrom, char* cds, unsigned options) +/* translate the CDS */ +{ +int cdsLen = strlen(cds); +char *prot = needMem((cdsLen/3)+1); +boolean isChrM = sameString(chrom, "chrM"); +int iCds, iProt; +for (iCds = 0, iProt = 0; iCds < cdsLen; iCds+=3, iProt++) + prot[iProt] = translateCodon(isChrM, cds+iCds, (iCds == cdsLen-3), options); +return prot; +} + +void genePredTranslate(struct genePred *gp, struct nibTwoCache* genomeSeqs, unsigned options, + char **protRet, char **cdsRet) +/* Translate a genePred into a protein. It can also return the CDS part of the + * mRNA sequence. If the chrom is chrM, the mitochondrial translation tables are + * used. If protRet or cdsRet is NULL, those sequences are not returned. + */ +{ +// note: code tests by genePredToProt +bool haveFrames = (gp->exonFrames != NULL); +if (!haveFrames) + genePredAddExonFrames(gp); // assume correct frame if not included + +char* cds = getCdsCodons(gp, genomeSeqs); +char *prot = translateCds(gp->chrom, cds, options); + +if (protRet != NULL) + *protRet = prot; +else + freeMem(prot); +if (cdsRet != NULL) + *cdsRet = cds; +else + freeMem(cds); + +if (!haveFrames) + freez(&gp->exonFrames); +}