bc3bbb2794e3067f0ff35ac2003fd69362e2ed58 angie Thu Mar 8 11:34:17 2018 -0800 Libified genePredToFakePsl's code for extracting PSL and CDS from genePred so I can use it to infer PSL+CDS from genePred tracks like Gencode for HGVS. refs #21076 diff --git src/hg/lib/genePred.c src/hg/lib/genePred.c index 98fec64..c66f33b 100644 --- src/hg/lib/genePred.c +++ src/hg/lib/genePred.c @@ -2398,15 +2398,89 @@ char* cds = getCdsCodons(gp, genomeSeqs); char *prot = translateCds(gp->chrom, cds, options); if (protRet != NULL) *protRet = prot; else freeMem(prot); if (cdsRet != NULL) *cdsRet = cds; else freeMem(cds); if (!haveFrames) freez(&gp->exonFrames); } + +void genePredToCds(struct genePred *gp, struct genbankCds *cds) +/* Fill in cds with transcript offsets computed from genePred. */ +{ +/* + * Warning: Genbank CDS does't have the ability to represent + * partial codons. If we have genePreds created from GFF/GTF, they can have + * partial codons, which is indicated in frame. This code does not correctly handle + * this case, or frame shifting indels. + */ +cds->start = cds->end = -1; +cds->startComplete = cds->endComplete = cds->complement = FALSE; +if (gp->cdsEnd > gp->cdsStart) + { + int e, off = 0; + int qCdsStart = -1, qCdsEnd = -1; + for (e = 0; e < gp->exonCount; ++e) + { + int eCdsStart, eCdsEnd; + if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd)) + { + if (qCdsStart < 0) + qCdsStart = off + (eCdsStart - gp->exonStarts[e]); + qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]); + } + off += gp->exonEnds[e] - gp->exonStarts[e]; + } + int qSize = off; + if (gp->strand[0] == '-') + reverseIntRange(&qCdsStart, &qCdsEnd, qSize); + cds->start = qCdsStart; + cds->end = qCdsEnd; + cds->startComplete = (gp->cdsStartStat != cdsIncomplete); + cds->endComplete = (gp->cdsEndStat != cdsIncomplete); + } +} + +struct psl *genePredToPsl(struct genePred *gp, int chromSize, int qSize) +/* Convert a genePred to psl, assuming perfect concordance between target & query. + * If qSize is 0 then the number of aligned bases will be used as qSize. */ +{ +int e = 0, aliSize = 0; +for (e = 0; e < gp->exonCount; ++e) + aliSize += (gp->exonEnds[e] - gp->exonStarts[e]); +struct psl *psl = pslNew(gp->name, qSize ? qSize : aliSize, 0, aliSize, + gp->chrom, chromSize, gp->txStart, gp->txEnd, + gp->strand, gp->exonCount, 0); +// If qSize is greater than aliSize then we assume the extra bases are at the end of +// the transcript (poly-A tail). If the alignment is on the '-' strand, then we need +// to offset the reversed qStarts by the number of extra bases. +int sizeAdjust = (gp->strand[0] == '-' && qSize > aliSize) ? (qSize - aliSize) : 0; +int i = -1; +for (e = 0; e < gp->exonCount; ++e) + { + if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1]) + { + i++; + psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]); + psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1]; +//#*** TODO: use exonFrame to detect ribosomal frameshift that makes us skip a base on t + psl->tStarts[i] = gp->exonStarts[e]; + } + else + { + // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure + psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]); + } + } +psl->blockCount = i+1; +psl->match = aliSize; +psl->tNumInsert = psl->blockCount-1; +psl->tBaseInsert = (gp->txEnd - gp->txStart) - aliSize; +return psl; +}