bc3bbb2794e3067f0ff35ac2003fd69362e2ed58 angie Thu Mar 8 11:34:17 2018 -0800 Libified genePredToFakePsl's code for extracting PSL and CDS from genePred so I can use it to infer PSL+CDS from genePred tracks like Gencode for HGVS. refs #21076 diff --git src/hg/genePredToFakePsl/genePredToFakePsl.c src/hg/genePredToFakePsl/genePredToFakePsl.c index a4e37b1..9fc57a2 100644 --- src/hg/genePredToFakePsl/genePredToFakePsl.c +++ src/hg/genePredToFakePsl/genePredToFakePsl.c @@ -1,24 +1,25 @@ /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "options.h" #include "portable.h" #include "hash.h" #include "hdb.h" +#include "genbank.h" #include "genePred.h" #include "genePredReader.h" #include "psl.h" /* Command line switches. */ static char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ static char *qSizes = NULL; /* read query sizes from file */ static struct hash *qSizeHash = NULL; /* key is name, value is size */ /* Command line option specifications */ static struct optionSpec optionSpecs[] = { {"chromSize", OPTION_STRING}, {"qSizes", OPTION_STRING}, {NULL, 0} @@ -40,105 +41,45 @@ "cdsOut specifies the output cds tab-separated file which contains\n" "genbank-style CDS records showing cdsStart..cdsEnd\n" "e.g. NM_123456 34..305\n" "options:\n" " -chromSize=sizefile\tRead chrom sizes from file instead of database\n" " sizefile contains two white space separated fields per line:\n" " chrom name and size\n" " -qSizes=qSizesFile\tRead in query sizes to fixup qSize and qStarts\n" "\n"); } static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh) /* determine CDS and output */ { -/* - * Warning: Genbank CDS does't have the ability to represent - * partial codons. If we have genePreds created from GFF/GTF, they can have - * partial codons, which is indicated in frame. This code does not correctly handle - * this case, or frame shifting indels. - */ -int e, off = 0; -int qCdsStart = -1, qCdsEnd = -1; -int eCdsStart, eCdsEnd; - -for (e = 0; e < gp->exonCount; ++e) - { - if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd)) - { - if (qCdsStart < 0) - qCdsStart = off + (eCdsStart - gp->exonStarts[e]); - qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]); - } - off += gp->exonEnds[e] - gp->exonStarts[e]; - } -if (gp->strand[0] == '-') - reverseIntRange(&qCdsStart, &qCdsEnd, qSize); -fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */ +struct genbankCds cds; +genePredToCds(gp, &cds); +fprintf(cdsFh,"%s\t%d..%d\n", gp->name, cds.start+1, cds.end); /* genbank cds is closed 1-based */ } static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh) /* convert a genePred to a psl and CDS */ { int chromSize = hashIntValDefault(chromHash, gp->chrom, 0); if (chromSize == 0) errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom); -int e = 0, qSize=0; - - -for (e = 0; e < gp->exonCount; ++e) - qSize+=(gp->exonEnds[e] - gp->exonStarts[e]); - -int realSize = 0; -int sizeAdjust = 0; +int qSize = 0; if (qSizes != NULL) - { - realSize = hashIntValDefault(qSizeHash, gp->name, 0); - // If there is a realSize (>0), but realSize is less than qSize, this subtraction would - // cause unsigned underflow so don't do it. qSize > realSize implies that one of the genePred - // "exons" is glossing over a query gap. - if (realSize > qSize) - sizeAdjust = realSize - qSize; - } - -struct psl *psl = pslNew(gp->name, realSize ? realSize : qSize, 0, qSize, - gp->chrom, chromSize, gp->txStart, gp->txEnd, - gp->strand, gp->exonCount, 0); -/* no size adjustment to qStarts for positive strand */ -if (gp->strand[0] == '+') - sizeAdjust = 0; -int i = -1; -for (e = 0; e < gp->exonCount; ++e) - { - if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1]) - { - i++; - psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]); - psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1]; - psl->tStarts[i] = gp->exonStarts[e]; - } - else - { - // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure - psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]); - } - } -psl->blockCount = i+1; -psl->match = qSize; -psl->tNumInsert = psl->blockCount-1; -psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize; + qSize = hashIntValDefault(qSizeHash, gp->name, 0); +struct psl *psl = genePredToPsl(gp, chromSize, qSize); pslTabOut(psl, pslFh); pslFree(&psl); if (gp->cdsStart < gp->cdsEnd) cnvGenePredCds(gp, qSize, cdsFh); } static struct hash *getChromHash(char *db) /* Return a hash of chrom names and sizes, from either -chromSize=file or db */ { struct hash *chromHash = NULL; if (chromSizes != NULL) chromHash = hChromSizeHashFromFile(chromSizes); else chromHash = hChromSizeHash(db); return chromHash;