b642fe3045bbc0df86e56957d61d7851fc5a127c hiram Tue Aug 30 14:10:06 2016 -0700 add qSizes argument so the qSizes can be adjusted to their true RNA size refs #13673 diff --git src/hg/genePredToFakePsl/genePredToFakePsl.c src/hg/genePredToFakePsl/genePredToFakePsl.c index 346f50e..8b38107 100644 --- src/hg/genePredToFakePsl/genePredToFakePsl.c +++ src/hg/genePredToFakePsl/genePredToFakePsl.c @@ -1,58 +1,63 @@ /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "options.h" #include "portable.h" #include "hash.h" #include "hdb.h" #include "genePred.h" #include "genePredReader.h" #include "psl.h" /* Command line switches. */ -char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ +static char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ +static char *qSizes = NULL; /* read query sizes from file */ +static struct hash *qSizeHash = NULL; /* key is name, value is size */ /* Command line option specifications */ static struct optionSpec optionSpecs[] = { {"chromSize", OPTION_STRING}, + {"qSizes", OPTION_STRING}, {NULL, 0} }; void usage() /* Explain usage and exit. */ { errAbort( "genePredToFakePsl - Create a psl of fake-mRNA aligned to gene-preds from a file or table.\n" "usage:\n" " genePredToFakePsl [options] db fileTbl pslOut cdsOut\n" "\n" "If fileTbl is an existing file, then it is used.\n" "Otherwise, the table by this name is used.\n" "\n" "pslOut specifies the fake-mRNA output psl filename.\n" "\n" "cdsOut specifies the output cds tab-separated file which contains\n" "genbank-style CDS records showing cdsStart..cdsEnd\n" "e.g. NM_123456 34..305\n" "options:\n" " -chromSize=sizefile\tRead chrom sizes from file instead of database\n" " sizefile contains two white space separated fields per line:\n" " chrom name and size\n" + " -qSizes=qSizesFile\tRead in query sizes to fixup qSize and qStarts\n" + "\n"); } static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh) /* determine CDS and output */ { /* * Warning: Genbank CDS does't have the ability to represent * partial codons. If we have genePreds created from GFF/GTF, they can have * partial codons, which is indicated in frame. This code does not correctly handle * this case, or frame shifting indels. */ int e, off = 0; int qCdsStart = -1, qCdsEnd = -1; int eCdsStart, eCdsEnd; @@ -68,40 +73,53 @@ off += gp->exonEnds[e] - gp->exonStarts[e]; } if (gp->strand[0] == '-') reverseIntRange(&qCdsStart, &qCdsEnd, qSize); fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */ } static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh) /* convert a genePred to a psl and CDS */ { int chromSize = hashIntValDefault(chromHash, gp->chrom, 0); if (chromSize == 0) errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom); int e = 0, qSize=0; + for (e = 0; e < gp->exonCount; ++e) qSize+=(gp->exonEnds[e] - gp->exonStarts[e]); -struct psl *psl = pslNew(gp->name, qSize, 0, qSize, + +int sizeAdjust = 0; +if (qSizes != NULL) + { + int realSize = hashIntValDefault(qSizeHash, gp->name, 0); + if (realSize > 0) + sizeAdjust = realSize - qSize; + } + +struct psl *psl = pslNew(gp->name, qSize+sizeAdjust, 0, qSize, gp->chrom, chromSize, gp->txStart, gp->txEnd, gp->strand, gp->exonCount, 0); +/* no size adjustment to qStarts for positive strand */ +if (gp->strand[0] == '+') + sizeAdjust = 0; psl->blockCount = gp->exonCount; for (e = 0; e < gp->exonCount; ++e) { psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]); - psl->qStarts[e] = e==0 ? 0 : psl->qStarts[e-1] + psl->blockSizes[e-1]; + psl->qStarts[e] = e==0 ? 0 + sizeAdjust : psl->qStarts[e-1] + psl->blockSizes[e-1]; psl->tStarts[e] = gp->exonStarts[e]; } psl->match = qSize; psl->tNumInsert = psl->blockCount-1; psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize; pslTabOut(psl, pslFh); pslFree(&psl); if (gp->cdsStart < gp->cdsEnd) cnvGenePredCds(gp, qSize, cdsFh); } static struct hash *getChromHash(char *db) /* Return a hash of chrom names and sizes, from either -chromSize=file or db */ { struct hash *chromHash = NULL; @@ -135,20 +153,23 @@ while ((gp = genePredReaderNext(gpr)) != NULL) { cnvGenePred(chromHash, gp, pslFh, cdsFh); } genePredReaderFree(&gpr); carefulClose(&pslFh); carefulClose(&cdsFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); chromSizes = optionVal("chromSize", NULL); +qSizes = optionVal("qSizes", NULL); if (argc != 5) usage(); +if (qSizes != NULL) + qSizeHash = hChromSizeHashFromFile(qSizes); fakePslFromGenePred(argv[1],argv[2],argv[3],argv[4]); return 0; }