61917474c6c267cd21f007c1f2d5b5f243ee7d5f angie Mon Jun 19 09:25:58 2017 -0700 Prevent unsigned int underflow when sum of blockSizes is greater than actual qSize. Collapse contiguous 'exons' that are split to represent internal frameshift (e.g. ribosomal frameshift) in genePredExt but need to be a single block to avoid pslCheck errors downstream. This prevents a few PSLs from being lost from ncbiRefSeqPsl. diff --git src/hg/genePredToFakePsl/genePredToFakePsl.c src/hg/genePredToFakePsl/genePredToFakePsl.c index 8b38107..a4e37b1 100644 --- src/hg/genePredToFakePsl/genePredToFakePsl.c +++ src/hg/genePredToFakePsl/genePredToFakePsl.c @@ -1,175 +1,189 @@ /* genePredToFakePsl - create fake .psl of mRNA aligned to dna from genePred file or table. */ /* Copyright (C) 2013 The Regents of the University of California * See README in this or parent directory for licensing information. */ #include "common.h" #include "options.h" #include "portable.h" #include "hash.h" #include "hdb.h" #include "genePred.h" #include "genePredReader.h" #include "psl.h" /* Command line switches. */ static char *chromSizes = NULL; /* read chrom sizes from file instead of database . */ static char *qSizes = NULL; /* read query sizes from file */ static struct hash *qSizeHash = NULL; /* key is name, value is size */ /* Command line option specifications */ static struct optionSpec optionSpecs[] = { {"chromSize", OPTION_STRING}, {"qSizes", OPTION_STRING}, {NULL, 0} }; void usage() /* Explain usage and exit. */ { errAbort( "genePredToFakePsl - Create a psl of fake-mRNA aligned to gene-preds from a file or table.\n" "usage:\n" " genePredToFakePsl [options] db fileTbl pslOut cdsOut\n" "\n" "If fileTbl is an existing file, then it is used.\n" "Otherwise, the table by this name is used.\n" "\n" "pslOut specifies the fake-mRNA output psl filename.\n" "\n" "cdsOut specifies the output cds tab-separated file which contains\n" "genbank-style CDS records showing cdsStart..cdsEnd\n" "e.g. NM_123456 34..305\n" "options:\n" " -chromSize=sizefile\tRead chrom sizes from file instead of database\n" " sizefile contains two white space separated fields per line:\n" " chrom name and size\n" " -qSizes=qSizesFile\tRead in query sizes to fixup qSize and qStarts\n" "\n"); } static void cnvGenePredCds(struct genePred *gp, int qSize, FILE *cdsFh) /* determine CDS and output */ { /* * Warning: Genbank CDS does't have the ability to represent * partial codons. If we have genePreds created from GFF/GTF, they can have * partial codons, which is indicated in frame. This code does not correctly handle * this case, or frame shifting indels. */ int e, off = 0; int qCdsStart = -1, qCdsEnd = -1; int eCdsStart, eCdsEnd; for (e = 0; e < gp->exonCount; ++e) { if (genePredCdsExon(gp, e, &eCdsStart, &eCdsEnd)) { if (qCdsStart < 0) qCdsStart = off + (eCdsStart - gp->exonStarts[e]); qCdsEnd = off + (eCdsEnd - gp->exonStarts[e]); } off += gp->exonEnds[e] - gp->exonStarts[e]; } if (gp->strand[0] == '-') reverseIntRange(&qCdsStart, &qCdsEnd, qSize); fprintf(cdsFh,"%s\t%d..%d\n", gp->name, qCdsStart+1, qCdsEnd); /* genbank cds is closed 1-based */ } static void cnvGenePred(struct hash *chromHash, struct genePred *gp, FILE *pslFh, FILE *cdsFh) /* convert a genePred to a psl and CDS */ { int chromSize = hashIntValDefault(chromHash, gp->chrom, 0); if (chromSize == 0) errAbort("Couldn't find chromosome/scaffold '%s' in chromInfo", gp->chrom); int e = 0, qSize=0; for (e = 0; e < gp->exonCount; ++e) qSize+=(gp->exonEnds[e] - gp->exonStarts[e]); +int realSize = 0; int sizeAdjust = 0; if (qSizes != NULL) { - int realSize = hashIntValDefault(qSizeHash, gp->name, 0); - if (realSize > 0) + realSize = hashIntValDefault(qSizeHash, gp->name, 0); + // If there is a realSize (>0), but realSize is less than qSize, this subtraction would + // cause unsigned underflow so don't do it. qSize > realSize implies that one of the genePred + // "exons" is glossing over a query gap. + if (realSize > qSize) sizeAdjust = realSize - qSize; } -struct psl *psl = pslNew(gp->name, qSize+sizeAdjust, 0, qSize, +struct psl *psl = pslNew(gp->name, realSize ? realSize : qSize, 0, qSize, gp->chrom, chromSize, gp->txStart, gp->txEnd, gp->strand, gp->exonCount, 0); /* no size adjustment to qStarts for positive strand */ if (gp->strand[0] == '+') sizeAdjust = 0; -psl->blockCount = gp->exonCount; +int i = -1; for (e = 0; e < gp->exonCount; ++e) { - psl->blockSizes[e] = (gp->exonEnds[e] - gp->exonStarts[e]); - psl->qStarts[e] = e==0 ? 0 + sizeAdjust : psl->qStarts[e-1] + psl->blockSizes[e-1]; - psl->tStarts[e] = gp->exonStarts[e]; + if (e == 0 || gp->exonStarts[e] != gp->exonEnds[e-1]) + { + i++; + psl->blockSizes[i] = (gp->exonEnds[e] - gp->exonStarts[e]); + psl->qStarts[i] = i==0 ? 0 + sizeAdjust : psl->qStarts[i-1] + psl->blockSizes[i-1]; + psl->tStarts[i] = gp->exonStarts[e]; + } + else + { + // Merge "exons" that have a 0-length gap between them to avoid pslCheck failure + psl->blockSizes[i] += (gp->exonEnds[e] - gp->exonStarts[e]); + } } +psl->blockCount = i+1; psl->match = qSize; psl->tNumInsert = psl->blockCount-1; psl->tBaseInsert = (gp->txEnd - gp->txStart) - qSize; pslTabOut(psl, pslFh); pslFree(&psl); if (gp->cdsStart < gp->cdsEnd) cnvGenePredCds(gp, qSize, cdsFh); } static struct hash *getChromHash(char *db) /* Return a hash of chrom names and sizes, from either -chromSize=file or db */ { struct hash *chromHash = NULL; if (chromSizes != NULL) chromHash = hChromSizeHashFromFile(chromSizes); else chromHash = hChromSizeHash(db); return chromHash; } static void fakePslFromGenePred(char *db, char *fileTbl, char *pslOut, char *cdsOut) /* check a genePred */ { struct genePredReader *gpr; struct genePred *gp; FILE *pslFh = mustOpen(pslOut, "w"); FILE *cdsFh = mustOpen(cdsOut, "w"); struct hash *chromHash = getChromHash(db); if (fileExists(fileTbl)) { gpr = genePredReaderFile(fileTbl, NULL); } else { struct sqlConnection *conn = hAllocConn(db); gpr = genePredReaderQuery(conn, fileTbl, NULL); hFreeConn(&conn); } while ((gp = genePredReaderNext(gpr)) != NULL) { cnvGenePred(chromHash, gp, pslFh, cdsFh); } genePredReaderFree(&gpr); carefulClose(&pslFh); carefulClose(&cdsFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, optionSpecs); chromSizes = optionVal("chromSize", NULL); qSizes = optionVal("qSizes", NULL); if (argc != 5) usage(); if (qSizes != NULL) qSizeHash = hChromSizeHashFromFile(qSizes); fakePslFromGenePred(argv[1],argv[2],argv[3],argv[4]); return 0; }