d8352b0a8c16641c5af4f33112b4f241af931771 braney Wed Jul 25 16:00:15 2012 -0700 add some tests, make the code easier to understand, and fix a problem with no CDS being indicated by thickStart == chromStart && thickEnd == chromEnd, instead of thickStart==thickEnd==0 as should have been the case. Thanks Tim for the code review!!! diff --git src/hg/pslToBed/pslToBed.c src/hg/pslToBed/pslToBed.c index a7af53c..0e25ea1 100644 --- src/hg/pslToBed/pslToBed.c +++ src/hg/pslToBed/pslToBed.c @@ -1,133 +1,145 @@ /* pslToBed -- tranform a psl format file to a bed format file */ #include "common.h" #include "psl.h" #include "bed.h" #include "options.h" void usage() /* print usage infomation and exit */ { errAbort("pslToBed: tranform a psl format file to a bed format file.\n" "usage:\n" " pslToBed psl bed\n" "options:\n" " -cds=cdsFile\n" "cdsFile specifies a input cds tab-separated file which contains\n" "genbank-style CDS records showing cdsStart..cdsEnd\n" "e.g. NM_123456 34..305\n" "These coordinates are assumed to be in the query coordinate system\n" "of the psl, like those that are created from genePredToFakePsl\n"); } static struct optionSpec options[] = { {"cds", OPTION_STRING}, {NULL, 0}, }; struct cds { int start, end; // cds start and end }; -static void setThick(struct psl *psl, struct bed *bed, struct cds *cds) -// set thickStart and thickEnd based on CDS record +static unsigned getTargetForQuery(struct psl *psl, int queryAddress) +// get target address for query address from PSL { -int ii; -int thickStart, thickEnd; +int blockNum; -thickStart = thickEnd = 0; -for(ii=0; ii < psl->blockCount; ii++) +for (blockNum=0; blockNum < psl->blockCount; blockNum++) { - if (psl->qStarts[ii] + psl->blockSizes[ii] > cds->start - 1) + if (psl->qStarts[blockNum] > queryAddress) { - int offset = cds->start - psl->qStarts[ii]; - if (offset < 0) - offset = 0; - thickEnd = thickStart = psl->tStarts[ii] + offset - 1; - break; - } + // this block starts past the queryAddress + // just point to the beginning of this block + return psl->tStarts[blockNum]; } - -for(; ii < psl->blockCount; ii++) - { - if (psl->qStarts[ii] + psl->blockSizes[ii] >= cds->end) + else if (psl->qStarts[blockNum] + psl->blockSizes[blockNum] > queryAddress) { - int offset = cds->end - psl->qStarts[ii]; - if (offset < 0) - offset = 0; - thickEnd = psl->tStarts[ii] + offset; - break; + // since block addresses are always increasing we know if the end + // of the block is beyond our address that the address is + // in this block + unsigned offsetInBlock = queryAddress - psl->qStarts[blockNum]; + + assert(offsetInBlock < psl->blockSizes[blockNum]); + + return psl->tStarts[blockNum] + offsetInBlock; } } -if (ii >= psl->blockCount) - { - thickEnd = psl->tStarts[ii - 1] + psl->blockSizes[ii - 1]; + +// we don't have any blocks with this query in it, just point +// to the end +return psl->tEnd; } + +static void setThick(struct psl *psl, struct bed *bed, struct cds *cds) +// set thickStart and thickEnd based on CDS record +{ +int thickStart, thickEnd; + +// we subtract one from start to convert to PSL coordinate system +thickStart = getTargetForQuery(psl, cds->start - 1); +thickEnd = getTargetForQuery(psl, cds->end); + +// if thickStart equals thickEnd, then there is no CDS +if (thickStart == thickEnd) + thickStart = thickEnd = 0; + bed->thickStart = thickStart; bed->thickEnd = thickEnd; } void pslToBed(char *pslFile, char *bedFile, struct hash *cdsHash) /* pslToBed -- tranform a psl format file to a bed format file */ { struct lineFile *pslLf = pslFileOpen(pslFile); FILE *bedFh = mustOpen(bedFile, "w"); struct psl *psl; while ((psl = pslNext(pslLf)) != NULL) { struct bed *bed = bedFromPsl(psl); if (cdsHash) { struct cds *cds = hashFindVal(cdsHash, psl->qName); - if (cds != NULL) + if (cds == NULL) + bed->thickStart = bed->thickEnd = 0; + else setThick(psl, bed, cds); } bedTabOutN(bed, 12, bedFh); bedFree(&bed); pslFree(&psl); } carefulClose(&bedFh); lineFileClose(&pslLf); } struct hash *getCdsHash(char *cdsFile) /* read CDS start and end for list of id's */ { struct lineFile *lf = lineFileOpen(cdsFile, TRUE); char *row[2]; struct hash *hash = newHash(0); while (lineFileRow(lf, row)) { // lines are of the form of .. struct cds *cds; AllocVar(cds); cds->start = atoi(row[1]); char *ptr = strchr(row[1], '.'); // point to first '.' ptr+=2; // step past two '.'s cds->end = atoi(ptr); hashAdd(hash, row[0], cds); } lineFileClose(&lf); return hash; } int main(int argc, char* argv[]) { optionInit(&argc, argv, options); if (argc != 3) usage(); char *cdsFile = optionVal("cds", NULL); struct hash *cdsHash = NULL; if (cdsFile != NULL) cdsHash = getCdsHash(cdsFile); pslToBed(argv[1], argv[2], cdsHash); return 0; }