0c0aa501623b294e8872355063667528ac809088 braney Mon Jul 23 13:48:23 2012 -0700 add some code to pslToBed to allow CDS to be back filled after a lift of the PSL file. This is for lifting genePred's between species. (#7045) diff --git src/hg/pslToBed/pslToBed.c src/hg/pslToBed/pslToBed.c index 31c0cf4..a7af53c 100644 --- src/hg/pslToBed/pslToBed.c +++ src/hg/pslToBed/pslToBed.c @@ -1,49 +1,133 @@ /* pslToBed -- tranform a psl format file to a bed format file */ #include "common.h" #include "psl.h" #include "bed.h" #include "options.h" -static struct optionSpec optionSpecs[] = -/* option for pslToBed */ -{ - {NULL, 0} -}; void usage() /* print usage infomation and exit */ { errAbort("pslToBed: tranform a psl format file to a bed format file.\n" "usage:\n" - " pslToBed psl bed\n"); + " pslToBed psl bed\n" + "options:\n" + " -cds=cdsFile\n" + "cdsFile specifies a input cds tab-separated file which contains\n" + "genbank-style CDS records showing cdsStart..cdsEnd\n" + "e.g. NM_123456 34..305\n" + "These coordinates are assumed to be in the query coordinate system\n" + "of the psl, like those that are created from genePredToFakePsl\n"); + +} + +static struct optionSpec options[] = { + {"cds", OPTION_STRING}, + {NULL, 0}, +}; + +struct cds +{ +int start, end; // cds start and end +}; + +static void setThick(struct psl *psl, struct bed *bed, struct cds *cds) +// set thickStart and thickEnd based on CDS record +{ +int ii; +int thickStart, thickEnd; + +thickStart = thickEnd = 0; +for(ii=0; ii < psl->blockCount; ii++) + { + if (psl->qStarts[ii] + psl->blockSizes[ii] > cds->start - 1) + { + int offset = cds->start - psl->qStarts[ii]; + if (offset < 0) + offset = 0; + thickEnd = thickStart = psl->tStarts[ii] + offset - 1; + break; + } + } + +for(; ii < psl->blockCount; ii++) + { + if (psl->qStarts[ii] + psl->blockSizes[ii] >= cds->end) + { + int offset = cds->end - psl->qStarts[ii]; + if (offset < 0) + offset = 0; + thickEnd = psl->tStarts[ii] + offset; + break; + } + } +if (ii >= psl->blockCount) + { + thickEnd = psl->tStarts[ii - 1] + psl->blockSizes[ii - 1]; + } +bed->thickStart = thickStart; +bed->thickEnd = thickEnd; } -void pslToBed(char *pslFile, char *bedFile) +void pslToBed(char *pslFile, char *bedFile, struct hash *cdsHash) /* pslToBed -- tranform a psl format file to a bed format file */ { struct lineFile *pslLf = pslFileOpen(pslFile); FILE *bedFh = mustOpen(bedFile, "w"); struct psl *psl; while ((psl = pslNext(pslLf)) != NULL) { struct bed *bed = bedFromPsl(psl); + if (cdsHash) + { + struct cds *cds = hashFindVal(cdsHash, psl->qName); + if (cds != NULL) + setThick(psl, bed, cds); + } bedTabOutN(bed, 12, bedFh); bedFree(&bed); pslFree(&psl); } carefulClose(&bedFh); lineFileClose(&pslLf); } +struct hash *getCdsHash(char *cdsFile) +/* read CDS start and end for list of id's */ +{ +struct lineFile *lf = lineFileOpen(cdsFile, TRUE); +char *row[2]; +struct hash *hash = newHash(0); +while (lineFileRow(lf, row)) + { + // lines are of the form of <start>..<end> + struct cds *cds; + AllocVar(cds); + cds->start = atoi(row[1]); + char *ptr = strchr(row[1], '.'); // point to first '.' + ptr+=2; // step past two '.'s + cds->end = atoi(ptr); + hashAdd(hash, row[0], cds); + } +lineFileClose(&lf); +return hash; +} + int main(int argc, char* argv[]) { -optionInit(&argc, argv, optionSpecs); +optionInit(&argc, argv, options); if (argc != 3) usage(); -pslToBed(argv[1], argv[2]); +char *cdsFile = optionVal("cds", NULL); +struct hash *cdsHash = NULL; + +if (cdsFile != NULL) + cdsHash = getCdsHash(cdsFile); + +pslToBed(argv[1], argv[2], cdsHash); return 0; }