5057da35c701d94acdf094a57613921b87058f62 markd Sat Mar 10 08:46:20 2012 -0800 added option to include coordinates in fasta header diff --git src/hg/getRnaPred/getRnaPred.c src/hg/getRnaPred/getRnaPred.c index c450950..ab7413d 100644 --- src/hg/getRnaPred/getRnaPred.c +++ src/hg/getRnaPred/getRnaPred.c @@ -29,62 +29,66 @@ " acc start end\n" " where start..end are genbank style, one-based coordinates\n" " -keepMasking - un/masked in upper/lower case.\n" " -pslOut=psl - output a PSLs for the virtual mRNAs. Allows virtual\n" " mRNA to be analyzed by tools that work on PSLs\n" " -suffix=suf - append suffix to each id to avoid confusion with mRNAs\n" " use to define the genes.\n" " -peptides - out the translation of the CDS to a peptide sequence.\n" " -exonIndices - output indices of exon boundaries after sequence name,\n" " e.g., \"103 243 290\" says positions 1-103 are from the first exon,\n" " positions 104-243 are from the second exon, etc. \n" " -maxSize=size - output a maximum of size characters. Useful when\n" " testing gene predictions by RT-PCR.\n" " -genomeSeqs=spec - get genome sequences from the specified nib directory\n" " or 2bit file instead of going though the path found in chromInfo.\n" + " -includeCoords - include the genomic coordinates as a comment in the\n" + " fasta header. This is necessary when there are multiple genePreds\n" + " with the same name.\n" " -genePredExt - (for use with -peptides) use extended genePred format,\n" " and consider frame information when translating (Warning: only\n" " considers offset at 5' end, not frameshifts between blocks)\n" #if 0 /* Not implemented, not sure it's worth the complexity */ "If frame\n" " is in genePred and blocks don't have contiguous frame, it will output a '*'\n" " where the frameshift occured and continue to translated based on the frame\n" " specification.\n" #endif ); } static struct optionSpec options[] = { {"weird", OPTION_BOOLEAN}, {"cdsUpper", OPTION_BOOLEAN}, {"cdsOut", OPTION_STRING}, {"cdsOnly", OPTION_BOOLEAN}, {"keepMasking", OPTION_BOOLEAN}, {"pslOut", OPTION_STRING}, {"suffix", OPTION_STRING}, {"peptides", OPTION_BOOLEAN}, + {"includeCoords", OPTION_BOOLEAN}, {"exonIndices", OPTION_BOOLEAN}, {"maxSize", OPTION_INT}, {"genePredExt", OPTION_BOOLEAN}, {"genomeSeqs", OPTION_STRING}, {NULL, 0} }; /* parsed from command line */ -boolean weird, cdsUpper, cdsOnly, peptides, keepMasking, exonIndices, +boolean weird, cdsUpper, cdsOnly, peptides, keepMasking, includeCoords, exonIndices, genePredExt; char *cdsOut = NULL; char *pslOut = NULL; char *suffix = ""; int maxSize = -1; char *genomeSeqs = NULL; struct nibTwoCache *getNibTwoCacheFromDb(char *db) /* get the nib or two-bit cache from database */ { struct sqlConnection *conn = hAllocConn(db); char nibTwoPath[PATH_LEN]; struct nibTwoCache *nibTwoCache; /* grab the first chromsome file name, if it's a nib, convert to @@ -274,45 +278,46 @@ /* just overwrite the buffer with the peptide, which will stop at end of DNA * if no stop codon. Buffer size must allow for stop codon. */ int ir, ip; boolean isChrM = sameString(gp->chrom, "chrM"); for (ir = offset, ip = 0; ir < cdsBuf->stringSize; ir += 3, ip++) { cdsBuf->string[ip] = (isChrM ? lookupMitoCodon(cdsBuf->string+ir) : lookupCodon(cdsBuf->string+ir)); } cdsBuf->string[ip] = '\0'; faWriteNext(faFh, name, cdsBuf->string, strlen(cdsBuf->string)); } void processGenePred(char *db, struct genePred *gp, struct dyString *dnaBuf, - struct dyString *cdsBuf, struct dyString *indBuf, + struct dyString *cdsBuf, struct dyString *nameBuf, FILE* faFh, FILE* cdsFh, FILE* pslFh) /* output genePred DNA, check for weird splice sites if requested */ { int i; -char name[1024]; int index = 0; /* Load exons one by one into dna string. */ dyStringClear(dnaBuf); -if (exonIndices) - { - dyStringClear(indBuf); - dyStringPrintf(indBuf, " %s", db); /* we'll also include the db */ - } +dyStringClear(nameBuf); +dyStringAppend(nameBuf, gp->name); +dyStringAppend(nameBuf, suffix); +if (exonIndices || includeCoords) + dyStringPrintf(nameBuf, " %s", db); /* we'll also include the db */ +if (includeCoords) + dyStringPrintf(nameBuf, " %s:%d-%d", gp->chrom, gp->txStart, gp->txEnd); for (i=0; iexonCount; ++i) { int start = gp->exonStarts[i]; int end = gp->exonEnds[i]; int size = end - start; if (size < 0) warn("%d sized exon in %s\n", size, gp->name); else { struct dnaSeq *seq = fetchDna(db, gp->chrom, start, end, (keepMasking ? dnaMixed : dnaLower)); dyStringAppendN(dnaBuf, seq->dna, size); freeDnaSeq(&seq); } } @@ -320,170 +325,169 @@ if (gp->strand[0] == '-') reverseComplement(dnaBuf->string, dnaBuf->stringSize); /* create list of exon indices, if necessary */ if (exonIndices) { for (i = (gp->strand[0] == '-' ? gp->exonCount-1 : 0); i >= 0 && i < gp->exonCount; i += (gp->strand[0] == '-' ? -1 : 1)) /* use forward order if plus strand and reverse order if minus strand */ { index += gp->exonEnds[i] - gp->exonStarts[i]; if (maxSize != -1 && index > maxSize) index = maxSize; - dyStringPrintf(indBuf, " %d", index); + dyStringPrintf(nameBuf, " %d", index); if (index == maxSize) break; /* can only happen if maxSize != -1 */ } } if ((gp->cdsStart < gp->cdsEnd) && (cdsUpper || cdsOnly || peptides || (cdsFh != NULL))) processCds(gp, dnaBuf, cdsBuf, cdsFh); -safef(name, sizeof(name), "%s%s%s", gp->name, suffix, - exonIndices ? indBuf->string : ""); if (cdsOnly) - faWriteNext(faFh, name, cdsBuf->string, + faWriteNext(faFh, nameBuf->string, cdsBuf->string, (maxSize != -1 && cdsBuf->stringSize > maxSize) ? maxSize : cdsBuf->stringSize); else if (peptides) - outputPeptide(gp, name, cdsBuf, faFh); + outputPeptide(gp, nameBuf->string, cdsBuf, faFh); else - faWriteNext(faFh, name, dnaBuf->string, + faWriteNext(faFh, nameBuf->string, dnaBuf->string, (maxSize != -1 && dnaBuf->stringSize > maxSize) ? maxSize : dnaBuf->stringSize); if (pslFh != NULL) writePsl(db, gp, pslFh); } void getRnaForTable(char *db, char *table, char *chrom, struct dyString *dnaBuf, - struct dyString *cdsBuf, struct dyString *indBuf, + struct dyString *cdsBuf, struct dyString *nameBuf, FILE *faFh, FILE *cdsFh, FILE* pslFh) /* get RNA for a genePred table */ { int rowOffset; char **row; struct sqlConnection *conn = hAllocConn(db); struct sqlResult *sr; if (!sqlTableExists(conn, table)) errAbort("table or file \"%s\" does not exists", table); sr = hChromQuery(conn, table, chrom, NULL, &rowOffset); while ((row = sqlNextRow(sr)) != NULL) { /* Load gene prediction from database. */ struct genePred *gp; if (genePredExt) gp = genePredExtLoad(row+rowOffset, GENEPREDX_NUM_COLS); else gp = genePredLoad(row+rowOffset); if ((!weird) || hasWeirdSplice(db, gp)) - processGenePred(db, gp, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh); + processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh); genePredFree(&gp); } sqlFreeResult(&sr); hFreeConn(&conn); } void getRnaForTables(char *db, char *table, char *chrom, struct dyString *dnaBuf, - struct dyString *cdsBuf, struct dyString *indBuf, + struct dyString *cdsBuf, struct dyString *nameBuf, FILE *faFh, FILE *cdsFh, FILE* pslFh) /* get RNA for one for possibly splite genePred table */ { struct slName* chroms = NULL, *chr; if (sameString(chrom, "all")) chroms = hAllChromNames(db); else chroms = slNameNew(chrom); for (chr = chroms; chr != NULL; chr = chr->next) - getRnaForTable(db, table, chr->name, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh); + getRnaForTable(db, table, chr->name, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh); } void getRnaForFile(char *db, char *table, char *chrom, struct dyString *dnaBuf, - struct dyString *cdsBuf, struct dyString *indBuf, + struct dyString *cdsBuf, struct dyString *nameBuf, FILE *faFh, FILE* cdsFh, FILE* pslFh) /* get RNA for a genePred file */ { boolean all = sameString(chrom, "all"); struct lineFile *lf = lineFileOpen(table, TRUE); char *row[GENEPREDX_NUM_COLS]; while (lineFileNextRowTab(lf, row, genePredExt ? GENEPREDX_NUM_COLS : GENEPRED_NUM_COLS)) { struct genePred *gp; if (genePredExt) gp = genePredExtLoad(row, GENEPREDX_NUM_COLS); else gp = genePredLoad(row); if (all || sameString(gp->chrom, chrom)) - processGenePred(db, gp, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh); + processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh); genePredFree(&gp); } lineFileClose(&lf); } void getRnaPred(char *db, char *table, char *chrom, char *faOut) /* getRna - Get RNA for gene predictions and write to file. */ { struct dyString *dnaBuf = dyStringNew(16*1024); struct dyString *cdsBuf = NULL; -struct dyString *indBuf = NULL; +struct dyString *nameBuf = dyStringNew(64); FILE *faFh = mustOpen(faOut, "w"); FILE *cdsFh = NULL; FILE *pslFh = NULL; if (cdsOnly || peptides) cdsBuf = dyStringNew(16*1024); dyStringNew(16*1024); if (cdsOut != NULL) cdsFh = mustOpen(cdsOut, "w"); if (pslOut != NULL) pslFh = mustOpen(pslOut, "w"); if (exonIndices) - indBuf = dyStringNew(512); + nameBuf = dyStringNew(512); if (fileExists(table)) - getRnaForFile(db, table, chrom, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh); + getRnaForFile(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh); else - getRnaForTables(db, table, chrom, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh); + getRnaForTables(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh); dyStringFree(&dnaBuf); dyStringFree(&cdsBuf); -dyStringFree(&indBuf); +dyStringFree(&nameBuf); carefulClose(&pslFh); carefulClose(&cdsFh); carefulClose(&faFh); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 5) usage(); dnaUtilOpen(); weird = optionExists("weird"); cdsUpper = optionExists("cdsUpper"); cdsOnly = optionExists("cdsOnly"); cdsOut = optionVal("cdsOut", NULL); keepMasking = optionExists("keepMasking"); pslOut = optionVal("pslOut", NULL); suffix = optionVal("suffix", suffix); peptides = optionExists("peptides"); +includeCoords = optionExists("includeCoords"); exonIndices = optionExists("exonIndices"); maxSize = optionInt("maxSize", -1); genePredExt = optionExists("genePredExt"); if (cdsOnly && peptides) errAbort("can't specify both -cdsOnly and -peptides"); if (cdsUpper && keepMasking) errAbort("can't specify both -cdsUpper and -keepMasking"); if (exonIndices && (cdsOnly || peptides)) errAbort("can't specify -exonIndices with -cdsOnly or -peptides"); if (maxSize != -1 && peptides) errAbort("can't specify -maxSize with -peptides"); genomeSeqs = optionVal("genomeSeqs", NULL); getRnaPred(argv[1], argv[2], argv[3], argv[4]); return 0; }