src/hg/getRnaPred/getRnaPred.c 5057da35c701d94acdf094a57613921b87058f62

5057da35c701d94acdf094a57613921b87058f62
markd
  Sat Mar 10 08:46:20 2012 -0800
added option to include coordinates in fasta header
diff --git src/hg/getRnaPred/getRnaPred.c src/hg/getRnaPred/getRnaPred.c
index c450950..ab7413d 100644
--- src/hg/getRnaPred/getRnaPred.c
+++ src/hg/getRnaPred/getRnaPred.c
@@ -29,62 +29,66 @@
   "      acc  start  end\n"
   "    where start..end are genbank style, one-based coordinates\n"
   "   -keepMasking - un/masked in upper/lower case.\n"
   "   -pslOut=psl - output a PSLs for the virtual mRNAs.  Allows virtual\n"
   "    mRNA to be analyzed by tools that work on PSLs\n"
   "   -suffix=suf - append suffix to each id to avoid confusion with mRNAs\n"
   "    use to define the genes.\n"
   "   -peptides - out the translation of the CDS to a peptide sequence.\n"
   "   -exonIndices - output indices of exon boundaries after sequence name,\n"
   "    e.g., \"103 243 290\" says positions 1-103 are from the first exon,\n"
   "    positions 104-243 are from the second exon, etc. \n"
   "   -maxSize=size - output a maximum of size characters.  Useful when\n"
   "    testing gene predictions by RT-PCR.\n"
   "   -genomeSeqs=spec - get genome sequences from the specified nib directory\n"
   "    or 2bit file instead of going though the path found in chromInfo.\n"
+  "   -includeCoords - include the genomic coordinates as a comment in the\n"
+  "    fasta header.  This is necessary when there are multiple genePreds\n"
+  "    with the same name.\n"
   "   -genePredExt - (for use with -peptides) use extended genePred format,\n"
   "    and consider frame information when translating (Warning: only\n"
   "    considers offset at 5' end, not frameshifts between blocks)\n"
 #if 0
   /* Not implemented, not sure it's worth the complexity */
   "If frame\n"
   "    is in genePred and blocks don't have contiguous frame, it will output a '*'\n"
   "    where the frameshift occured and continue to translated based on the frame\n"
   "    specification.\n"
 #endif
   );
 }
 
 static struct optionSpec options[] = {
    {"weird", OPTION_BOOLEAN},
    {"cdsUpper", OPTION_BOOLEAN},
    {"cdsOut", OPTION_STRING},
    {"cdsOnly", OPTION_BOOLEAN},
    {"keepMasking", OPTION_BOOLEAN},
    {"pslOut", OPTION_STRING},
    {"suffix", OPTION_STRING},
    {"peptides", OPTION_BOOLEAN},
+   {"includeCoords", OPTION_BOOLEAN},
    {"exonIndices", OPTION_BOOLEAN},
    {"maxSize", OPTION_INT},
    {"genePredExt", OPTION_BOOLEAN},
    {"genomeSeqs", OPTION_STRING},
    {NULL, 0}
 };
 
 
 /* parsed from command line */
-boolean weird, cdsUpper, cdsOnly, peptides, keepMasking, exonIndices, 
+boolean weird, cdsUpper, cdsOnly, peptides, keepMasking, includeCoords, exonIndices,
   genePredExt;
 char *cdsOut = NULL;
 char *pslOut = NULL;
 char *suffix = "";
 int maxSize = -1;
 char *genomeSeqs = NULL;
 
 struct nibTwoCache *getNibTwoCacheFromDb(char *db)
 /* get the nib or two-bit cache from database */
 {
 struct sqlConnection *conn = hAllocConn(db);
 char nibTwoPath[PATH_LEN];
 struct nibTwoCache *nibTwoCache;
 
 /* grab the first chromsome file name, if it's a nib, convert to
@@ -274,45 +278,46 @@
 /* just overwrite the buffer with the peptide, which will stop at end of DNA
  * if no stop codon. Buffer size must allow for stop codon. */
 int ir, ip;
 boolean isChrM = sameString(gp->chrom, "chrM");
 for (ir = offset, ip = 0; ir < cdsBuf->stringSize; ir += 3, ip++)
     {
     cdsBuf->string[ip] = (isChrM ? lookupMitoCodon(cdsBuf->string+ir)
                           : lookupCodon(cdsBuf->string+ir));
     }
 cdsBuf->string[ip] = '\0';
   
 faWriteNext(faFh, name, cdsBuf->string, strlen(cdsBuf->string));
 }
 
 void processGenePred(char *db, struct genePred *gp, struct dyString *dnaBuf, 
-                     struct dyString *cdsBuf, struct dyString *indBuf,
+                     struct dyString *cdsBuf, struct dyString *nameBuf,
                      FILE* faFh, FILE* cdsFh, FILE* pslFh)
 /* output genePred DNA, check for weird splice sites if requested */
 {
 int i;
-char name[1024];
 int index = 0;
 
 /* Load exons one by one into dna string. */
 dyStringClear(dnaBuf);
-if (exonIndices)
-    {
-    dyStringClear(indBuf);
-    dyStringPrintf(indBuf, " %s", db);  /* we'll also include the db */
-    }
+dyStringClear(nameBuf);
+dyStringAppend(nameBuf, gp->name);
+dyStringAppend(nameBuf, suffix);
+if (exonIndices || includeCoords)
+    dyStringPrintf(nameBuf, " %s", db);  /* we'll also include the db */
+if (includeCoords)
+    dyStringPrintf(nameBuf, " %s:%d-%d", gp->chrom, gp->txStart, gp->txEnd);
 for (i=0; i<gp->exonCount; ++i)
     {
     int start = gp->exonStarts[i];
     int end = gp->exonEnds[i];
     int size = end - start;
     if (size < 0)
         warn("%d sized exon in %s\n", size, gp->name);
     else
         {
         struct dnaSeq *seq = fetchDna(db, gp->chrom, start, end, (keepMasking ? dnaMixed : dnaLower));
         dyStringAppendN(dnaBuf, seq->dna, size);
         freeDnaSeq(&seq);
         }
     }
 
@@ -320,170 +325,169 @@
 if (gp->strand[0] == '-')
     reverseComplement(dnaBuf->string, dnaBuf->stringSize);
 
 /* create list of exon indices, if necessary */
 if (exonIndices) 
     {
     for (i = (gp->strand[0] == '-' ? gp->exonCount-1 : 0); 
          i >= 0 && i < gp->exonCount; 
          i += (gp->strand[0] == '-' ? -1 : 1))
                                 /* use forward order if plus strand
                                    and reverse order if minus
                                    strand */
         {
         index += gp->exonEnds[i] - gp->exonStarts[i];
         if (maxSize != -1 && index > maxSize) index = maxSize;
-        dyStringPrintf(indBuf, " %d", index);
+        dyStringPrintf(nameBuf, " %d", index);
         if (index == maxSize) break; /* can only happen if maxSize != -1 */
         }
     } 
 
 if ((gp->cdsStart < gp->cdsEnd)
     && (cdsUpper || cdsOnly || peptides || (cdsFh != NULL)))
     processCds(gp, dnaBuf, cdsBuf, cdsFh);
 
-safef(name, sizeof(name), "%s%s%s", gp->name, suffix, 
-      exonIndices ? indBuf->string : "");
 if (cdsOnly)
-    faWriteNext(faFh, name, cdsBuf->string, 
+    faWriteNext(faFh, nameBuf->string, cdsBuf->string, 
                 (maxSize != -1 && cdsBuf->stringSize > maxSize) ? maxSize : 
                 cdsBuf->stringSize);
 else if (peptides)
-    outputPeptide(gp, name, cdsBuf, faFh);
+    outputPeptide(gp, nameBuf->string, cdsBuf, faFh);
 else
-    faWriteNext(faFh, name, dnaBuf->string, 
+    faWriteNext(faFh, nameBuf->string, dnaBuf->string, 
                 (maxSize != -1 && dnaBuf->stringSize > maxSize) ? maxSize : 
                 dnaBuf->stringSize);
 
 if (pslFh != NULL)
     writePsl(db, gp, pslFh);
 }
 
 void getRnaForTable(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
-                    struct dyString *cdsBuf, struct dyString *indBuf,
+                    struct dyString *cdsBuf, struct dyString *nameBuf,
                     FILE *faFh, FILE *cdsFh, FILE* pslFh)
 /* get RNA for a genePred table */
 {
 int rowOffset;
 char **row;
 struct sqlConnection *conn = hAllocConn(db);
 struct sqlResult *sr;
 
 if (!sqlTableExists(conn, table))
     errAbort("table or file \"%s\" does not exists", table);
 
 sr = hChromQuery(conn, table, chrom, NULL, &rowOffset);
 while ((row = sqlNextRow(sr)) != NULL)
     {
     /* Load gene prediction from database. */
     struct genePred *gp;
     if (genePredExt)
       gp = genePredExtLoad(row+rowOffset, GENEPREDX_NUM_COLS);
     else
       gp = genePredLoad(row+rowOffset);
     if ((!weird) || hasWeirdSplice(db, gp))
-        processGenePred(db, gp, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh);
+        processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
     genePredFree(&gp);
     }
 sqlFreeResult(&sr);
 hFreeConn(&conn);
 }
 
 void getRnaForTables(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
-                     struct dyString *cdsBuf, struct dyString *indBuf,
+                     struct dyString *cdsBuf, struct dyString *nameBuf,
                      FILE *faFh, FILE *cdsFh, FILE* pslFh)
 /* get RNA for one for possibly splite genePred table */
 {
 struct slName* chroms = NULL, *chr;
 if (sameString(chrom, "all"))
     chroms = hAllChromNames(db);
 else
     chroms = slNameNew(chrom);
 for (chr = chroms; chr != NULL; chr = chr->next)
-    getRnaForTable(db, table, chr->name, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh);
+    getRnaForTable(db, table, chr->name, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 }
 
 void getRnaForFile(char *db, char *table, char *chrom, struct dyString *dnaBuf, 
-                   struct dyString *cdsBuf, struct dyString *indBuf,
+                   struct dyString *cdsBuf, struct dyString *nameBuf,
                    FILE *faFh, FILE* cdsFh, FILE* pslFh)
 /* get RNA for a genePred file */
 {
 boolean all = sameString(chrom, "all");
 struct lineFile *lf = lineFileOpen(table, TRUE);
 char *row[GENEPREDX_NUM_COLS];
 while (lineFileNextRowTab(lf, row, genePredExt ? 
                           GENEPREDX_NUM_COLS : GENEPRED_NUM_COLS))
     {
     struct genePred *gp;
     if (genePredExt) 
       gp = genePredExtLoad(row, GENEPREDX_NUM_COLS);
     else 
       gp = genePredLoad(row);
     if (all || sameString(gp->chrom, chrom))
-        processGenePred(db, gp, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh);
+        processGenePred(db, gp, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
     genePredFree(&gp);
     }
 lineFileClose(&lf); 
 }
 
 void getRnaPred(char *db, char *table, char *chrom, char *faOut)
 /* getRna - Get RNA for gene predictions and write to file. */
 {
 struct dyString *dnaBuf = dyStringNew(16*1024);
 struct dyString *cdsBuf = NULL;
-struct dyString *indBuf = NULL;
+struct dyString *nameBuf = dyStringNew(64);
 FILE *faFh = mustOpen(faOut, "w");
 FILE *cdsFh = NULL;
 FILE *pslFh = NULL;
 if (cdsOnly || peptides)
     cdsBuf = dyStringNew(16*1024);
 dyStringNew(16*1024);
 if (cdsOut != NULL)
     cdsFh = mustOpen(cdsOut, "w");
 if (pslOut != NULL)
     pslFh = mustOpen(pslOut, "w");
 if (exonIndices)
-   indBuf = dyStringNew(512);
+   nameBuf = dyStringNew(512);
 
 if (fileExists(table))
-    getRnaForFile(db, table, chrom, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh);
+    getRnaForFile(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 else
-    getRnaForTables(db, table, chrom, dnaBuf, cdsBuf, indBuf, faFh, cdsFh, pslFh);
+    getRnaForTables(db, table, chrom, dnaBuf, cdsBuf, nameBuf, faFh, cdsFh, pslFh);
 
 dyStringFree(&dnaBuf);
 dyStringFree(&cdsBuf);
-dyStringFree(&indBuf);
+dyStringFree(&nameBuf);
 carefulClose(&pslFh);
 carefulClose(&cdsFh);
 carefulClose(&faFh);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 dnaUtilOpen();
 weird = optionExists("weird");
 cdsUpper = optionExists("cdsUpper");
 cdsOnly = optionExists("cdsOnly");
 cdsOut = optionVal("cdsOut", NULL); 
 keepMasking = optionExists("keepMasking"); 
 pslOut = optionVal("pslOut", NULL); 
 suffix = optionVal("suffix", suffix); 
 peptides = optionExists("peptides");
+includeCoords = optionExists("includeCoords");
 exonIndices = optionExists("exonIndices");
 maxSize = optionInt("maxSize", -1);
 genePredExt = optionExists("genePredExt");
 if (cdsOnly && peptides)
     errAbort("can't specify both -cdsOnly and -peptides");
 if (cdsUpper && keepMasking)
     errAbort("can't specify both -cdsUpper and -keepMasking");
 if (exonIndices && (cdsOnly || peptides))
     errAbort("can't specify -exonIndices with -cdsOnly or -peptides");
 if (maxSize != -1 && peptides)
     errAbort("can't specify -maxSize with -peptides");
 genomeSeqs = optionVal("genomeSeqs", NULL);
 getRnaPred(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }