0f258c2453300987f007160676a1a64ddca7f47f
markd
  Mon Jan 18 17:38:34 2016 -0800
added ability to request specific version in getRna and generate error if that version isn't available

diff --git src/hg/getRna/getRna.c src/hg/getRna/getRna.c
index 706606d..7811929 100644
--- src/hg/getRna/getRna.c
+++ src/hg/getRna/getRna.c
@@ -1,205 +1,267 @@
 /* getRna - get mrna sequences  */
 
 /* Copyright (C) 2013 The Regents of the University of California 
  * See README in this or parent directory for licensing information. */
 #include "common.h"
 #include "options.h"
 #include "hdb.h"
 #include "jksql.h"
 #include "genbank.h"
 #include "linefile.h"
 #include "fa.h"
 
 
 /* command line option specifications */
 static struct optionSpec optionSpecs[] = {
     {"cdsUpper", OPTION_BOOLEAN},
     {"cdsUpperAll", OPTION_BOOLEAN},
     {"inclVer", OPTION_BOOLEAN},
     {"peptides", OPTION_BOOLEAN},
     {NULL, 0}
 };
 
 /* command line options */
-boolean cdsUpper = FALSE;
-boolean cdsUpperAll = FALSE;
-boolean inclVer = FALSE;
-boolean peptides = FALSE;
+static boolean cdsUpper = FALSE;
+static boolean cdsUpperAll = FALSE;
+static boolean inclVer = FALSE;
+static boolean peptides = FALSE;
 
-void usage()
+/* derived from command line, it clearer as -cdsUpperAll and -peptides defines
+ * multiple behaviors */
+boolean warnOnNoCds = FALSE;
+boolean skipNoCds = FALSE;
+
+static int errCnt = 0;
+
+static void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "getRna - Get mrna for GenBank or RefSeq sequences found in a database\n"
   "usage:\n"
   "   getRna [options] database accFile outfa\n"
   "\n"
-  "Get mrna for all accessions in accFile, writing to a fasta file. \n"
+  "Get mrna for all accessions in accFile, writing to a fasta file. If accession\n"
+  " has a version, that version is returned or an error generated\n"
   "\n"
   "Options:\n"
   "  -cdsUpper - lookup CDS and output it as upper case. If CDS annotation\n"
   "    can't be obtained, the sequence is skipped with a warning.\n"
   "  -cdsUpperAll - like -cdsUpper, except keep sequeneces without CDS\n"
   "  -inclVer - include version with sequence id.\n"
   "  -peptides - translate mRNAs to peptides\n"
   "\n");
 }
 
-char *getCdsString(struct sqlConnection *conn, char *acc)
+static void parseAccVersion(char* requestedAcc,
+                            char acc[GENBANK_ACC_BUFSZ],
+                            char ver[GENBANK_ACC_BUFSZ])
+/* parse accession and optional version */
+{
+char* verDot = strchr(requestedAcc, '.');
+if (verDot != NULL)
+    {
+    genbankDropVer(acc, requestedAcc);
+    safecpy(ver, GENBANK_ACC_BUFSZ, verDot+1);
+    }
+else
+    {
+    safecpy(acc, GENBANK_ACC_BUFSZ, requestedAcc);
+    ver[0] = '\0';
+    }
+}
+
+static struct dnaSeq* getSeq(struct sqlConnection *conn, char* acc)
+/* get sequence from fasta file */
+{
+HGID seqId;
+char *faSeq = hGetSeqAndId(conn, acc, &seqId);
+if (faSeq == NULL)
+    {
+    fprintf(stderr, "%s\tsequence not in database\n", acc);
+    errCnt++;
+    return NULL;
+    }
+return faFromMemText(faSeq);
+}
+
+static char *getCdsString(struct sqlConnection *conn, char *acc)
 /* get the CDS string for an accession, or null if not found */
 {
 char query[256];
 
 sqlSafef(query, sizeof(query),
       "SELECT cds.name FROM gbCdnaInfo,cds WHERE (gbCdnaInfo.acc = '%s') AND (gbCdnaInfo.cds != 0) AND (gbCdnaInfo.cds = cds.id)",
       acc);
 return sqlQuickString(conn, query);
 }
 
-boolean getCds(struct sqlConnection *conn, char *acc, int mrnaLen,
-               boolean warnOnNoCds, struct genbankCds *cds)
+static boolean getCds(struct sqlConnection *conn, char *acc, int mrnaLen,
+                      struct genbankCds *cds)
 /* get the CDS range for an mRNA, warning and return FALSE if can't obtain
  * CDS or it is longer than mRNA. */
 {
 char *cdsStr = getCdsString(conn, acc);
 
 if (cdsStr == NULL)
     {
     if (warnOnNoCds)
         fprintf(stderr, "%s\tno CDS defined\n", acc);
     return FALSE;
     }
 if (!genbankCdsParse(cdsStr, cds))
     {
     if (warnOnNoCds)
         fprintf(stderr, "%s\tcan't parse CDS: %s\n", acc, cdsStr);
     free(cdsStr);
     return FALSE;
     }
 if (cds->end > mrnaLen)
     {
     if (warnOnNoCds)
         fprintf(stderr, "%s\tCDS exceeds mRNA length: %s\n", acc, cdsStr);
     free(cdsStr);
     return FALSE;
     }
 free(cdsStr);
 return TRUE;
 }
 
-void upperCaseCds(struct dnaSeq *dna, struct genbankCds *cds)
+static void upperCaseCds(struct dnaSeq *dna, struct genbankCds *cds)
 /* uppercase the CDNS */
 {
 tolowers(dna->dna);
 toUpperN(dna->dna+cds->start, (cds->end-cds->start));
 }
 
 int getVersion(struct sqlConnection *conn, char *acc)
 /* get version for acc, or 0 if not found */
 {
 char query[256];
 
 sqlSafef(query, sizeof(query),
       "SELECT version FROM gbCdnaInfo WHERE (gbCdnaInfo.acc = '%s')",
       acc);
 return sqlQuickNum(conn, query);
 }
 
-void writePeptide(FILE *outFa, char *acc, struct dnaSeq *dna, struct genbankCds *cds)
+static void processVersion(struct sqlConnection *conn, char acc[GENBANK_ACC_BUFSZ],
+                           char ver[GENBANK_ACC_BUFSZ])
+/* If the acc has a version, check that the correct version was included.  If
+ * a version is requested in the fasta, update acc argument. */
+{
+int dbVerNum = getVersion(conn, acc);
+char dbVerStr[32];
+safef(dbVerStr, sizeof(dbVerStr), "%d", dbVerNum);
+if ((strlen(ver) > 0) && !sameString(dbVerStr, ver))
+    {
+    fprintf(stderr, "%s.%s requested, database has version %s\n", acc, ver, dbVerStr);
+    errCnt++;
+    }
+if (inclVer)
+    {
+    char accTmp[GENBANK_ACC_BUFSZ];
+    safef(accTmp, sizeof(accTmp), "%s.%d", acc, dbVerNum);
+    safecpy(acc, GENBANK_ACC_BUFSZ, accTmp);
+    }
+}
+
+
+static void writePeptide(FILE *outFa, char *acc, struct dnaSeq *dna, struct genbankCds *cds)
 /* translate the sequence to a peptide and output */
 {
 char *pep = needMem(dna->size); /* more than needed */
 char hold = dna->dna[cds->end];
 dna->dna[cds->end] = '\0';
 dnaTranslateSome(dna->dna+cds->start, pep, dna->size);
 dna->dna[cds->end] = hold;
 faWriteNext(outFa, acc, pep, strlen(pep));
 freeMem(pep);
 }
 
-void getAccMrna(char *acc, struct sqlConnection *conn, FILE *outFa)
+static void getAccMrna(char *requestedAcc, struct sqlConnection *conn, FILE *outFa)
 /* get mrna for an accession */
 {
-HGID seqId;
-char *faSeq;
-struct dnaSeq *dna;
+char acc[GENBANK_ACC_BUFSZ];
+char ver[GENBANK_ACC_BUFSZ];
 boolean cdsOk = TRUE;
-char accBuf[512];
 struct genbankCds cds;
 
-faSeq = hGetSeqAndId(conn, acc, &seqId);
-if (faSeq == NULL)
-    {
-    fprintf(stderr, "%s\tsequence not in database\n", acc);
+parseAccVersion(requestedAcc, acc, ver);
+
+struct dnaSeq *dna = getSeq(conn, acc);
+if (dna == NULL)
     return;  
-    }
-dna = faFromMemText(faSeq);
 
 if (cdsUpper || peptides)
-    cdsOk = getCds(conn, acc, dna->size, !cdsUpperAll, &cds);
+    {
+    cdsOk = getCds(conn, acc, dna->size, &cds);
+    if ((!cdsOk) && skipNoCds)
+        {
+        dnaSeqFree(&dna);
+        return;
+        }
+    }
 
 if (cdsOk && cdsUpper)
     upperCaseCds(dna, &cds);
-if ((cdsOk || cdsUpperAll) && inclVer)
-    {
-    int ver = getVersion(conn, acc);
-    safef(accBuf, sizeof(accBuf), "%s.%d", acc, ver);
-    acc = accBuf;
-    }
+if (inclVer || (strlen(ver) > 0))
+    processVersion(conn, acc, ver);
 
-if ((cdsOk || cdsUpperAll))
-    {
 if (peptides)
     writePeptide(outFa, acc, dna, &cds);
 else
     faWriteNext(outFa, acc, dna->dna, dna->size);
-    }
 
 dnaSeqFree(&dna);
 }
 
-void getRna(char *database, char *accFile, char *outFaFile)
+static void getRna(char *database, char *accFile, char *outFaFile)
 /* obtain mrna for a list of accessions */
 {
 struct sqlConnection *conn = sqlConnect(database);
 struct lineFile *accLf = lineFileOpen(accFile, TRUE); 
 FILE *outFa = mustOpen(outFaFile, "w"); 
 char *line;
 int lineSize;
 
 while (lineFileNext(accLf, &line, &lineSize))
     {
     getAccMrna(trimSpaces(line), conn, outFa);
     }
 
 if (ferror(outFa))
     errAbort("error writing %s", outFaFile);
 carefulClose(&outFa);
 lineFileClose(&accLf);
 sqlDisconnect(&conn);
 
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 char *database, *accFile, *outFaFile;
 
 optionInit(&argc, argv, optionSpecs);
 if (argc != 4)
     usage();
 database = argv[1];
 accFile = argv[2];
 outFaFile = argv[3];
 cdsUpper = optionExists("cdsUpper");
 cdsUpperAll = optionExists("cdsUpperAll");
+warnOnNoCds = !cdsUpperAll;
+skipNoCds = cdsUpper;
 if (cdsUpperAll)
     cdsUpper = TRUE;
 inclVer = optionExists("inclVer");
 peptides = optionExists("peptides");
+if (peptides)
+    skipNoCds = TRUE;
 if (peptides && (cdsUpper || cdsUpperAll))
     errAbort("can't specify -peptides with -cdsUpper or -cdsUpperAll");
 getRna(database, accFile, outFaFile);
-return 0;
+return (errCnt == 0) ? 0 : 1;
 }