src/hg/utils/refSeqGet/refSeqGet.c 1.1

1.1 2009/11/23 02:56:20 markd
added program to get consistent versions of refseq data from database
Index: src/hg/utils/refSeqGet/refSeqGet.c
===================================================================
RCS file: src/hg/utils/refSeqGet/refSeqGet.c
diff -N src/hg/utils/refSeqGet/refSeqGet.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/utils/refSeqGet/refSeqGet.c	23 Nov 2009 02:56:20 -0000	1.1
@@ -0,0 +1,274 @@
+/* refSeqGet - retrieve refseq data from the database. */
+#include "common.h"
+#include "refSeqVerInfo.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hdb.h"
+#include "psl.h"
+#include "genePred.h"
+#include "genePredReader.h"
+#include "genbank.h"
+#include "fa.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "refSeqGet - retrieve refseq data from the database\n"
+  "usage:\n"
+  "   refSeqGet db\n"
+  "\n"
+  "Get a consistent snapshot of the RefSeq data from a genome browser database.\n"
+  "All accessions will include version numbers.  Only ReqSeqs for the particular\n"
+  "are retrieved.\n"
+  "\n"
+  "options:\n"
+  "   -aligns=pslFile - get the PSL alignments of the RefSeqs to the genome.\n"
+  "   -geneAnns=genePred - get the genome annotations in genePred format.\n"
+  "   -rnaSeqs=mrnaFa - get a FASTA file of the mRNA/RNA sequences with the CDS capitalized.\n"
+  "   -protSeqs=protFa - get a FASTA file of the protein sequences.\n"
+  "   -metaData=metaTsv - get a tab-separated file of the meta-data for the RefSeqs\n"
+  "   -getNM - get NM sequences, if neither -getNM or -getNR is specified, all are return\n"
+  "   -getNR - get the NR sequences\n"
+  "   -accList=accfile - only get data for this set of accessions.  If version is included,\n"
+  "    then only the specified version is retrieved, otherwise the version loaded is\n"
+  "    retrieved.  This is incompatible with -getNR or -inclNM\n");
+}
+
+/* Notes:
+ * This isn't very fast, it was done for simplicity more than speed.
+ */
+
+static struct optionSpec options[] = {
+    {"aligns", OPTION_STRING},
+    {"geneAnns", OPTION_STRING},
+    {"rnaSeqs", OPTION_STRING},
+    {"protSeqs", OPTION_STRING},
+    {"metaData", OPTION_STRING},
+    {"getNM", OPTION_BOOLEAN},
+    {"getNR", OPTION_BOOLEAN},
+    {"accList", OPTION_STRING},
+    {NULL, 0},
+};
+
+static char *addVer(char *acc, int ver, char *buf, int bufSize)
+/* construct acc.ver in buffer */
+{
+safef(buf, bufSize, "%s.%d", acc, ver);
+return buf;
+}
+
+static void processPsl(FILE *fh, struct hash *refSeqVerInfoTbl, struct psl *psl)
+/* check if a psl has been select, if so, write including version in qName */
+{
+struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, psl->qName);
+if (rsvi != NULL)
+    {
+    char buf[GENBANK_ACC_BUFSZ], *hold = psl->qName;
+    psl->qName = addVer(psl->qName, rsvi->ver, buf, sizeof(buf));
+    pslTabOut(psl, fh);
+    psl->qName = hold;
+    }
+}
+
+static void getAligns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request alignments from database */
+{
+int off = hOffsetPastBin(sqlGetDatabase(conn), NULL, "refSeqAli");
+struct sqlResult *sr = sqlGetResult(conn, "SELECT * FROM refSeqAli");
+FILE *fh = mustOpen(outFile, "w");
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    struct psl *psl = pslLoad(row+off);
+    processPsl(fh, refSeqVerInfoTbl, psl);
+    pslFree(&psl);
+    }
+carefulClose(&fh);
+sqlFreeResult(&sr);
+}
+
+static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
+/* check if a genePred has been select, if so, write including version in name */
+{
+struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, gp->name);
+if (rsvi != NULL)
+    {
+    char buf[GENBANK_ACC_BUFSZ], *hold = gp->name;
+    gp->name = addVer(gp->name, rsvi->ver, buf, sizeof(buf));
+    genePredTabOut(gp, fh);
+    gp->name = hold;
+    }
+}
+
+static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request genePred annotations from database */
+{
+struct genePredReader *gpr = genePredReaderQuery(conn, "refGene", NULL);
+FILE *fh = mustOpen(outFile, "w");
+struct genePred *gp;
+while ((gp = genePredReaderNext(gpr)) != NULL)
+    {
+    processGenePred(fh, refSeqVerInfoTbl, gp);
+    genePredFree(&gp);
+    }
+carefulClose(&fh);
+genePredReaderFree(&gpr);
+}
+
+static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
+/* get an RNA sequence, which already includes version in name */
+{
+struct dnaSeq *seq = hGenBankGetMrnaC(conn, rsvi->acc, NULL);
+if (seq == NULL)
+    errAbort("failed to get %s from database", rsvi->acc);
+faWriteNext(fh, seq->name, seq->dna, seq->size);
+dnaSeqFree(&seq);
+}
+
+static void getRnaSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request RNA sequences from database */
+{
+FILE *fh = mustOpen(outFile, "w");
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    processRnaSeq(fh, conn, hel->val);
+    }
+carefulClose(&fh);
+}
+
+static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
+/* get an protein sequence, which already includes version in name. Don't duplicate NPs */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT protAcc FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
+char *protAcc = sqlNeedQuickString(conn, query);
+if (isNotEmpty(protAcc) && hashLookup(doneProts, protAcc) == NULL)
+    {
+    struct dnaSeq *seq = hGenBankGetPepC(conn, protAcc, NULL);
+    if (seq == NULL)
+        errAbort("failed to get %s from database", protAcc);
+    faWriteNext(fh, seq->name, seq->dna, seq->size);
+    dnaSeqFree(&seq);
+    hashAdd(doneProts, protAcc, NULL);
+    }
+freeMem(protAcc);
+}
+
+static void getProtSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request prot sequences from database */
+{
+struct hash *doneProts = hashNew(16);
+FILE *fh = mustOpen(outFile, "w");
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    processProtSeq(fh, conn, hel->val, doneProts);
+    }
+carefulClose(&fh);
+hashFree(&doneProts);
+}
+
+static char *getProtAccVerIf(struct sqlConnection *conn, char *mrnaAcc, char *protAcc, char *buf, int bufSize)
+/* if protAcc is not empty, get acc.version, otherwise return empty */
+{
+if (isEmpty(protAcc))
+    return "";
+else
+    {
+    int protVer = refSeqVerInfoGetVersion(protAcc, conn);
+    if (protVer == 0)
+        errAbort("no protein version entry for %s associated with %s", protAcc, mrnaAcc);
+    safef(buf, bufSize, "%s.%d", protAcc, protVer);
+    return buf;
+    }
+}
+
+static void processMetaData(FILE *fh, struct sqlConnection *conn, struct sqlConnection *conn2, struct refSeqVerInfo *rsvi)
+/* get meta data for an accession */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT name,product,protAcc,locusLinkId FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = sqlNextRow(sr);
+if (row == NULL)
+    errAbort("no RefLink entry for %s", rsvi->acc);
+char buf[32];
+char *protAccVer = getProtAccVerIf(conn2, rsvi->acc, row[2], buf, sizeof(buf));
+fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], row[1]);
+sqlFreeResult(&sr);
+}
+
+static void getMetaData(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request prot sequences from database */
+{
+struct sqlConnection *conn2 = sqlConnect(sqlGetDatabase(conn));
+static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "product\n";
+FILE *fh = mustOpen(outFile, "w");
+fputs(hdr, fh);
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+    {
+    processMetaData(fh, conn, conn2, hel->val);
+    }
+carefulClose(&fh);
+sqlDisconnect(&conn2);
+}
+
+static void refSeqGet(char *db, char *aligns, char *geneAnns, char *rnaSeqs, char *protSeqs,
+                      char *metaData, boolean getNM, boolean getNR, char *accList)
+/* refSeqGet - retrieve refseq data from the database. */
+{
+struct sqlConnection *conn = sqlConnect(db);
+struct hash *refSeqVerInfoTbl = (accList == NULL)
+    ? refSeqVerInfoFromDb(conn, getNM, getNR)
+    : refSeqVerInfoFromFile(conn, accList);
+if (aligns != NULL)
+    getAligns(conn, refSeqVerInfoTbl, aligns);
+if (geneAnns != NULL)
+    getGeneAnns(conn, refSeqVerInfoTbl, geneAnns);
+if (rnaSeqs != NULL)
+    getRnaSeqs(conn, refSeqVerInfoTbl, rnaSeqs);
+if (protSeqs != NULL)
+    getProtSeqs(conn, refSeqVerInfoTbl, protSeqs);
+if (metaData != NULL)
+    getMetaData(conn, refSeqVerInfoTbl, metaData);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+    usage();
+
+if (optionExists("accList"))
+    {
+    if (optionExists("getNM"))
+        errAbort("can't specify both -accList and -getNM");
+    if (optionExists("getNR"))
+        errAbort("can't specify both -accList and -getNR");
+    }
+boolean getNM = optionExists("getNM");
+boolean getNR = optionExists("getNR");
+if (!(getNM || getNR))
+    getNM = getNR = TRUE;
+
+refSeqGet(argv[1],
+          optionVal("aligns", NULL),
+          optionVal("geneAnns", NULL),
+          optionVal("rnaSeqs", NULL),
+          optionVal("protSeqs", NULL),
+          optionVal("metaData", NULL),
+          getNM, getNR,
+          optionVal("accList", NULL));
+return 0;
+}