src/hg/utils/refSeqGet/refSeqGet.c 1.1
1.1 2009/11/23 02:56:20 markd
added program to get consistent versions of refseq data from database
Index: src/hg/utils/refSeqGet/refSeqGet.c
===================================================================
RCS file: src/hg/utils/refSeqGet/refSeqGet.c
diff -N src/hg/utils/refSeqGet/refSeqGet.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/utils/refSeqGet/refSeqGet.c 23 Nov 2009 02:56:20 -0000 1.1
@@ -0,0 +1,274 @@
+/* refSeqGet - retrieve refseq data from the database. */
+#include "common.h"
+#include "refSeqVerInfo.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "jksql.h"
+#include "hdb.h"
+#include "psl.h"
+#include "genePred.h"
+#include "genePredReader.h"
+#include "genbank.h"
+#include "fa.h"
+
+static char const rcsid[] = "$Id$";
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+ "refSeqGet - retrieve refseq data from the database\n"
+ "usage:\n"
+ " refSeqGet db\n"
+ "\n"
+ "Get a consistent snapshot of the RefSeq data from a genome browser database.\n"
+ "All accessions will include version numbers. Only ReqSeqs for the particular\n"
+ "are retrieved.\n"
+ "\n"
+ "options:\n"
+ " -aligns=pslFile - get the PSL alignments of the RefSeqs to the genome.\n"
+ " -geneAnns=genePred - get the genome annotations in genePred format.\n"
+ " -rnaSeqs=mrnaFa - get a FASTA file of the mRNA/RNA sequences with the CDS capitalized.\n"
+ " -protSeqs=protFa - get a FASTA file of the protein sequences.\n"
+ " -metaData=metaTsv - get a tab-separated file of the meta-data for the RefSeqs\n"
+ " -getNM - get NM sequences, if neither -getNM or -getNR is specified, all are return\n"
+ " -getNR - get the NR sequences\n"
+ " -accList=accfile - only get data for this set of accessions. If version is included,\n"
+ " then only the specified version is retrieved, otherwise the version loaded is\n"
+ " retrieved. This is incompatible with -getNR or -inclNM\n");
+}
+
+/* Notes:
+ * This isn't very fast, it was done for simplicity more than speed.
+ */
+
+static struct optionSpec options[] = {
+ {"aligns", OPTION_STRING},
+ {"geneAnns", OPTION_STRING},
+ {"rnaSeqs", OPTION_STRING},
+ {"protSeqs", OPTION_STRING},
+ {"metaData", OPTION_STRING},
+ {"getNM", OPTION_BOOLEAN},
+ {"getNR", OPTION_BOOLEAN},
+ {"accList", OPTION_STRING},
+ {NULL, 0},
+};
+
+static char *addVer(char *acc, int ver, char *buf, int bufSize)
+/* construct acc.ver in buffer */
+{
+safef(buf, bufSize, "%s.%d", acc, ver);
+return buf;
+}
+
+static void processPsl(FILE *fh, struct hash *refSeqVerInfoTbl, struct psl *psl)
+/* check if a psl has been select, if so, write including version in qName */
+{
+struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, psl->qName);
+if (rsvi != NULL)
+ {
+ char buf[GENBANK_ACC_BUFSZ], *hold = psl->qName;
+ psl->qName = addVer(psl->qName, rsvi->ver, buf, sizeof(buf));
+ pslTabOut(psl, fh);
+ psl->qName = hold;
+ }
+}
+
+static void getAligns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request alignments from database */
+{
+int off = hOffsetPastBin(sqlGetDatabase(conn), NULL, "refSeqAli");
+struct sqlResult *sr = sqlGetResult(conn, "SELECT * FROM refSeqAli");
+FILE *fh = mustOpen(outFile, "w");
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+ {
+ struct psl *psl = pslLoad(row+off);
+ processPsl(fh, refSeqVerInfoTbl, psl);
+ pslFree(&psl);
+ }
+carefulClose(&fh);
+sqlFreeResult(&sr);
+}
+
+static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
+/* check if a genePred has been select, if so, write including version in name */
+{
+struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, gp->name);
+if (rsvi != NULL)
+ {
+ char buf[GENBANK_ACC_BUFSZ], *hold = gp->name;
+ gp->name = addVer(gp->name, rsvi->ver, buf, sizeof(buf));
+ genePredTabOut(gp, fh);
+ gp->name = hold;
+ }
+}
+
+static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request genePred annotations from database */
+{
+struct genePredReader *gpr = genePredReaderQuery(conn, "refGene", NULL);
+FILE *fh = mustOpen(outFile, "w");
+struct genePred *gp;
+while ((gp = genePredReaderNext(gpr)) != NULL)
+ {
+ processGenePred(fh, refSeqVerInfoTbl, gp);
+ genePredFree(&gp);
+ }
+carefulClose(&fh);
+genePredReaderFree(&gpr);
+}
+
+static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
+/* get an RNA sequence, which already includes version in name */
+{
+struct dnaSeq *seq = hGenBankGetMrnaC(conn, rsvi->acc, NULL);
+if (seq == NULL)
+ errAbort("failed to get %s from database", rsvi->acc);
+faWriteNext(fh, seq->name, seq->dna, seq->size);
+dnaSeqFree(&seq);
+}
+
+static void getRnaSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request RNA sequences from database */
+{
+FILE *fh = mustOpen(outFile, "w");
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+ {
+ processRnaSeq(fh, conn, hel->val);
+ }
+carefulClose(&fh);
+}
+
+static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
+/* get an protein sequence, which already includes version in name. Don't duplicate NPs */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT protAcc FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
+char *protAcc = sqlNeedQuickString(conn, query);
+if (isNotEmpty(protAcc) && hashLookup(doneProts, protAcc) == NULL)
+ {
+ struct dnaSeq *seq = hGenBankGetPepC(conn, protAcc, NULL);
+ if (seq == NULL)
+ errAbort("failed to get %s from database", protAcc);
+ faWriteNext(fh, seq->name, seq->dna, seq->size);
+ dnaSeqFree(&seq);
+ hashAdd(doneProts, protAcc, NULL);
+ }
+freeMem(protAcc);
+}
+
+static void getProtSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request prot sequences from database */
+{
+struct hash *doneProts = hashNew(16);
+FILE *fh = mustOpen(outFile, "w");
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+ {
+ processProtSeq(fh, conn, hel->val, doneProts);
+ }
+carefulClose(&fh);
+hashFree(&doneProts);
+}
+
+static char *getProtAccVerIf(struct sqlConnection *conn, char *mrnaAcc, char *protAcc, char *buf, int bufSize)
+/* if protAcc is not empty, get acc.version, otherwise return empty */
+{
+if (isEmpty(protAcc))
+ return "";
+else
+ {
+ int protVer = refSeqVerInfoGetVersion(protAcc, conn);
+ if (protVer == 0)
+ errAbort("no protein version entry for %s associated with %s", protAcc, mrnaAcc);
+ safef(buf, bufSize, "%s.%d", protAcc, protVer);
+ return buf;
+ }
+}
+
+static void processMetaData(FILE *fh, struct sqlConnection *conn, struct sqlConnection *conn2, struct refSeqVerInfo *rsvi)
+/* get meta data for an accession */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT name,product,protAcc,locusLinkId FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row = sqlNextRow(sr);
+if (row == NULL)
+ errAbort("no RefLink entry for %s", rsvi->acc);
+char buf[32];
+char *protAccVer = getProtAccVerIf(conn2, rsvi->acc, row[2], buf, sizeof(buf));
+fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], row[1]);
+sqlFreeResult(&sr);
+}
+
+static void getMetaData(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
+/* get request prot sequences from database */
+{
+struct sqlConnection *conn2 = sqlConnect(sqlGetDatabase(conn));
+static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "product\n";
+FILE *fh = mustOpen(outFile, "w");
+fputs(hdr, fh);
+struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
+struct hashEl *hel;
+while ((hel = hashNext(&cookie)) != NULL)
+ {
+ processMetaData(fh, conn, conn2, hel->val);
+ }
+carefulClose(&fh);
+sqlDisconnect(&conn2);
+}
+
+static void refSeqGet(char *db, char *aligns, char *geneAnns, char *rnaSeqs, char *protSeqs,
+ char *metaData, boolean getNM, boolean getNR, char *accList)
+/* refSeqGet - retrieve refseq data from the database. */
+{
+struct sqlConnection *conn = sqlConnect(db);
+struct hash *refSeqVerInfoTbl = (accList == NULL)
+ ? refSeqVerInfoFromDb(conn, getNM, getNR)
+ : refSeqVerInfoFromFile(conn, accList);
+if (aligns != NULL)
+ getAligns(conn, refSeqVerInfoTbl, aligns);
+if (geneAnns != NULL)
+ getGeneAnns(conn, refSeqVerInfoTbl, geneAnns);
+if (rnaSeqs != NULL)
+ getRnaSeqs(conn, refSeqVerInfoTbl, rnaSeqs);
+if (protSeqs != NULL)
+ getProtSeqs(conn, refSeqVerInfoTbl, protSeqs);
+if (metaData != NULL)
+ getMetaData(conn, refSeqVerInfoTbl, metaData);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 2)
+ usage();
+
+if (optionExists("accList"))
+ {
+ if (optionExists("getNM"))
+ errAbort("can't specify both -accList and -getNM");
+ if (optionExists("getNR"))
+ errAbort("can't specify both -accList and -getNR");
+ }
+boolean getNM = optionExists("getNM");
+boolean getNR = optionExists("getNR");
+if (!(getNM || getNR))
+ getNM = getNR = TRUE;
+
+refSeqGet(argv[1],
+ optionVal("aligns", NULL),
+ optionVal("geneAnns", NULL),
+ optionVal("rnaSeqs", NULL),
+ optionVal("protSeqs", NULL),
+ optionVal("metaData", NULL),
+ getNM, getNR,
+ optionVal("accList", NULL));
+return 0;
+}