src/hg/utils/refSeqGet/refSeqGet.c 1.2
1.2 2009/11/23 17:45:19 markd
added CDS to meta data
Index: src/hg/utils/refSeqGet/refSeqGet.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/refSeqGet/refSeqGet.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/utils/refSeqGet/refSeqGet.c 23 Nov 2009 02:56:20 -0000 1.1
+++ src/hg/utils/refSeqGet/refSeqGet.c 23 Nov 2009 17:45:19 -0000 1.2
@@ -1,274 +1,286 @@
/* refSeqGet - retrieve refseq data from the database. */
#include "common.h"
#include "refSeqVerInfo.h"
#include "linefile.h"
#include "hash.h"
#include "options.h"
#include "jksql.h"
#include "hdb.h"
#include "psl.h"
#include "genePred.h"
#include "genePredReader.h"
#include "genbank.h"
#include "fa.h"
static char const rcsid[] = "$Id$";
void usage()
/* Explain usage and exit. */
{
errAbort(
"refSeqGet - retrieve refseq data from the database\n"
"usage:\n"
" refSeqGet db\n"
"\n"
"Get a consistent snapshot of the RefSeq data from a genome browser database.\n"
"All accessions will include version numbers. Only ReqSeqs for the particular\n"
"are retrieved.\n"
"\n"
"options:\n"
" -aligns=pslFile - get the PSL alignments of the RefSeqs to the genome.\n"
" -geneAnns=genePred - get the genome annotations in genePred format.\n"
" -rnaSeqs=mrnaFa - get a FASTA file of the mRNA/RNA sequences with the CDS capitalized.\n"
" -protSeqs=protFa - get a FASTA file of the protein sequences.\n"
" -metaData=metaTsv - get a tab-separated file of the meta-data for the RefSeqs\n"
" -getNM - get NM sequences, if neither -getNM or -getNR is specified, all are return\n"
" -getNR - get the NR sequences\n"
" -accList=accfile - only get data for this set of accessions. If version is included,\n"
" then only the specified version is retrieved, otherwise the version loaded is\n"
" retrieved. This is incompatible with -getNR or -inclNM\n");
}
/* Notes:
* This isn't very fast, it was done for simplicity more than speed.
*/
static struct optionSpec options[] = {
{"aligns", OPTION_STRING},
{"geneAnns", OPTION_STRING},
{"rnaSeqs", OPTION_STRING},
{"protSeqs", OPTION_STRING},
{"metaData", OPTION_STRING},
{"getNM", OPTION_BOOLEAN},
{"getNR", OPTION_BOOLEAN},
{"accList", OPTION_STRING},
{NULL, 0},
};
static char *addVer(char *acc, int ver, char *buf, int bufSize)
/* construct acc.ver in buffer */
{
safef(buf, bufSize, "%s.%d", acc, ver);
return buf;
}
static void processPsl(FILE *fh, struct hash *refSeqVerInfoTbl, struct psl *psl)
/* check if a psl has been select, if so, write including version in qName */
{
struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, psl->qName);
if (rsvi != NULL)
{
char buf[GENBANK_ACC_BUFSZ], *hold = psl->qName;
psl->qName = addVer(psl->qName, rsvi->ver, buf, sizeof(buf));
pslTabOut(psl, fh);
psl->qName = hold;
}
}
static void getAligns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* get request alignments from database */
{
int off = hOffsetPastBin(sqlGetDatabase(conn), NULL, "refSeqAli");
struct sqlResult *sr = sqlGetResult(conn, "SELECT * FROM refSeqAli");
FILE *fh = mustOpen(outFile, "w");
char **row;
while ((row = sqlNextRow(sr)) != NULL)
{
struct psl *psl = pslLoad(row+off);
processPsl(fh, refSeqVerInfoTbl, psl);
pslFree(&psl);
}
carefulClose(&fh);
sqlFreeResult(&sr);
}
static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
/* check if a genePred has been select, if so, write including version in name */
{
struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, gp->name);
if (rsvi != NULL)
{
char buf[GENBANK_ACC_BUFSZ], *hold = gp->name;
gp->name = addVer(gp->name, rsvi->ver, buf, sizeof(buf));
genePredTabOut(gp, fh);
gp->name = hold;
}
}
static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* get request genePred annotations from database */
{
struct genePredReader *gpr = genePredReaderQuery(conn, "refGene", NULL);
FILE *fh = mustOpen(outFile, "w");
struct genePred *gp;
while ((gp = genePredReaderNext(gpr)) != NULL)
{
processGenePred(fh, refSeqVerInfoTbl, gp);
genePredFree(&gp);
}
carefulClose(&fh);
genePredReaderFree(&gpr);
}
static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
/* get an RNA sequence, which already includes version in name */
{
struct dnaSeq *seq = hGenBankGetMrnaC(conn, rsvi->acc, NULL);
if (seq == NULL)
errAbort("failed to get %s from database", rsvi->acc);
faWriteNext(fh, seq->name, seq->dna, seq->size);
dnaSeqFree(&seq);
}
static void getRnaSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* get request RNA sequences from database */
{
FILE *fh = mustOpen(outFile, "w");
struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
{
processRnaSeq(fh, conn, hel->val);
}
carefulClose(&fh);
}
static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
/* get an protein sequence, which already includes version in name. Don't duplicate NPs */
{
char query[128];
safef(query, sizeof(query), "SELECT protAcc FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
char *protAcc = sqlNeedQuickString(conn, query);
if (isNotEmpty(protAcc) && hashLookup(doneProts, protAcc) == NULL)
{
struct dnaSeq *seq = hGenBankGetPepC(conn, protAcc, NULL);
if (seq == NULL)
errAbort("failed to get %s from database", protAcc);
faWriteNext(fh, seq->name, seq->dna, seq->size);
dnaSeqFree(&seq);
hashAdd(doneProts, protAcc, NULL);
}
freeMem(protAcc);
}
static void getProtSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* get request prot sequences from database */
{
struct hash *doneProts = hashNew(16);
FILE *fh = mustOpen(outFile, "w");
struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
{
processProtSeq(fh, conn, hel->val, doneProts);
}
carefulClose(&fh);
hashFree(&doneProts);
}
static char *getProtAccVerIf(struct sqlConnection *conn, char *mrnaAcc, char *protAcc, char *buf, int bufSize)
/* if protAcc is not empty, get acc.version, otherwise return empty */
{
if (isEmpty(protAcc))
return "";
else
{
int protVer = refSeqVerInfoGetVersion(protAcc, conn);
if (protVer == 0)
errAbort("no protein version entry for %s associated with %s", protAcc, mrnaAcc);
safef(buf, bufSize, "%s.%d", protAcc, protVer);
return buf;
}
}
+static char *getCds(struct sqlConnection *conn, char *acc)
+/* get CDS for an NM, results should be freed */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT cds.name FROM gbCdnaInfo,cds WHERE (gbCdnaInfo.acc = \"%s\") AND (gbCdnaInfo.cds = cds.id)", acc);
+return sqlNeedQuickString(conn, query);
+}
+
static void processMetaData(FILE *fh, struct sqlConnection *conn, struct sqlConnection *conn2, struct refSeqVerInfo *rsvi)
/* get meta data for an accession */
{
+boolean isCoding = genbankIsRefSeqCodingMRnaAcc(rsvi->acc);
char query[128];
safef(query, sizeof(query), "SELECT name,product,protAcc,locusLinkId FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
struct sqlResult *sr = sqlGetResult(conn, query);
char **row = sqlNextRow(sr);
if (row == NULL)
errAbort("no RefLink entry for %s", rsvi->acc);
char buf[32];
char *protAccVer = getProtAccVerIf(conn2, rsvi->acc, row[2], buf, sizeof(buf));
-fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], row[1]);
+char *cds = isCoding ? getCds(conn2, rsvi->acc) : "";
+fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], cds, row[1]);
sqlFreeResult(&sr);
+if (isCoding)
+ freeMem(cds);
}
static void getMetaData(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
/* get request prot sequences from database */
{
struct sqlConnection *conn2 = sqlConnect(sqlGetDatabase(conn));
-static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "product\n";
+static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "cds\t" "product\n";
FILE *fh = mustOpen(outFile, "w");
fputs(hdr, fh);
struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
struct hashEl *hel;
while ((hel = hashNext(&cookie)) != NULL)
{
processMetaData(fh, conn, conn2, hel->val);
}
carefulClose(&fh);
sqlDisconnect(&conn2);
}
static void refSeqGet(char *db, char *aligns, char *geneAnns, char *rnaSeqs, char *protSeqs,
char *metaData, boolean getNM, boolean getNR, char *accList)
/* refSeqGet - retrieve refseq data from the database. */
{
struct sqlConnection *conn = sqlConnect(db);
struct hash *refSeqVerInfoTbl = (accList == NULL)
? refSeqVerInfoFromDb(conn, getNM, getNR)
: refSeqVerInfoFromFile(conn, accList);
if (aligns != NULL)
getAligns(conn, refSeqVerInfoTbl, aligns);
if (geneAnns != NULL)
getGeneAnns(conn, refSeqVerInfoTbl, geneAnns);
if (rnaSeqs != NULL)
getRnaSeqs(conn, refSeqVerInfoTbl, rnaSeqs);
if (protSeqs != NULL)
getProtSeqs(conn, refSeqVerInfoTbl, protSeqs);
if (metaData != NULL)
getMetaData(conn, refSeqVerInfoTbl, metaData);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 2)
usage();
if (optionExists("accList"))
{
if (optionExists("getNM"))
errAbort("can't specify both -accList and -getNM");
if (optionExists("getNR"))
errAbort("can't specify both -accList and -getNR");
}
boolean getNM = optionExists("getNM");
boolean getNR = optionExists("getNR");
if (!(getNM || getNR))
getNM = getNR = TRUE;
refSeqGet(argv[1],
optionVal("aligns", NULL),
optionVal("geneAnns", NULL),
optionVal("rnaSeqs", NULL),
optionVal("protSeqs", NULL),
optionVal("metaData", NULL),
getNM, getNR,
optionVal("accList", NULL));
return 0;
}