src/hg/utils/refSeqGet/refSeqGet.c 1.2

1.2 2009/11/23 17:45:19 markd
added CDS to meta data
Index: src/hg/utils/refSeqGet/refSeqGet.c
===================================================================
RCS file: /projects/compbio/cvsroot/kent/src/hg/utils/refSeqGet/refSeqGet.c,v
retrieving revision 1.1
retrieving revision 1.2
diff -b -B -U 1000000 -r1.1 -r1.2
--- src/hg/utils/refSeqGet/refSeqGet.c	23 Nov 2009 02:56:20 -0000	1.1
+++ src/hg/utils/refSeqGet/refSeqGet.c	23 Nov 2009 17:45:19 -0000	1.2
@@ -1,274 +1,286 @@
 /* refSeqGet - retrieve refseq data from the database. */
 #include "common.h"
 #include "refSeqVerInfo.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "jksql.h"
 #include "hdb.h"
 #include "psl.h"
 #include "genePred.h"
 #include "genePredReader.h"
 #include "genbank.h"
 #include "fa.h"
 
 static char const rcsid[] = "$Id$";
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "refSeqGet - retrieve refseq data from the database\n"
   "usage:\n"
   "   refSeqGet db\n"
   "\n"
   "Get a consistent snapshot of the RefSeq data from a genome browser database.\n"
   "All accessions will include version numbers.  Only ReqSeqs for the particular\n"
   "are retrieved.\n"
   "\n"
   "options:\n"
   "   -aligns=pslFile - get the PSL alignments of the RefSeqs to the genome.\n"
   "   -geneAnns=genePred - get the genome annotations in genePred format.\n"
   "   -rnaSeqs=mrnaFa - get a FASTA file of the mRNA/RNA sequences with the CDS capitalized.\n"
   "   -protSeqs=protFa - get a FASTA file of the protein sequences.\n"
   "   -metaData=metaTsv - get a tab-separated file of the meta-data for the RefSeqs\n"
   "   -getNM - get NM sequences, if neither -getNM or -getNR is specified, all are return\n"
   "   -getNR - get the NR sequences\n"
   "   -accList=accfile - only get data for this set of accessions.  If version is included,\n"
   "    then only the specified version is retrieved, otherwise the version loaded is\n"
   "    retrieved.  This is incompatible with -getNR or -inclNM\n");
 }
 
 /* Notes:
  * This isn't very fast, it was done for simplicity more than speed.
  */
 
 static struct optionSpec options[] = {
     {"aligns", OPTION_STRING},
     {"geneAnns", OPTION_STRING},
     {"rnaSeqs", OPTION_STRING},
     {"protSeqs", OPTION_STRING},
     {"metaData", OPTION_STRING},
     {"getNM", OPTION_BOOLEAN},
     {"getNR", OPTION_BOOLEAN},
     {"accList", OPTION_STRING},
     {NULL, 0},
 };
 
 static char *addVer(char *acc, int ver, char *buf, int bufSize)
 /* construct acc.ver in buffer */
 {
 safef(buf, bufSize, "%s.%d", acc, ver);
 return buf;
 }
 
 static void processPsl(FILE *fh, struct hash *refSeqVerInfoTbl, struct psl *psl)
 /* check if a psl has been select, if so, write including version in qName */
 {
 struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, psl->qName);
 if (rsvi != NULL)
     {
     char buf[GENBANK_ACC_BUFSZ], *hold = psl->qName;
     psl->qName = addVer(psl->qName, rsvi->ver, buf, sizeof(buf));
     pslTabOut(psl, fh);
     psl->qName = hold;
     }
 }
 
 static void getAligns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request alignments from database */
 {
 int off = hOffsetPastBin(sqlGetDatabase(conn), NULL, "refSeqAli");
 struct sqlResult *sr = sqlGetResult(conn, "SELECT * FROM refSeqAli");
 FILE *fh = mustOpen(outFile, "w");
 char **row;
 while ((row = sqlNextRow(sr)) != NULL)
     {
     struct psl *psl = pslLoad(row+off);
     processPsl(fh, refSeqVerInfoTbl, psl);
     pslFree(&psl);
     }
 carefulClose(&fh);
 sqlFreeResult(&sr);
 }
 
 static void processGenePred(FILE *fh, struct hash *refSeqVerInfoTbl, struct genePred *gp)
 /* check if a genePred has been select, if so, write including version in name */
 {
 struct refSeqVerInfo *rsvi = hashFindVal(refSeqVerInfoTbl, gp->name);
 if (rsvi != NULL)
     {
     char buf[GENBANK_ACC_BUFSZ], *hold = gp->name;
     gp->name = addVer(gp->name, rsvi->ver, buf, sizeof(buf));
     genePredTabOut(gp, fh);
     gp->name = hold;
     }
 }
 
 static void getGeneAnns(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request genePred annotations from database */
 {
 struct genePredReader *gpr = genePredReaderQuery(conn, "refGene", NULL);
 FILE *fh = mustOpen(outFile, "w");
 struct genePred *gp;
 while ((gp = genePredReaderNext(gpr)) != NULL)
     {
     processGenePred(fh, refSeqVerInfoTbl, gp);
     genePredFree(&gp);
     }
 carefulClose(&fh);
 genePredReaderFree(&gpr);
 }
 
 static void processRnaSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi)
 /* get an RNA sequence, which already includes version in name */
 {
 struct dnaSeq *seq = hGenBankGetMrnaC(conn, rsvi->acc, NULL);
 if (seq == NULL)
     errAbort("failed to get %s from database", rsvi->acc);
 faWriteNext(fh, seq->name, seq->dna, seq->size);
 dnaSeqFree(&seq);
 }
 
 static void getRnaSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request RNA sequences from database */
 {
 FILE *fh = mustOpen(outFile, "w");
 struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     processRnaSeq(fh, conn, hel->val);
     }
 carefulClose(&fh);
 }
 
 static void processProtSeq(FILE *fh, struct sqlConnection *conn, struct refSeqVerInfo *rsvi, struct hash *doneProts)
 /* get an protein sequence, which already includes version in name. Don't duplicate NPs */
 {
 char query[128];
 safef(query, sizeof(query), "SELECT protAcc FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
 char *protAcc = sqlNeedQuickString(conn, query);
 if (isNotEmpty(protAcc) && hashLookup(doneProts, protAcc) == NULL)
     {
     struct dnaSeq *seq = hGenBankGetPepC(conn, protAcc, NULL);
     if (seq == NULL)
         errAbort("failed to get %s from database", protAcc);
     faWriteNext(fh, seq->name, seq->dna, seq->size);
     dnaSeqFree(&seq);
     hashAdd(doneProts, protAcc, NULL);
     }
 freeMem(protAcc);
 }
 
 static void getProtSeqs(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request prot sequences from database */
 {
 struct hash *doneProts = hashNew(16);
 FILE *fh = mustOpen(outFile, "w");
 struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     processProtSeq(fh, conn, hel->val, doneProts);
     }
 carefulClose(&fh);
 hashFree(&doneProts);
 }
 
 static char *getProtAccVerIf(struct sqlConnection *conn, char *mrnaAcc, char *protAcc, char *buf, int bufSize)
 /* if protAcc is not empty, get acc.version, otherwise return empty */
 {
 if (isEmpty(protAcc))
     return "";
 else
     {
     int protVer = refSeqVerInfoGetVersion(protAcc, conn);
     if (protVer == 0)
         errAbort("no protein version entry for %s associated with %s", protAcc, mrnaAcc);
     safef(buf, bufSize, "%s.%d", protAcc, protVer);
     return buf;
     }
 }
 
+static char *getCds(struct sqlConnection *conn, char *acc)
+/* get CDS for an NM, results should be freed */
+{
+char query[128];
+safef(query, sizeof(query), "SELECT cds.name FROM gbCdnaInfo,cds WHERE (gbCdnaInfo.acc = \"%s\") AND (gbCdnaInfo.cds = cds.id)", acc);
+return sqlNeedQuickString(conn, query);
+}
+
 static void processMetaData(FILE *fh, struct sqlConnection *conn, struct sqlConnection *conn2, struct refSeqVerInfo *rsvi)
 /* get meta data for an accession */
 {
+boolean isCoding = genbankIsRefSeqCodingMRnaAcc(rsvi->acc);
 char query[128];
 safef(query, sizeof(query), "SELECT name,product,protAcc,locusLinkId FROM refLink WHERE mrnaAcc = \"%s\"", rsvi->acc);
 struct sqlResult *sr = sqlGetResult(conn, query);
 char **row = sqlNextRow(sr);
 if (row == NULL)
     errAbort("no RefLink entry for %s", rsvi->acc);
 char buf[32];
 char *protAccVer = getProtAccVerIf(conn2, rsvi->acc, row[2], buf, sizeof(buf));
-fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], row[1]);
+char *cds = isCoding ? getCds(conn2, rsvi->acc) : "";
+fprintf(fh, "%s.%d\t%s\t%s\t%s\t%s\t%s\n", rsvi->acc, rsvi->ver, protAccVer, row[0], row[3], cds, row[1]);
 sqlFreeResult(&sr);
+if (isCoding)
+    freeMem(cds);
 }
 
 static void getMetaData(struct sqlConnection *conn, struct hash *refSeqVerInfoTbl, char *outFile)
 /* get request prot sequences from database */
 {
 struct sqlConnection *conn2 = sqlConnect(sqlGetDatabase(conn));
-static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "product\n";
+static char *hdr = "#mrnaAcc\t" "protAcc\t" "geneName\t" "ncbiGeneId\t" "cds\t" "product\n";
 FILE *fh = mustOpen(outFile, "w");
 fputs(hdr, fh);
 struct hashCookie cookie = hashFirst(refSeqVerInfoTbl);
 struct hashEl *hel;
 while ((hel = hashNext(&cookie)) != NULL)
     {
     processMetaData(fh, conn, conn2, hel->val);
     }
 carefulClose(&fh);
 sqlDisconnect(&conn2);
 }
 
 static void refSeqGet(char *db, char *aligns, char *geneAnns, char *rnaSeqs, char *protSeqs,
                       char *metaData, boolean getNM, boolean getNR, char *accList)
 /* refSeqGet - retrieve refseq data from the database. */
 {
 struct sqlConnection *conn = sqlConnect(db);
 struct hash *refSeqVerInfoTbl = (accList == NULL)
     ? refSeqVerInfoFromDb(conn, getNM, getNR)
     : refSeqVerInfoFromFile(conn, accList);
 if (aligns != NULL)
     getAligns(conn, refSeqVerInfoTbl, aligns);
 if (geneAnns != NULL)
     getGeneAnns(conn, refSeqVerInfoTbl, geneAnns);
 if (rnaSeqs != NULL)
     getRnaSeqs(conn, refSeqVerInfoTbl, rnaSeqs);
 if (protSeqs != NULL)
     getProtSeqs(conn, refSeqVerInfoTbl, protSeqs);
 if (metaData != NULL)
     getMetaData(conn, refSeqVerInfoTbl, metaData);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 2)
     usage();
 
 if (optionExists("accList"))
     {
     if (optionExists("getNM"))
         errAbort("can't specify both -accList and -getNM");
     if (optionExists("getNR"))
         errAbort("can't specify both -accList and -getNR");
     }
 boolean getNM = optionExists("getNM");
 boolean getNR = optionExists("getNR");
 if (!(getNM || getNR))
     getNM = getNR = TRUE;
 
 refSeqGet(argv[1],
           optionVal("aligns", NULL),
           optionVal("geneAnns", NULL),
           optionVal("rnaSeqs", NULL),
           optionVal("protSeqs", NULL),
           optionVal("metaData", NULL),
           getNM, getNR,
           optionVal("accList", NULL));
 return 0;
 }