src/hg/utils/refSeqGet/refSeqVerInfo.c 1.1
1.1 2009/11/23 02:56:20 markd
added program to get consistent versions of refseq data from database
Index: src/hg/utils/refSeqGet/refSeqVerInfo.c
===================================================================
RCS file: src/hg/utils/refSeqGet/refSeqVerInfo.c
diff -N src/hg/utils/refSeqGet/refSeqVerInfo.c
--- /dev/null 1 Jan 1970 00:00:00 -0000
+++ src/hg/utils/refSeqGet/refSeqVerInfo.c 23 Nov 2009 02:56:20 -0000 1.1
@@ -0,0 +1,152 @@
+/* object to track refseqs that are to be retrieved */
+#include "common.h"
+#include "refSeqVerInfo.h"
+#include "linefile.h"
+#include "hash.h"
+#include "jksql.h"
+#include "sqlNum.h"
+
+enum refSeqVerInfoStatus
+/* result of checking one refSeqVerInfo entry read from the accession list */
+{
+ refSeqVerInfoOk,      // entry passed the check
+ refSeqVerInfoIgnore,  // duplicate with the same requested version, ignore
+ refSeqVerInfoError    // entry is invalid, a message was written to stderr
+};
+
+static struct refSeqVerInfo *refSeqVerInfoNewDb(char **row)
+/* build a refSeqVerInfo from a result row holding the accession and its version */
+{
+struct refSeqVerInfo *info;
+AllocVar(info);
+info->ver = sqlSigned(row[1]);
+info->acc = cloneString(row[0]);
+return info;
+}
+
+struct hash *refSeqVerInfoFromDb(struct sqlConnection *conn, boolean getNM, boolean getNR)
+/* Load a hash, keyed by accession, of refSeqVerInfo for the native RefSeqs in gbStatus.
+ * Only NM (or only NR) accessions are loaded if just that flag is set, otherwise all. */
+{
+struct hash *refSeqVerInfoTbl = hashNew(18);
+char *accFilter = "";  // deliberately not named `restrict': that is a keyword since C99
+if (getNM && !getNR)
+    accFilter = " AND (acc LIKE \"NM%\")";
+else if (!getNM && getNR)
+    accFilter = " AND (acc LIKE \"NR%\")";
+char query[128];
+safef(query, sizeof(query),
+      "SELECT acc, version FROM gbStatus WHERE (srcDb = \"RefSeq\") AND (orgCat = \"native\")%s", accFilter);
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    struct refSeqVerInfo *rsvi = refSeqVerInfoNewDb(row);
+    hashAdd(refSeqVerInfoTbl, rsvi->acc, rsvi);
+    }
+sqlFreeResult(&sr);
+return refSeqVerInfoTbl;
+}
+
+static struct refSeqVerInfo *refSeqVerInfoNewFile(char *acc)
+/* build a refSeqVerInfo from an accession list entry, given as `acc' or `acc.version' */
+{
+char *verSep = strchr(acc, '.');
+struct refSeqVerInfo *info;
+AllocVar(info);
+info->acc = cloneStringZ(acc, ((verSep == NULL) ? strlen(acc) : (verSep - acc)));
+if (verSep != NULL)
+    info->requestVer = sqlUnsigned(verSep + 1);
+return info;
+}
+
+static void prAccReqVer(struct refSeqVerInfo *rsvi, FILE *fh)
+/* write the accession to fh, followed by `.version' if a version was requested */
+{
+fputs(rsvi->acc, fh);
+if (rsvi->requestVer == 0) return;  // no version requested, the accession alone is printed
+fprintf(fh, ".%d", rsvi->requestVer);
+}
+
+static enum refSeqVerInfoStatus dupCheck(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* Check if the accession of rsvi is already in refSeqVerInfoTbl.  Returns
+ * refSeqVerInfoOk if it is not, refSeqVerInfoIgnore if it is there with the
+ * same requested version, and refSeqVerInfoError, after reporting both of
+ * the conflicting entries on stderr, if the requested versions differ.
+ * conn is not used here. */
+{
+struct refSeqVerInfo *rsvi0 = hashFindVal(refSeqVerInfoTbl, rsvi->acc);
+if (rsvi0 == NULL)
+    return refSeqVerInfoOk;
+if (rsvi0->requestVer == rsvi->requestVer)
+    return refSeqVerInfoIgnore; // already there with the same version, ignore
+
+fprintf(stderr, "Error: RefSeq %s specified multiple times in accList, once as ", rsvi->acc);
+prAccReqVer(rsvi0, stderr);
+fputs(" and once as ", stderr);
+prAccReqVer(rsvi, stderr);  // the new entry; printing rsvi0 again would hide the conflict
+fputc('\n', stderr);
+return refSeqVerInfoError;
+}
+
+int refSeqVerInfoGetVersion(char *acc, struct sqlConnection *conn)
+/* look up acc in gbSeq and return its version, or zero if the accession is not found */
+{
+char sql[128];
+safef(sql, sizeof(sql), "SELECT version FROM gbSeq WHERE (acc = \"%s\")", acc);
+int version = sqlQuickNum(conn, sql);
+return version;
+}
+
+static enum refSeqVerInfoStatus versionGetCheck(struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* Look up the database version of rsvi->acc.  It is an error if the accession is
+ * not in the database, or if a version was requested and it differs from the one
+ * in the database.  Otherwise rsvi->ver is set to the database version. */
+{
+int curVer = refSeqVerInfoGetVersion(rsvi->acc, conn);
+if (curVer == 0)
+    fprintf(stderr, "Error: RefSeq %s not in database\n", rsvi->acc);
+else if ((rsvi->requestVer != 0) && (rsvi->requestVer != curVer))
+    fprintf(stderr, "Error: RefSeq %s.%d requested in accList, database contains %s.%d \n", rsvi->acc, rsvi->requestVer, rsvi->acc, curVer);
+else
+    {
+    rsvi->ver = curVer;
+    return refSeqVerInfoOk;
+    }
+return refSeqVerInfoError;
+}
+
+static enum refSeqVerInfoStatus fromFileAdd(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* Add a refseq parsed from a file to the table once it passes the duplicate
+ * check and the validation against the database, which also sets the actual
+ * version number.  Otherwise the status of the check that failed is returned. */
+{
+enum refSeqVerInfoStatus result = dupCheck(refSeqVerInfoTbl, rsvi, conn);
+// each later step only runs if everything before it came back ok
+if (result == refSeqVerInfoOk)
+    result = versionGetCheck(rsvi, conn);
+if (result == refSeqVerInfoOk)
+    hashAdd(refSeqVerInfoTbl, rsvi->acc, rsvi);
+return result;
+}
+
+struct hash *refSeqVerInfoFromFile(struct sqlConnection *conn, char *accList)
+/* Load a refSeqVerInfo hash, keyed by accession, from the lines of file accList (each taken as
+ * `acc' or `acc.version'), validating them against the database.  Aborts if any entry is bad. */
+{
+struct hash *refSeqVerInfoTbl = hashNew(18);
+struct lineFile *lf = lineFileOpen(accList, TRUE);
+int errCnt = 0;
+char *line;  // lineFileNextReal() stores the start of the line through its char** argument
+while (lineFileNextReal(lf, &line))
+    {
+    if (fromFileAdd(refSeqVerInfoTbl, refSeqVerInfoNewFile(trimSpaces(line)), conn) == refSeqVerInfoError)
+        errCnt++;
+    }
+lineFileClose(&lf);
+if (errCnt > 0)
+    errAbort("%d errors detected loading RefSeq accessioned from %s", errCnt, accList);
+return refSeqVerInfoTbl;
+}
+
+