src/hg/utils/refSeqGet/refSeqVerInfo.c 1.1

1.1 2009/11/23 02:56:20 markd
added program to get consistent versions of refseq data from database
Index: src/hg/utils/refSeqGet/refSeqVerInfo.c
===================================================================
RCS file: src/hg/utils/refSeqGet/refSeqVerInfo.c
diff -N src/hg/utils/refSeqGet/refSeqVerInfo.c
--- /dev/null	1 Jan 1970 00:00:00 -0000
+++ src/hg/utils/refSeqGet/refSeqVerInfo.c	23 Nov 2009 02:56:20 -0000	1.1
@@ -0,0 +1,152 @@
+/* object to track refseqs that are to be retrieved */
+#include "common.h"
+#include "refSeqVerInfo.h"
+#include "linefile.h"
+#include "hash.h"
+#include "jksql.h"
+#include "sqlNum.h"
+
+enum refSeqVerInfoStatus
+/* validation status returned by the helpers that check and add one accession to the table */
+{
+    refSeqVerInfoOk,  // entry passed the check
+    refSeqVerInfoIgnore,  // duplicate, ignore
+    refSeqVerInfoError  // check failed; refSeqVerInfoFromFile counts these and aborts if any occurred
+};
+
+static struct refSeqVerInfo *refSeqVerInfoNewDb(char **row)
+/* allocate a refSeqVerInfo from a query row: row[0] is the accession, row[1] its version */
+{
+struct refSeqVerInfo *info;
+AllocVar(info);
+info->ver = sqlSigned(row[1]);
+info->acc = cloneString(row[0]);
+return info;
+}
+
+struct hash *refSeqVerInfoFromDb(struct sqlConnection *conn, boolean getNM, boolean getNR)
+/* load refSeqVerInfo table for all native refseqs in the database.  Only NM or only NR accessions
+ * are selected when exactly one of getNM/getNR is set; otherwise no accession filter is applied. */
+{
+struct hash *refSeqVerInfoTbl = hashNew(18);
+char *accRestrict = "";  // extra WHERE clause; must not be named `restrict', a keyword since C99
+if (getNM && !getNR)
+    accRestrict = " AND (acc LIKE \"NM%\")";
+else if (!getNM && getNR)
+    accRestrict = " AND (acc LIKE \"NR%\")";
+char query[128];
+safef(query, sizeof(query),
+      "SELECT acc, version FROM gbStatus WHERE (srcDb = \"RefSeq\") AND (orgCat = \"native\")%s", accRestrict);
+struct sqlResult *sr = sqlGetResult(conn, query);
+char **row;
+while ((row = sqlNextRow(sr)) != NULL)
+    {
+    struct refSeqVerInfo *rsvi = refSeqVerInfoNewDb(row);
+    hashAdd(refSeqVerInfoTbl, rsvi->acc, rsvi);  // table is keyed by accession
+    }
+sqlFreeResult(&sr);
+return refSeqVerInfoTbl;
+}
+
+static struct refSeqVerInfo *refSeqVerInfoNewFile(char *acc)
+/* allocate a refSeqVerInfo from an accList entry, splitting off an optional ".version" suffix */
+{
+char *verSep = strchr(acc, '.');
+struct refSeqVerInfo *info;
+AllocVar(info);
+info->acc = cloneStringZ(acc, ((verSep == NULL) ? strlen(acc) : (verSep - acc)));
+if (verSep != NULL)
+    info->requestVer = sqlUnsigned(verSep+1);
+return info;
+}
+
+static void prAccReqVer(struct refSeqVerInfo *rsvi, FILE *fh)
+/* write rsvi's accession to fh, followed by ".<requestVer>", or just the accession if no version */
+{
+fputs(rsvi->acc, fh);
+if (rsvi->requestVer != 0)  // zero is treated as "no version requested"
+    fprintf(fh, ".%d", rsvi->requestVer);
+}
+
+static enum refSeqVerInfoStatus dupCheck(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* check if rsvi's accession is already in the table.  Returns refSeqVerInfoOk if it is
+ * not there, refSeqVerInfoIgnore if it is there with the same requested version, and
+ * refSeqVerInfoError, after reporting both specifications to stderr, if the requested
+ * versions differ.  conn is not used here. */
+{
+struct refSeqVerInfo *rsvi0 = hashFindVal(refSeqVerInfoTbl, rsvi->acc);
+// rsvi0 is the entry added earlier for this accession, if any
+if (rsvi0 == NULL)
+    return refSeqVerInfoOk;
+if (rsvi0->requestVer == rsvi->requestVer)
+    return refSeqVerInfoIgnore;  // already there with the same version, ignore
+// same accession requested with two different versions
+fprintf(stderr, "Error: RefSeq %s specified multiple times in accList, once as ", rsvi->acc);
+prAccReqVer(rsvi0, stderr);
+fputs(" and once as ", stderr);
+prAccReqVer(rsvi, stderr);  // the new, conflicting entry (not rsvi0 a second time)
+fputc('\n', stderr);
+return refSeqVerInfoError;
+}
+
+int refSeqVerInfoGetVersion(char *acc, struct sqlConnection *conn)
+/* look up the version of acc in the gbSeq table; zero is returned if accession is not found */
+{
+char sql[128];
+safef(sql, sizeof(sql), "SELECT version FROM gbSeq WHERE (acc = \"%s\")", acc);
+int version = sqlQuickNum(conn, sql);
+return version;
+}
+
+static enum refSeqVerInfoStatus versionGetCheck(struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* look up the database version of rsvi's accession and, when a version was requested in the
+ * file, require that it matches; on success the database version is stored in rsvi->ver */
+{
+enum refSeqVerInfoStatus status = refSeqVerInfoError;
+int dbVer = refSeqVerInfoGetVersion(rsvi->acc, conn);
+if (dbVer == 0)
+    fprintf(stderr, "Error: RefSeq %s not in database\n", rsvi->acc);
+else if ((rsvi->requestVer != 0) && (rsvi->requestVer != dbVer))
+    fprintf(stderr, "Error: RefSeq %s.%d requested in accList, database contains %s.%d \n", rsvi->acc, rsvi->requestVer, rsvi->acc, dbVer);
+else
+    {
+    rsvi->ver = dbVer;
+    status = refSeqVerInfoOk;
+    }
+return status;
+}
+
+static enum refSeqVerInfoStatus fromFileAdd(struct hash *refSeqVerInfoTbl, struct refSeqVerInfo *rsvi, struct sqlConnection *conn)
+/* add a refseq parsed from a file to the table.  It is first checked for being a
+ * duplicate, then validated against the database, which also sets the actual
+ * version number.  The status of the first check that is not Ok is returned. */
+{
+enum refSeqVerInfoStatus stat = dupCheck(refSeqVerInfoTbl, rsvi, conn);
+if (stat == refSeqVerInfoOk)
+    stat = versionGetCheck(rsvi, conn);
+// only entries that passed both checks are stored
+if (stat == refSeqVerInfoOk)
+    hashAdd(refSeqVerInfoTbl, rsvi->acc, rsvi);
+return stat;
+}
+
+struct hash *refSeqVerInfoFromFile(struct sqlConnection *conn, char *accList)
+/* load refSeqVerInfo table for the refseqs listed one per line (acc or acc.version) in file accList,
+ * validating each against the database.  Aborts after reading the whole file if any entry failed. */
+{
+struct hash *refSeqVerInfoTbl = hashNew(18);
+struct lineFile *lf = lineFileOpen(accList, TRUE);
+int errCnt = 0;
+char *line;  // set by lineFileNextReal through &line; an uninitialized char** must not be passed
+while (lineFileNextReal(lf, &line))
+    {
+    if (fromFileAdd(refSeqVerInfoTbl, refSeqVerInfoNewFile(trimSpaces(line)), conn) == refSeqVerInfoError)
+        errCnt++;  // keep going so that all problems in the file get reported
+    }
+lineFileClose(&lf);
+if (errCnt > 0)
+    errAbort("%d errors detected loading RefSeq accessioned from %s", errCnt, accList);
+return refSeqVerInfoTbl;
+}
+
+