428a47dfc975f02f3c3a97b3273d7b95f9789ff4
angie
  Tue Oct 1 13:12:08 2019 -0700
New util dbSnpJsonToTab converts JSON to bigDbSnp and dbSnpDetails files; checkBigDbSnp flags clustering anomalies.  refs #23283

diff --git src/hg/snp/checkBigDbSnp/checkBigDbSnp.c src/hg/snp/checkBigDbSnp/checkBigDbSnp.c
new file mode 100644
index 0000000..c926e60
--- /dev/null
+++ src/hg/snp/checkBigDbSnp/checkBigDbSnp.c
@@ -0,0 +1,191 @@
+/* checkBigDbSnp - Flag overlapping items and look for clustering errors. */
+
+/* Copyright (C) 2019 The Regents of the University of California */
+
+#include "common.h"
+#include "bigDbSnp.h"
+#include "dnautil.h"
+#include "indelShift.h"
+#include "linefile.h"
+#include "hash.h"
+#include "options.h"
+#include "twoBit.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "checkBigDbSnp - Add ucscNotes for overlapping items and look for clustering errors\n"
+  "usage:\n"
+  "   checkBigDbSnp hgNN.dbSnpMMM.bigDbSnp hgNN.2bit hgNN.dbSnpMMM.checked.bigDbSnp\n"
+//  "\n"
+//  "options:\n"
+//  "   -xxx=XXX\n"
+  );
+}
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+   {NULL, 0},
+};
+
+//#*** Not sure if we need to cache chrom sequence anymore...
+static void cacheChromSeq(struct seqWindow **pSeqWin, char *chrom, char *twoBitFile)
+/* Set seqWin to cover chrom, if it doesn't already. */
+{
+if (*pSeqWin == NULL || differentString(chrom, (*pSeqWin)->seqName))
+    {
+    verbose(2, "Caching %s...", chrom);
+    if (verboseLevel() >= 2)
+        fflush(stderr);
+    twoBitSeqWindowFree(pSeqWin);
+    *pSeqWin = twoBitSeqWindowNew(twoBitFile, chrom, 0, 0);
+    verbose(2, "(%d bases)\n", (*pSeqWin)->end - (*pSeqWin)->start);
+    }
+}
+
+static boolean bigDbSnpOverlap(struct bigDbSnp *a, struct bigDbSnp *b)
+/* Return TRUE if a and b overlap; special treatment for 0-length insertions. */
+{
+if (differentString(a->chrom, b->chrom))
+    return FALSE;
+boolean aIsIns = (a->chromStart == a->chromEnd);
+boolean bIsIns = (b->chromStart == b->chromEnd);
+if (aIsIns && bIsIns)
+    return (a->chromStart == b->chromStart);
+int aStart = a->chromStart, aEnd = a->chromEnd;
+int bStart = b->chromStart, bEnd = b->chromEnd;
+if (aIsIns)
+    {
+    aStart--;
+    aEnd++;
+    }
+if (bIsIns)
+    {
+    bStart--;
+    bEnd++;
+    }
+return (aStart < bEnd && bStart < aEnd);
+}
+
+static void queueAdd(struct bigDbSnp **pQ, struct bigDbSnp *item)
+/* Add item to FIFO queue of overlapping items */
+{
+// slAddTail is not the fastest, but I expect the Q to be empty most of the time, and to very
+// rarely have more than 3 items.
+slAddTail(pQ, item);
+}
+
+static void queueFlush(struct bigDbSnp **pQ, struct bigDbSnp *newItem, FILE *outF)
+/* If any items at the head of Q do not overlap newItem then pop them from the queue,
+ * print them out and free them.  If newItem is NULL then flush everything. */
+{
+struct bigDbSnp *bds, *nextBds;
+for (bds = *pQ;  bds != NULL;  bds = nextBds)
+    {
+    nextBds = bds->next;
+    if (newItem && bigDbSnpOverlap(bds, newItem))
+        break;
+    else
+        {
+        struct bigDbSnp *popped = slPopHead(pQ);
+        if (popped != bds)
+            {
+            if (popped == NULL)
+                errAbort("queueFlush: Expected to pop %s %s:%d:%d but popped NULL",
+                         bds->name, bds->chrom, bds->chromStart, bds->chromEnd);
+            else
+                errAbort("queueFlush: Expected to pop %s %s:%d:%d but popped %s %s:%d:%d",
+                         bds->name, bds->chrom, bds->chromStart, bds->chromEnd,
+                         popped->name, popped->chrom, popped->chromStart, popped->chromEnd);
+            }
+        bigDbSnpTabOut(bds, outF);
+        bigDbSnpFree(&bds);
+        }
+    }
+}
+
+static void appendNoteIfNecessary(struct bigDbSnp *bds, char *note, struct dyString *dy)
+/* If bds->ucscNotes does not already include note, then add it using dy as scratch space. */
+{
+if (!stringIn(note, bds->ucscNotes))
+    {
+    dyStringClear(dy);
+    dyStringAppend(dy, bds->ucscNotes);
+    dyStringAppend(dy, note);
+    dyStringAppendC(dy, ',');
+    freeMem(bds->ucscNotes);
+    bds->ucscNotes = cloneString(dy->string);
+    }
+}
+
+static void checkOverlap(struct bigDbSnp *bdsList, struct dyString *dy)
+/* Add ucscNotes flags to overlapping items in bdsList. */
+{
+// Compare previously queued items vs. only the new item, because they have already had a turn
+// as the new item and have been compared to other queued overlapping items.
+struct bigDbSnp *newItem = slLastEl(bdsList);
+struct bigDbSnp *bds;
+for (bds = bdsList;  bds != newItem;  bds = bds->next)
+    {
+    if (!bigDbSnpOverlap(bds, newItem))
+        // Sometimes a large item holds up the queue a bit; skip over intervening non-overlapping
+        // items.
+        continue;
+    if (bds->class != newItem->class)
+        {
+        appendNoteIfNecessary(bds, bdsOverlapDiffClass, dy);
+        appendNoteIfNecessary(newItem, bdsOverlapDiffClass, dy);
+        }
+    else if (bds->chromStart == newItem->chromStart && bds->chromEnd == newItem->chromEnd)
+        {
+        appendNoteIfNecessary(bds, bdsClusterError, dy);
+        appendNoteIfNecessary(newItem, bdsClusterError, dy);
+        }
+    else
+        {
+        appendNoteIfNecessary(bds, bdsOverlapSameClass, dy);
+        appendNoteIfNecessary(newItem, bdsOverlapSameClass, dy);
+        }
+    }
+}
+
+void checkBigDbSnp(char *bigDbSnpFile, char *twoBitFile, char *outFile)
+/* checkBigDbSnp - Flag overlapping items and look for clustering errors. */
+{
+struct lineFile *lf = lineFileOpen(bigDbSnpFile, TRUE);
+FILE *outF = mustOpen(outFile, "w");
+struct seqWindow *seqWin = NULL;
+struct bigDbSnp *overlapQueue = NULL;
+struct dyString *dy = dyStringNew(0);
+char *line;
+while (lineFileNextReal(lf, &line))
+    {
+    char *row[BIGDBSNP_NUM_COLS];
+    int wordCount = chopTabs(line, row);
+    lineFileExpectWords(lf, BIGDBSNP_NUM_COLS, wordCount);
+    struct bigDbSnp *bds = bigDbSnpLoad(row);
+
+    //#*** Not sure if we need to cache chrom sequence anymore...
+    cacheChromSeq(&seqWin, bds->chrom, twoBitFile);
+
+    // check for overlapping items
+    queueFlush(&overlapQueue, bds, outF);
+    queueAdd(&overlapQueue, bds);
+    checkOverlap(overlapQueue, dy);
+    }
+queueFlush(&overlapQueue, NULL, outF);
+lineFileClose(&lf);
+twoBitSeqWindowFree(&seqWin);
+carefulClose(&outF);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 4)
+    usage();
+checkBigDbSnp(argv[1], argv[2], argv[3]);
+return 0;
+}