src/hg/snp/checkBigDbSnp/checkBigDbSnp.c 54452ec022a6073410955c04e110a1784f71fb57

54452ec022a6073410955c04e110a1784f71fb57
angie
  Wed Nov 13 17:37:34 2019 -0800
dbSnp153: add new ucscNote otherMapErr for mappings with the same rs# as a mapping w/inconsistent SPDI in BadCoords/Map Err subtrack.  refs #23283

diff --git src/hg/snp/checkBigDbSnp/checkBigDbSnp.c src/hg/snp/checkBigDbSnp/checkBigDbSnp.c
index c926e60..5e5b6c0 100644
--- src/hg/snp/checkBigDbSnp/checkBigDbSnp.c
+++ src/hg/snp/checkBigDbSnp/checkBigDbSnp.c
@@ -3,85 +3,84 @@
 /* Copyright (C) 2019 The Regents of the University of California */
 
 #include "common.h"
 #include "bigDbSnp.h"
 #include "dnautil.h"
 #include "indelShift.h"
 #include "linefile.h"
 #include "hash.h"
 #include "options.h"
 #include "twoBit.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
-  "checkBigDbSnp - Add ucscNotes for overlapping items and look for clustering errors\n"
+  "checkBigDbSnp - Add ucscNotes for overlapping items, clustering errors, mapping errors\n"
   "usage:\n"
   "   checkBigDbSnp hgNN.dbSnpMMM.bigDbSnp hgNN.2bit hgNN.dbSnpMMM.checked.bigDbSnp\n"
-//  "\n"
-//  "options:\n"
-//  "   -xxx=XXX\n"
+  "\n"
+  "options:\n"
+  "   -mapErrIds=file.txt    file.txt contains one rs# ID per line that had a mapping error.\n"
+  "                          Add note otherSeqMapErr if current rs is found in file.\n"
+  "\n"
   );
 }
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
+    {"mapErrIds", OPTION_STRING},
     {NULL, 0},
 };
 
-//#*** Not sure if we need to cache chrom sequence anymore...
-static void cacheChromSeq(struct seqWindow **pSeqWin, char *chrom, char *twoBitFile)
-/* Set seqWin to cover chrom, if it doesn't already. */
-{
-if (*pSeqWin == NULL || differentString(chrom, (*pSeqWin)->seqName))
-    {
-    verbose(2, "Caching %s...", chrom);
-    if (verboseLevel() >= 2)
-        fflush(stderr);
-    twoBitSeqWindowFree(pSeqWin);
-    *pSeqWin = twoBitSeqWindowNew(twoBitFile, chrom, 0, 0);
-    verbose(2, "(%d bases)\n", (*pSeqWin)->end - (*pSeqWin)->start);
-    }
-}
-
 static boolean bigDbSnpOverlap(struct bigDbSnp *a, struct bigDbSnp *b)
 /* Return TRUE if a and b overlap; special treatment for 0-length insertions. */
 {
 if (differentString(a->chrom, b->chrom))
     return FALSE;
 boolean aIsIns = (a->chromStart == a->chromEnd);
 boolean bIsIns = (b->chromStart == b->chromEnd);
 if (aIsIns && bIsIns)
     return (a->chromStart == b->chromStart);
 int aStart = a->chromStart, aEnd = a->chromEnd;
 int bStart = b->chromStart, bEnd = b->chromEnd;
 if (aIsIns)
     {
     aStart--;
     aEnd++;
     }
 if (bIsIns)
     {
     bStart--;
     bEnd++;
     }
 return (aStart < bEnd && bStart < aEnd);
 }
 
+static void checkMapErrIds(struct bigDbSnp *bds, struct hash *mapErrIdHash)
+/* If rs# ID is found in mapErrIdHash, add ucscNote otherMapErr. */
+{
+if (hashIntValDefault(mapErrIdHash, bds->name, 0))
+    {
+    struct dyString *dy = dyStringCreate("%s%s,", bds->ucscNotes, bdsOtherMapErr);
+    freeMem(bds->ucscNotes);
+    bds->ucscNotes = dyStringCannibalize(&dy);
+    }
+}
+
 static void queueAdd(struct bigDbSnp **pQ, struct bigDbSnp *item)
-/* Add item to FIFO queue of overlapping items */
+/* Add item to FIFO queue of overlapping items. */
 {
 // slAddTail is not the fastest, but I expect the Q to be empty most of the time, and to very
 // rarely have more than 3 items.
 slAddTail(pQ, item);
 }
 
 static void queueFlush(struct bigDbSnp **pQ, struct bigDbSnp *newItem, FILE *outF)
 /* If any items at the head of Q do not overlap newItem then pop them from the queue,
  * print them out and free them.  If newItem is NULL then flush everything. */
 {
 struct bigDbSnp *bds, *nextBds;
 for (bds = *pQ;  bds != NULL;  bds = nextBds)
     {
     nextBds = bds->next;
     if (newItem && bigDbSnpOverlap(bds, newItem))
@@ -138,54 +137,80 @@
         appendNoteIfNecessary(newItem, bdsOverlapDiffClass, dy);
         }
     else if (bds->chromStart == newItem->chromStart && bds->chromEnd == newItem->chromEnd)
         {
         appendNoteIfNecessary(bds, bdsClusterError, dy);
         appendNoteIfNecessary(newItem, bdsClusterError, dy);
         }
     else
         {
         appendNoteIfNecessary(bds, bdsOverlapSameClass, dy);
         appendNoteIfNecessary(newItem, bdsOverlapSameClass, dy);
         }
     }
 }
 
+struct hash *maybeReadMapErrIds()
+/* If -mapErrIds was given, open the file and read rs# IDs into a hash. */
+{
+struct hash *mapErrIdHash = NULL;
+char *mapErrIds = optionVal("mapErrIds", NULL);
+if (mapErrIds)
+    {
+    struct lineFile *lf = lineFileOpen(mapErrIds, TRUE);
+    char *line = NULL;
+    mapErrIdHash = hashNew(18);
+    while (lineFileNextReal(lf, &line))
+        {
+        if (!startsWith("rs", line))
+            lineFileAbort(lf, "-mapErrIds: expected one rs# ID per line, got '%s'", line);
+        char *p;
+        for (p = line+2;  *p != 0;  p++)
+            {
+            if (! isdigit(*p))
+                lineFileAbort(lf, "-mapErrIds: expected one rs# ID per line, got '%s'", line);
+            }
+        hashAddInt(mapErrIdHash, line, 1);
+        }
+    lineFileClose(&lf);
+    }
+return mapErrIdHash;
+}
+
 void checkBigDbSnp(char *bigDbSnpFile, char *twoBitFile, char *outFile)
 /* checkBigDbSnp - Flag overlapping items and look for clustering errors. */
 {
 struct lineFile *lf = lineFileOpen(bigDbSnpFile, TRUE);
 FILE *outF = mustOpen(outFile, "w");
-struct seqWindow *seqWin = NULL;
+struct hash *mapErrIdHash = maybeReadMapErrIds();
 struct bigDbSnp *overlapQueue = NULL;
 struct dyString *dy = dyStringNew(0);
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     char *row[BIGDBSNP_NUM_COLS];
     int wordCount = chopTabs(line, row);
     lineFileExpectWords(lf, BIGDBSNP_NUM_COLS, wordCount);
     struct bigDbSnp *bds = bigDbSnpLoad(row);
 
-    //#*** Not sure if we need to cache chrom sequence anymore...
-    cacheChromSeq(&seqWin, bds->chrom, twoBitFile);
+    if (mapErrIdHash)
+        checkMapErrIds(bds, mapErrIdHash);
 
     // check for overlapping items
     queueFlush(&overlapQueue, bds, outF);
     queueAdd(&overlapQueue, bds);
     checkOverlap(overlapQueue, dy);
     }
 queueFlush(&overlapQueue, NULL, outF);
 lineFileClose(&lf);
-twoBitSeqWindowFree(&seqWin);
 carefulClose(&outF);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 4)
     usage();
 checkBigDbSnp(argv[1], argv[2], argv[3]);
 return 0;
 }