54452ec022a6073410955c04e110a1784f71fb57 angie Wed Nov 13 17:37:34 2019 -0800 dbSnp153: add new ucscNote otherMapErr for mappings with the same rs# as a mapping w/inconsistent SPDI in BadCoords/Map Err subtrack. refs #23283 diff --git src/hg/snp/checkBigDbSnp/checkBigDbSnp.c src/hg/snp/checkBigDbSnp/checkBigDbSnp.c index c926e60..5e5b6c0 100644 --- src/hg/snp/checkBigDbSnp/checkBigDbSnp.c +++ src/hg/snp/checkBigDbSnp/checkBigDbSnp.c @@ -3,85 +3,84 @@ /* Copyright (C) 2019 The Regents of the University of California */ #include "common.h" #include "bigDbSnp.h" #include "dnautil.h" #include "indelShift.h" #include "linefile.h" #include "hash.h" #include "options.h" #include "twoBit.h" void usage() /* Explain usage and exit. */ { errAbort( - "checkBigDbSnp - Add ucscNotes for overlapping items and look for clustering errors\n" + "checkBigDbSnp - Add ucscNotes for overlapping items, clustering errors, mapping errors\n" "usage:\n" " checkBigDbSnp hgNN.dbSnpMMM.bigDbSnp hgNN.2bit hgNN.dbSnpMMM.checked.bigDbSnp\n" -// "\n" -// "options:\n" -// " -xxx=XXX\n" + "\n" + "options:\n" + " -mapErrIds=file.txt file.txt contains one rs# ID per line that had a mapping error.\n" + " Add note otherSeqMapErr if current rs is found in file.\n" + "\n" ); } /* Command line validation table. */ static struct optionSpec options[] = { + {"mapErrIds", OPTION_STRING}, {NULL, 0}, }; -//#*** Not sure if we need to cache chrom sequence anymore... -static void cacheChromSeq(struct seqWindow **pSeqWin, char *chrom, char *twoBitFile) -/* Set seqWin to cover chrom, if it doesn't already. */ -{ -if (*pSeqWin == NULL || differentString(chrom, (*pSeqWin)->seqName)) - { - verbose(2, "Caching %s...", chrom); - if (verboseLevel() >= 2) - fflush(stderr); - twoBitSeqWindowFree(pSeqWin); - *pSeqWin = twoBitSeqWindowNew(twoBitFile, chrom, 0, 0); - verbose(2, "(%d bases)\n", (*pSeqWin)->end - (*pSeqWin)->start); - } -} - static boolean bigDbSnpOverlap(struct bigDbSnp *a, struct bigDbSnp *b) /* Return TRUE if a and b overlap; special treatment for 0-length insertions. */ { if (differentString(a->chrom, b->chrom)) return FALSE; boolean aIsIns = (a->chromStart == a->chromEnd); boolean bIsIns = (b->chromStart == b->chromEnd); if (aIsIns && bIsIns) return (a->chromStart == b->chromStart); int aStart = a->chromStart, aEnd = a->chromEnd; int bStart = b->chromStart, bEnd = b->chromEnd; if (aIsIns) { aStart--; aEnd++; } if (bIsIns) { bStart--; bEnd++; } return (aStart < bEnd && bStart < aEnd); } +static void checkMapErrIds(struct bigDbSnp *bds, struct hash *mapErrIdHash) +/* If rs# ID is found in mapErrIdHash, add ucscNote otherMapErr. */ +{ +if (hashIntValDefault(mapErrIdHash, bds->name, 0)) + { + struct dyString *dy = dyStringCreate("%s%s,", bds->ucscNotes, bdsOtherMapErr); + freeMem(bds->ucscNotes); + bds->ucscNotes = dyStringCannibalize(&dy); + } +} + static void queueAdd(struct bigDbSnp **pQ, struct bigDbSnp *item) -/* Add item to FIFO queue of overlapping items */ +/* Add item to FIFO queue of overlapping items. */ { // slAddTail is not the fastest, but I expect the Q to be empty most of the time, and to very // rarely have more than 3 items. slAddTail(pQ, item); } static void queueFlush(struct bigDbSnp **pQ, struct bigDbSnp *newItem, FILE *outF) /* If any items at the head of Q do not overlap newItem then pop them from the queue, * print them out and free them. If newItem is NULL then flush everything. */ { struct bigDbSnp *bds, *nextBds; for (bds = *pQ; bds != NULL; bds = nextBds) { nextBds = bds->next; if (newItem && bigDbSnpOverlap(bds, newItem)) @@ -138,54 +137,80 @@ appendNoteIfNecessary(newItem, bdsOverlapDiffClass, dy); } else if (bds->chromStart == newItem->chromStart && bds->chromEnd == newItem->chromEnd) { appendNoteIfNecessary(bds, bdsClusterError, dy); appendNoteIfNecessary(newItem, bdsClusterError, dy); } else { appendNoteIfNecessary(bds, bdsOverlapSameClass, dy); appendNoteIfNecessary(newItem, bdsOverlapSameClass, dy); } } } +struct hash *maybeReadMapErrIds() +/* If -mapErrIds was given, open the file and read rs# IDs into a hash. */ +{ +struct hash *mapErrIdHash = NULL; +char *mapErrIds = optionVal("mapErrIds", NULL); +if (mapErrIds) + { + struct lineFile *lf = lineFileOpen(mapErrIds, TRUE); + char *line = NULL; + mapErrIdHash = hashNew(18); + while (lineFileNextReal(lf, &line)) + { + if (!startsWith("rs", line)) + lineFileAbort(lf, "-mapErrIds: expected one rs# ID per line, got '%s'", line); + char *p; + for (p = line+2; *p != 0; p++) + { + if (! isdigit(*p)) + lineFileAbort(lf, "-mapErrIds: expected one rs# ID per line, got '%s'", line); + } + hashAddInt(mapErrIdHash, line, 1); + } + lineFileClose(&lf); + } +return mapErrIdHash; +} + void checkBigDbSnp(char *bigDbSnpFile, char *twoBitFile, char *outFile) /* checkBigDbSnp - Flag overlapping items and look for clustering errors. */ { struct lineFile *lf = lineFileOpen(bigDbSnpFile, TRUE); FILE *outF = mustOpen(outFile, "w"); -struct seqWindow *seqWin = NULL; +struct hash *mapErrIdHash = maybeReadMapErrIds(); struct bigDbSnp *overlapQueue = NULL; struct dyString *dy = dyStringNew(0); char *line; while (lineFileNextReal(lf, &line)) { char *row[BIGDBSNP_NUM_COLS]; int wordCount = chopTabs(line, row); lineFileExpectWords(lf, BIGDBSNP_NUM_COLS, wordCount); struct bigDbSnp *bds = bigDbSnpLoad(row); - //#*** Not sure if we need to cache chrom sequence anymore... - cacheChromSeq(&seqWin, bds->chrom, twoBitFile); + if (mapErrIdHash) + checkMapErrIds(bds, mapErrIdHash); // check for overlapping items queueFlush(&overlapQueue, bds, outF); queueAdd(&overlapQueue, bds); checkOverlap(overlapQueue, dy); } queueFlush(&overlapQueue, NULL, outF); lineFileClose(&lf); -twoBitSeqWindowFree(&seqWin); carefulClose(&outF); } int main(int argc, char *argv[]) /* Process command line. */ { optionInit(&argc, argv, options); if (argc != 4) usage(); checkBigDbSnp(argv[1], argv[2], argv[3]); return 0; }