src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c 428a47dfc975f02f3c3a97b3273d7b95f9789ff4

428a47dfc975f02f3c3a97b3273d7b95f9789ff4
angie
  Tue Oct 1 13:12:08 2019 -0700
New util dbSnpJsonToTab converts JSON to bigDbSnp and dbSnpDetails files; checkBigDbSnp flags clustering anomalies.  refs #23283

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
new file mode 100644
index 0000000..d8ea261
--- /dev/null
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -0,0 +1,1738 @@
+/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
+
+/* Copyright (C) 2019 The Regents of the University of California */
+
+#include "common.h"
+#include "basicBed.h"
+#include "bigDbSnp.h"
+#include "binRange.h"
+#include "dnaseq.h"
+#include "dnautil.h"
+#include "errCatch.h"
+#include "hash.h"
+#include "indelShift.h"
+#include "iupac.h"
+#include "jsonQuery.h"
+#include "linefile.h"
+#include "obscure.h"
+#include "options.h"
+#include "regexHelper.h"
+#include "soTerm.h"
+#include "twoBit.h"
+
+void usage()
+/* Explain usage and exit. */
+{
+errAbort(
+  "dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated track files\n"
+  "usage:\n"
+  "   dbSnpJsonToTab assemblyList refSeqToUcsc refsnp-chrN.json[.gz] outRoot\n"
+  "assemblyList is a comma-separated set of assembly_name values in JSON, e.g. 'GRCh37.p13,GRCh38.p12'\n"
+  "refSeqToUcsc is tab-sep, 4 columns: RefSeq acc (NC_...), assembly_name, twoBitFile, ucscName (chr...)\n"
+  "Track files starting with outRoot will be created:\n"
+  "  outRoot.<assembly>.bigDbSnp: BED4+ for main dbSNP track in each assembly in assemblyList\n"
+  "                               (final columns need to be added later by bedJoinTabOffset)\n"
+  "  outRootDetails.tab: tab-separated allele frequency counts and other details (for bedJoinTabOffset)\n"
+  "Additional files are written out for error reporting:\n"
+  "  outRootFailed.json: lines of JSON input that encountered an error, for debugging/reprocessing\n"
+  "  outRootMerged.tab: two columns, obsolete dbSNP rs# ID and current rs# ID\n"
+  "  outRootErrors.tab: lines of error info\n"
+  "\n"
+  "\n"
+  "options:\n"
+  "   -freqSourceOrder=LIST      LIST is an ordered, comma-sep list of project names (no spaces)\n"
+  "                              for frequency data submitted to dbSNP, e.g. '1000genomes,GnomAD'\n"
+  "                              If given, then the bigDbSnp minorAlleleFreq array uses the order.\n"
+  "                              Otherwise minorAlleleFreq contains only one frequency, from the\n"
+  "                              project with the highest sample count reported for that variant.\n"
+  "   -equivRegions=FILE         Each line of FILE contains space-separated acc:start:end regions\n"
+  "                              that are equivalent, for example PAR1 or PAR2 on X and Y,\n"
+  "                              or an alt sequence and the corresponding portion of its chrom.\n"
+  "                              These use RefSeq accesions not UCSC chr names, so that multiple\n"
+  "                              assemblies can be described in the same file.\n"
+  );
+}
+
+// Globals
+int freqSourceCount = 0;
+char **freqSourceOrder = NULL;
+struct hash *seqIsRc = NULL;
+struct hash *ncGrcToChrom = NULL;
+struct hash *ncToTwoBitChrom = NULL;
+struct hash *ncToSeqWin = NULL;
+
+/* Command line validation table. */
+static struct optionSpec options[] = {
+    {"freqSourceOrder", OPTION_STRING},
+    {"equivRegions", OPTION_STRING},
+    {NULL, 0},
+};
+
+struct outStreams
+/* Container for various output files that are not assembly-specific */
+    {
+    FILE *details;    // outRootDetails.tab
+    FILE *failJson;   // outRootFailed.json
+    FILE *merged;     // outRootMerged.tab
+    FILE *err;        // outRootErrors.tab
+    };
+
+static FILE *openOutFile(char *format, char *outRoot)
+/* Open a file for writing, named by format (which must have exactly one %s) and outRoot. */
+{
+struct dyString *name = dyStringCreate(format, outRoot);
+FILE *outF = mustOpen(name->string, "w");
+dyStringFree(&name);
+return outF;
+}
+
+static struct outStreams *outStreamsOpen(char *outRoot)
+/* Open output file handles */
+{
+struct outStreams *outStreams;
+AllocVar(outStreams);
+outStreams->details = openOutFile("%sDetails.tab", outRoot);
+outStreams->failJson = openOutFile("%sFailed.json", outRoot);
+outStreams->merged = openOutFile("%sMerged.tab", outRoot);
+outStreams->err = openOutFile("%sErrors.tab", outRoot);
+return outStreams;
+}
+
+static void outStreamsClose(struct outStreams **pOutStreams)
+/* Close file handles and free struct. */
+{
+if (*pOutStreams != NULL)
+    {
+    struct outStreams *outStreams = *pOutStreams;
+    carefulClose(&outStreams->details);
+    carefulClose(&outStreams->failJson);
+    carefulClose(&outStreams->merged);
+    carefulClose(&outStreams->err);
+    freez(pOutStreams);
+    }
+}
+
+struct spdiBed
+/* dbSNP 2.0 variant representation: {Sequence, Position, Deletion, Insertion}, expanded to
+ * cover ambiguous placement region for indels when applicable. Add chromEnd to get bed3+. */
+{
+    struct spdiBed *next;
+    char *chrom;           // Sequence ID
+    uint chromStart;       // Position: 0-based start
+    uint chromEnd;         // Position + number of deleted bases
+    char *del;             // Deleted sequence (starting at pos) -- or number of deleted bases
+    char *ins;             // Inserted sequence
+};
+
+struct alObs
+// Unit of variant allele frequency observation
+    {
+    struct alObs *next;
+    char *allele;   // Variant allele sequence
+    int obsCount;   // Number of chromosomes on which allele was observed by a project
+    int totalCount; // Number of chromosomes on which project observed some allele of this variant
+    };
+
+struct sharedProps
+/* Properties of a refsnp that are shared by all mappings to all assemblies. */
+    {
+    // Extracted from JSON:
+    char *name;                       // rs# ID
+    int freqSourceCount;              // Number of projects reporting allele frequency observations
+    struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs
+    struct alObs **freqSourceObs;     // Array of per-project observation lists
+    char **freqSourceMajorAl;         // Array of per-project major allele
+    char **freqSourceMinorAl;         // Array of per-project minor allele
+    double *freqSourceMaf;            // Array of per-project minor allele frequencies
+    int biggestSourceIx;              // index of freqSourceObs with highest totalCount
+    boolean *freqIsRc;                // Per-project flag for sequence isRc relative to ptlp
+    struct slInt *soTerms;            // Sequence Ontology functional effect terms
+    struct slName *submitters;        // dbSNP submitter handles
+    struct slInt *pubMedIds;          // PubMed article citations
+    struct slName *clinVarAccs;       // Accessions of ClinVar annotations, i.e. ClinVar RCV*
+    struct slName *clinVarSigs;       // Reported ClinVar significance
+    enum bigDbSnpClass class;         // {snv, mnv, ins, del, delins, identity}
+    enum soTerm maxFuncImpact;        // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic}
+    int commonCount;                  // Number of projects reporting minor allele freq >= 1%
+    int rareCount;                    // Number of projects reporting minor allele freq < 1%
+    };
+
+struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm)
+/* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm.
+ * Checking validity of alleles is up to caller, but this does check whether del is a number
+ * instead of sequence. */
+{
+struct spdiBed *spdi;
+lmAllocVar(lm, spdi);
+spdi->chrom = seq;
+spdi->chromStart = pos;
+spdi->del = del;
+spdi->ins = ins;
+if (isAllDigits(del))
+    spdi->chromEnd = pos + atoi(del);
+else
+    spdi->chromEnd = spdi->chromStart + strlen(del);
+return spdi;
+}
+
+static boolean seqIsNucleotide(const char *seq)
+/* Return TRUE unless seq contains a character that is not in the IUPAC nucleotide set. */
+{
+if (seq == NULL)
+    return FALSE;
+const char *p;
+for (p = seq;  *p != '\0';  p++)
+    {
+    if (!isIupac(*p))
+        return FALSE;
+    }
+return TRUE;
+}
+
+static struct spdiBed *parseSpdis(struct slRef *spdiList, char *name, struct lm *lm)
+/* Make sure that all SPDI objects have the consistent seq_id, deleted_sequence and position. */
+{
+struct spdiBed *spdiBList = NULL;
+char *firstSeq = NULL;
+int firstPos = -1;
+char *firstDel = NULL;
+struct slRef *spdiRef;
+for (spdiRef = spdiList;  spdiRef != NULL;  spdiRef = spdiRef->next)
+    {
+    struct jsonElement *spdiEl = spdiRef->val;
+    char *seq = jsonQueryString(spdiEl, "spdi", "seq_id", lm);
+    int pos = jsonQueryInt(spdiEl, "spdi", "position", -1, lm);
+    char *del = jsonQueryString(spdiEl, "spdi", "deleted_sequence", lm);
+    char *ins = jsonQueryString(spdiEl, "spdi", "inserted_sequence", lm);
+    if (seq == NULL)
+        errAbort("Missing or null seq_id in spdi for %s", name);
+    if (pos < 0)
+        errAbort("Invalid or missing position in spdi for %s", name);
+    if (del == NULL)
+        errAbort("Missing or null deleted_sequence in spdi for %s", name);
+    if (ins == NULL)
+        errAbort("Missing or null inserted_sequence in spdi for %s", name);
+    if (! seqIsNucleotide(del))
+        errAbort("Invalid deleted_sequence '%s' in spdi for %s", del, name);
+    if (! seqIsNucleotide(ins))
+        errAbort("Invalid inserted_sequence '%s' in spdi for %s", ins, name);
+    if (firstSeq == NULL)
+        {
+        // First spdi
+        firstSeq = seq;
+        firstDel = del;
+        firstPos = pos;
+        }
+    else
+        {
+        if (differentString(firstSeq, seq))
+            errAbort("Mismatching seq_id ('%s' vs '%s') in spdis for %s",
+                     firstSeq, seq, name);
+        if (differentString(firstDel, del))
+            errAbort("Mismatching deleted sequence ('%s' vs '%s') in %s spdis for %s",
+                     firstDel, del, seq, name);
+        if (firstPos != pos)
+            errAbort("Mismatching position (%d vs %d) in %s spdis for %s",
+                     firstPos, pos, seq, name);
+        }
+    slAddHead(&spdiBList, spdiBedNewLm(seq, pos, del, ins, lm));
+    }
+slReverse(&spdiBList);
+return spdiBList;
+}
+
+static enum bigDbSnpClass varTypeToClass(char *varType)
+/* Convert JSON variant_type to bigDbSnp enum form. */
+{
+if (sameString(varType, "snv"))
+    return bigDbSnpSnv;
+else if (sameString(varType, "mnv"))
+    return bigDbSnpMnv;
+else if (sameString(varType, "ins"))
+    return bigDbSnpIns;
+else if (sameString(varType, "del"))
+    return bigDbSnpDel;
+else if (sameString(varType, "delins"))
+    return bigDbSnpDelins;
+else if (sameString(varType, "identity"))
+    return bigDbSnpIdentity;
+else
+    errAbort("varTypeToClass: unrecognized variant_type '%s'.", varType);
+return bigDbSnpSnv;  // we never get here; appease gcc
+}
+
+static int freqSourceToIx(char *source, struct slName **pSourceList, struct lm *lm)
+/* Convert source name to index using freqSourceOrder if defined, otherwise sourceList
+ * (expand sourceList if source is not already in the list). */
+{
+int ix = -1;
+if (freqSourceOrder)
+    {
+    ix = stringArrayIx(source, freqSourceOrder, freqSourceCount);
+    if (ix < 0)
+        errAbort("freqSourceOrder does not contains '%s'", source);
+    }
+else
+    {
+    ix = slNameFindIx(*pSourceList, source);
+    if (ix < 0)
+        {
+        ix = slCount(*pSourceList);
+        slAddTail(pSourceList, lmSlName(lm, source));
+        }
+    }
+return ix;
+}
+
+static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource,
+                                struct spdiBed **retSpdiB, struct lm *lm)
+/* Parse one allele observation. */
+{
+struct alObs *obs = NULL;
+lmAllocVar(lm, obs);
+char *source = jsonQueryString(obsEl, "alObs", "study_name", lm);
+if (source == NULL)
+    errAbort("parseAlObs: Missing or null study_name");
+struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm);
+struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm);
+obs->allele = spdiB->ins;
+obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm);
+obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm);
+if (obs->obsCount < 0)
+    errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)",
+             source, obs->allele);
+if (obs->totalCount < 0)
+    errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)",
+             source, obs->allele);
+if (obs->totalCount == 0)
+    errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)",
+             source, obs->allele);
+*retSource = source;
+*retSpdiB = spdiB;
+return obs;
+}
+
+static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm)
+/* If the reference allele was omitted from frequency reports (dbSNP JIRA VR-178), infer the count
+ * and add the reference allele to lists of spdis and obs. */
+{
+int ix;
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    if (props->freqSourceSpdis[ix] == NULL)
+        continue;
+    boolean gotRef = FALSE;
+    int sumCounts = 0;
+    struct spdiBed *spdiB;
+    struct alObs *obs;
+    for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
+         spdiB != NULL && obs != NULL;
+         spdiB = spdiB->next, obs = obs->next)
+        {
+        if (sameString(spdiB->del, spdiB->ins))
+            {
+            gotRef = TRUE;
+            break;
+            }
+        sumCounts += obs->obsCount;
+        }
+    if (!gotRef)
+        {
+        struct spdiBed *firstSpdiB = props->freqSourceSpdis[ix];
+        struct alObs *firstObs = props->freqSourceObs[ix];
+        if (sumCounts > firstObs->totalCount)
+            errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref "
+                     "counts %d", rsId, ix, firstObs->totalCount, sumCounts);
+        struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart,
+                                             firstSpdiB->del, firstSpdiB->del, lm);
+        struct alObs *obs;
+        lmAllocVar(lm, obs);
+        obs->allele = spdiB->del;
+        obs->obsCount = firstObs->totalCount - sumCounts;
+        obs->totalCount = firstObs->totalCount;
+        slAddHead(&props->freqSourceSpdis[ix], spdiB);
+        slAddHead(&props->freqSourceObs[ix], obs);
+        }
+    }
+}
+
+static void checkFreqSourceObs(struct sharedProps *props, char *rsId)
+/* After frequency allele observations have been sorted into per-project lists,
+ * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent.
+ * The sum of obsCounts may be less than total_count (no-calls), but not greater. */
+{
+int ix;
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    int totalCount = 0;
+    struct alObs *obsList = props->freqSourceObs[ix], *obs;
+    for (obs = obsList;  obs != NULL;  obs = obs->next)
+        {
+        if (! seqIsNucleotide(obs->allele))
+            errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId);
+        if (obs->obsCount < 0)
+            errAbort("checkFreqSourceObs: invalid obsCount %d for %s", obs->obsCount, rsId);
+        if (obs->obsCount > obs->totalCount)
+            errAbort("checkFreqSourceObs: obsCount %d > totalCount %d for %s",
+                     obs->obsCount, obs->totalCount, rsId);
+        totalCount += obs->obsCount;
+        }
+    for (obs = obsList;  obs != NULL;  obs = obs->next)
+        {
+        if (obs->totalCount < totalCount)
+            errAbort("checkFreqSourceObs: obsCounts sum to %d, but total_count is %d for %s",
+                     totalCount, obs->totalCount, rsId);
+        else if (obs->totalCount != obsList->totalCount)
+            errAbort("checkFreqSourceObs: inconsistent totalCount (%d for '%s' != %d for '%s' "
+                     "for %s", obs->totalCount, obs->allele, obsList->totalCount, obsList->allele,
+                     rsId);
+        }
+    }
+}
+
+static void parseFrequencies(struct slRef *annotationsRef, struct sharedProps *props,
+                             char *rsId, struct lm *lm)
+/* Extract per-project allele frequency count data from JSON.  Note: the alleles are from the
+ * reference assembly that the project used; current reference might be different. */
+{
+struct slRef *obsList = jsonQueryElementList(annotationsRef, "annotations", "frequency[*]", lm);
+if (obsList != NULL)
+    {
+    if (freqSourceOrder)
+        {
+        props->freqSourceCount = freqSourceCount;
+        lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount);
+        lmAllocArray(lm, props->freqSourceObs, freqSourceCount);
+        lmAllocArray(lm, props->freqIsRc, freqSourceCount);
+        }
+    else
+        {
+        props->freqSourceCount = 0;
+        int allocSize = slCount(obsList);
+        lmAllocArray(lm, props->freqSourceSpdis, allocSize);
+        lmAllocArray(lm, props->freqSourceObs, allocSize);
+        lmAllocArray(lm, props->freqIsRc, allocSize);
+        }
+    struct slName *unorderedSourceList = NULL;
+    int maxSampleCount = 0;
+    int biggestSourceIx = 0;
+    struct slRef *obsRef;
+    for (obsRef = obsList;  obsRef != NULL;  obsRef = obsRef->next)
+        {
+        struct jsonElement *obsEl = obsRef->val;
+        char *source = NULL;
+        struct spdiBed *spdiB = NULL;
+        struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm);
+        int ix = freqSourceToIx(source, &unorderedSourceList, lm);
+        slAddHead(&props->freqSourceSpdis[ix], spdiB);
+        slAddHead(&props->freqSourceObs[ix], obs);
+        props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1);
+        if (props->freqIsRc[ix] < 0)
+            errAbort("parseFrequencies: Unknown orientation for %s seq_id %s",
+                     rsId, spdiB->chrom);
+        if (obs->totalCount > maxSampleCount)
+            {
+            maxSampleCount = obs->totalCount;
+            biggestSourceIx = ix;
+            }
+        }
+    if (!freqSourceOrder)
+        props->freqSourceCount = slCount(unorderedSourceList);
+    int ix;
+    for (ix = 0;  ix < props->freqSourceCount; ix++)
+        {
+        slReverse(&props->freqSourceSpdis[ix]);
+        slReverse(&props->freqSourceObs[ix]);
+        }
+    addMissingRefAllele(props, rsId, lm);
+    checkFreqSourceObs(props, rsId);
+    props->biggestSourceIx = biggestSourceIx;
+    }
+}
+
+static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm)
+/* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional
+ * impact, discard duplicates and return as slInt list with highest impact first. */
+{
+struct slInt *intList = NULL;
+if (soTermNames)
+    {
+    int termCount = slCount(soTermNames);
+    enum soTerm termArray[termCount];
+    struct slName *term;
+    int i;
+    for (i = 0, term = soTermNames;  term != NULL;  i++, term = term->next)
+        termArray[i] = soTermStringIdToId(term->name);
+    assert(i == termCount);
+    qsort(termArray, termCount, sizeof termArray[0], soTermCmp);
+    for (i = 0;  i < termCount; i++)
+        {
+        if (i == 0 || termArray[i] != termArray[i-1])
+            {
+            struct slInt *intEl;
+            lmAllocVar(lm, intEl);
+            intEl->val = termArray[i];
+            slAddHead(&intList, intEl);
+            }
+        }
+    slReverse(&intList);
+    }
+return intList;
+}
+
+struct seqWindow *getChromSeq(struct hash *ncToSeqWin, char *nc, struct hash *ncToTwoBitChrom)
+/* Look up nc in ncToSeqWin.  If not found, look it up in ncToTwoBitChrom, make a seqWin,
+ * store it in ncToSeqWin and return it.  Return NULL if neither hash has nc. */
+{
+struct seqWindow *seqWin = hashFindVal(ncToSeqWin, nc);
+if (seqWin == NULL)
+    {
+    char *twoBitChrom = hashFindVal(ncToTwoBitChrom, nc);
+    if (twoBitChrom != NULL)
+        {
+        struct dnaSeq *dnaSeq = twoBitLoadAll(twoBitChrom);
+        if (dnaSeq != NULL)
+            {
+            // Use memSeqWindow instead of twoBitSeqWindow to avoid keeping a bunch of open
+            // twoBitFile handles.
+            // Hmm, a dnaSeqWindow would be more efficient, no cloning and strlen...
+            // or at the least, memSeqWindowNew could take a size arg.
+            seqWin = memSeqWindowNew(twoBitChrom, dnaSeq->dna);
+            dnaSeqFree(&dnaSeq);
+            hashAdd(ncToSeqWin, nc, seqWin);
+            }
+        }
+    }
+return seqWin;
+}
+
+static boolean maybeExpandRange(struct spdiBed *spdiBTrim, struct seqWindow *seqWin,
+                                struct spdiBed *spdiB, struct alObs *obs, struct lm *lm)
+/* If spdiBTrim is a pure insertion or deletion within a repetitive region,
+ * then expand spdiB (and obs if non-NULL) to the full repetitive range and return TRUE.
+ * Trim spdiBTrim to minimal representation (see maybeTrimSpdi) before calling this. */
+{
+boolean changed = FALSE;
+uint refTrimStart = spdiBTrim->chromStart;
+uint refTrimEnd = spdiBTrim->chromEnd;
+char *altTrim = spdiBTrim->ins;
+int refTrimLen = refTrimEnd - refTrimStart;
+int altTrimLen = strlen(altTrim);
+if (indelShiftIsApplicable(refTrimLen, altTrimLen))
+    {
+    // Now shift the minimal representation left and right and compare with spdiB coords.
+    uint leftStart = refTrimStart, leftEnd = refTrimEnd;
+    char leftAlt[altTrimLen+1];
+    safecpy(leftAlt, sizeof leftAlt, altTrim);
+    int leftShifted = indelShift(seqWin, &leftStart, &leftEnd, leftAlt,
+                                 INDEL_SHIFT_NO_MAX, isdLeft);
+    uint rightStart = refTrimStart, rightEnd = refTrimEnd;
+    char rightAlt[altTrimLen+1];
+    safecpy(rightAlt, sizeof rightAlt, altTrim);
+    int rightShifted = indelShift(seqWin, &rightStart, &rightEnd, rightAlt,
+                                  INDEL_SHIFT_NO_MAX, isdRight);
+    if (leftShifted || rightShifted)
+        {
+        changed = TRUE;
+        // Expand spdiB->del to cover the whole range.,
+        int newRefLen = rightEnd - leftStart;
+        spdiB->del = lmAlloc(lm, newRefLen+1);
+        seqWindowCopy(seqWin, leftStart, newRefLen, spdiB->del, newRefLen+1);
+        // Concatenate ref bases to the left, altTrim, ref bases to the right -> new alt
+        char refLeft[leftShifted+1];
+        seqWindowCopy(seqWin, leftStart, leftShifted, refLeft, sizeof refLeft);
+        char refRight[rightShifted+1];
+        seqWindowCopy(seqWin, refTrimEnd, rightShifted, refRight, sizeof refRight);
+        int newAltLen = leftShifted + altTrimLen + rightShifted;
+        spdiB->ins = lmAlloc(lm, newAltLen+1);
+        safef(spdiB->ins, newAltLen+1, "%s%s%s", refLeft, altTrim, refRight);
+        if (obs)
+            obs->allele = spdiB->ins;
+        spdiB->chromStart = leftStart;
+        spdiB->chromEnd = rightEnd;
+        }
+    }
+return changed;
+}
+
+static void checkSpdisAfterExpansion(struct spdiBed *spdiBList, struct alObs *obsList)
+/* Check spdiBList and obsList for inconsistencies; if the reference allele spdiB+obs missed out
+ * on the expansion then expand them. */
+{
+boolean alreadyExpandedRefAllele = FALSE;
+struct spdiBed *spdiB;
+struct alObs *obs;
+for (spdiB = spdiBList, obs = obsList;
+     spdiB != NULL;
+     spdiB = spdiB->next, obs = obs->next)
+    {
+    if (strlen(spdiB->del) != spdiB->chromEnd - spdiB->chromStart)
+        errAbort("Inconsistent freq spdi coords & del: %s|%d|%d %s/%s",
+                 spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, spdiB->del, spdiB->ins);
+    if (spdiB != spdiBList)
+        {
+        if (differentString(spdiB->chrom, spdiBList->chrom))
+            errAbort("Differing freq spdiB chrom: '%s' vs. '%s'",
+                     spdiB->chrom, spdiBList->chrom);
+        if (differentString(spdiB->del, spdiBList->del))
+            {
+            // Ref allele can't be shifted, so expand it to alt's shifted range... but only once,
+            // so we'll catch differences between alt alleles' expanded ranges.
+            struct spdiBed *spdiBRef = NULL, *spdiBAlt = NULL;
+            struct alObs *obsRef = NULL;
+            if (sameString(spdiBList->del, spdiBList->ins))
+                {
+                spdiBRef = spdiBList;
+                spdiBAlt = spdiB;
+                obsRef = obsList;
+                }
+            else if (sameString(spdiB->del, spdiB->ins))
+                {
+                spdiBRef = spdiB;
+                spdiBAlt = spdiBList;
+                obsRef = obs;
+                }
+            if (spdiBRef != NULL && !alreadyExpandedRefAllele)
+                {
+                // Copy spdiBAlt's expanded coords and del into spdiBRef
+                spdiBRef->chromStart = spdiBAlt->chromStart;
+                spdiBRef->chromEnd = spdiBAlt->chromEnd;
+                spdiBRef->del = spdiBAlt->del;
+                // The reference allele's ins is the same as its del, and the corresponding
+                // obs->allele must match it.
+                spdiBRef->ins = spdiBRef->del;
+                obsRef->allele = spdiBRef->ins;
+                alreadyExpandedRefAllele = TRUE;
+                }
+            else
+                errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s",
+                         spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd,
+                         spdiBList->del, spdiBList->ins,
+                         spdiB->chrom, spdiB->chromStart, spdiB->chromEnd,
+                         spdiB->del, spdiB->ins);
+            }
+        }
+    }
+}
+
+static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm)
+/* Return a new spdiBed that is the minimal representation of spdiB. */
+{
+struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart,
+                                         lmCloneString(lm, spdiB->del),
+                                         lmCloneString(lm, spdiB->ins), lm);
+int refTrimLen = 0, altTrimLen = 0;
+trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd,
+           &refTrimLen, &altTrimLen);
+return spdiBTrim;
+}
+
+static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom,
+                              struct hash *ncToSeqWin, struct lm *lm)
+/* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do
+ * not cover the entire repetitive range.  Expand variant alleles on the sequence on which they
+ * were originally reported; if there is an indel difference between the reporting assembly and
+ * the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt
+ * may be different for different assemblies). */
+{
+int sIx;
+for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
+    {
+    struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx];
+    struct alObs *obs, *obsList = props->freqSourceObs[sIx];
+    assert(slCount(spdiBList) == slCount(obsList));
+    boolean changed = FALSE;
+    // For each allele, see if it can be slid around.
+    for (spdiB = spdiBList, obs = obsList;
+         spdiB != NULL;
+         spdiB = spdiB->next, obs = obs->next)
+        {
+        assert(sameString(obs->allele, spdiB->ins));
+        // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly.
+        if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) ||
+            stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del))
+            {
+            // First trim ref and alt to their minimal version for indelShift to work with.
+            // There may be nothing to trim because they're usually minimal already and
+            // stringIn catches false positive like A/TAAC, and that's fine.
+            struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
+            // If possible, expand the minimal representation to full SPDI range.
+            struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom);
+            if (seqWin == NULL)
+                errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom);
+            changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm);
+            }
+        }
+    if (changed)
+        checkSpdisAfterExpansion(spdiBList, obsList);
+    }
+}
+
+static double mafFromSource(struct sharedProps *props, int sourceIx,
+                            char **retMajorAllele, char **retMinorAllele, struct lm *lm)
+/* Compare allele counts from source and return minor allele frequency (2nd-highest).
+ * Set *retMajorAllele to allele with highest count (for comparison with mapped ref alleles).
+ * If there are no observations from source, return NAN (divide by zero). */
+{
+char *majorAllele = NULL, *minorAllele = NULL;
+int majorAlleleCount = -1, minorAlleleCount = -1;
+double totalCount = 0;
+struct alObs *obs;
+for (obs = props->freqSourceObs[sourceIx];  obs != NULL;  obs = obs->next)
+    {
+    if (obs->obsCount > majorAlleleCount)
+        {
+        minorAllele = majorAllele;
+        minorAlleleCount = majorAlleleCount;
+        majorAlleleCount = obs->obsCount;
+        majorAllele = obs->allele;
+        }
+    else if (obs->obsCount > minorAlleleCount)
+        {
+        minorAllele = obs->allele;
+        minorAlleleCount = obs->obsCount;
+        }
+    totalCount = obs->totalCount;
+    }
+if (totalCount > 0)
+    {
+    if (majorAlleleCount < 0)
+        errAbort("no major allele observed despite nonzero totalCount...?");
+    if (minorAlleleCount < 0)
+        errAbort("no minor allele observed... what should the minorAllele value be??");
+    }
+*retMajorAllele = lmCloneString(lm, majorAllele);
+*retMinorAllele = lmCloneString(lm, minorAllele);
+// If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test)
+return (double)minorAlleleCount / totalCount;
+}
+
+static void sortFrequencies(struct sharedProps *props, struct lm *lm)
+/* Determine major/minor allele and per-project MAF (if given).
+ * If -freqOrder option was given, then fill in props->freqSource* arrays in that order.
+ * Otherwise, choose the MAF from the project with the largest sample count as a fallback. */
+{
+if (props->freqSourceCount > 0)
+    {
+    if (freqSourceOrder != NULL)
+        {
+        lmAllocArray(lm, props->freqSourceMajorAl, freqSourceCount);
+        lmAllocArray(lm, props->freqSourceMinorAl, freqSourceCount);
+        lmAllocArray(lm, props->freqSourceMaf, freqSourceCount);
+        int sIx;
+        for (sIx = 0;  sIx < freqSourceCount;  sIx++)
+            {
+            props->freqSourceMaf[sIx] = mafFromSource(props, sIx,
+                                                      &props->freqSourceMajorAl[sIx],
+                                                      &props->freqSourceMinorAl[sIx], lm);
+            if (isfinite(props->freqSourceMaf[sIx]))
+                {
+                if (props->freqSourceMaf[sIx] >= 0.01)
+                    props->commonCount++;
+                else
+                    props->rareCount++;
+                }
+            }
+        }
+    else
+        {
+        // No freqSourceOrder -- just use source with biggest totalCount for this variant.
+        props->freqSourceCount = 1;
+        lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount);
+        lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount);
+        lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount);
+        props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx,
+                                                &props->freqSourceMajorAl[0],
+                                                &props->freqSourceMinorAl[0], lm);
+        if (isfinite(props->freqSourceMaf[0]))
+            {
+            if (props->freqSourceMaf[0] >= 0.01)
+                props->commonCount++;
+            else
+                props->rareCount++;
+            }
+        }
+    }
+}
+
+static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm)
+/* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */
+{
+struct sharedProps *props = NULL;
+lmAllocVar(lm, props);
+char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm);
+char rsName[strlen(rsNumber)+3];
+safef(rsName, sizeof rsName, "rs%s", rsNumber);
+props->name = lmCloneString(lm, rsName);
+char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm);
+props->class = varTypeToClass(varType);
+struct slRef *annotationsRef = jsonQueryElement(top, "top",
+                                                 "primary_snapshot_data.allele_annotations[*]", lm);
+parseFrequencies(annotationsRef, props, rsName, lm);
+spdiNormalizeFreq(props, ncToTwoBitChrom, ncToSeqWin, lm);
+sortFrequencies(props, lm);
+struct slName *soTermNames = jsonQueryStringList(annotationsRef, "annotations",
+                                                 "assembly_annotation[*].genes[*].rnas[*]"
+                                                 ".sequence_ontology[*].accession", lm);
+soTermNames = slCat(soTermNames,
+                    jsonQueryStringList(annotationsRef, "annotations",
+                                        "assembly_annotation[*].genes[*].rnas[*]"
+                                        ".protein.sequence_ontology[*].accession", lm));
+props->soTerms = soTermStringIdToIdList(soTermNames, lm);
+props->maxFuncImpact = props->soTerms ? props->soTerms->val : soUnknown;
+props->submitters = jsonQueryStrings(top, "top",
+                                    "primary_snapshot_data.support[id.type=subsnp].submitter_handle",
+                                        lm);
+slUniqify(&props->submitters, slNameCmp, NULL);
+props->pubMedIds = jsonQueryInts(top, "top", "citations[*]", lm);
+props->clinVarAccs = jsonQueryStringList(annotationsRef, "annotations",
+                                          "clinical[*].accession_version", lm);
+props->clinVarSigs = jsonQueryStringList(annotationsRef, "annotations",
+                                          "clinical[*].clinical_significances[*]", lm);
+return props;
+}
+
+static void setBdsFreqData(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds,
+                           struct dyString *dyUcscNotes, struct lm *lm)
+/* Use isRc, props to fill in bds->majorAllele, ->minorAllele, determine whether ref is maj/min.
+ * Add notes to dyUcscNotes if applicable. */
+{
+if (props->freqSourceCount > 0)
+    {
+    bds->freqSourceCount = props->freqSourceCount;
+    lmAllocArray(lm, bds->majorAllele, props->freqSourceCount);
+    lmAllocArray(lm, bds->minorAllele, props->freqSourceCount);
+    lmAllocArray(lm, bds->minorAlleleFreq, props->freqSourceCount);
+    char *firstMajorAllele = NULL;
+    int sIx;
+    for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
+        {
+        boolean freqIsRc = props->freqIsRc[sIx];
+        bds->minorAlleleFreq[sIx] = props->freqSourceMaf[sIx];
+        bds->majorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMajorAl[sIx]));
+        bds->minorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMinorAl[sIx]));
+        if (isfinite(bds->minorAlleleFreq[sIx]))
+            {
+            if (isRc != freqIsRc)
+                {
+                reverseComplement(bds->majorAllele[sIx], strlen(bds->majorAllele[sIx]));
+                reverseComplement(bds->minorAllele[sIx], strlen(bds->minorAllele[sIx]));
+                }
+            char *majorAllele = bds->majorAllele[sIx];
+            if (differentString(bds->ref, majorAllele))
+                dyStringAppend(dyUcscNotes, bdsRefIsMinor ",");
+            if (firstMajorAllele == NULL)
+                firstMajorAllele = majorAllele;
+            else if (majorAllele != NULL &&
+                     differentString(firstMajorAllele, majorAllele))
+                dyStringAppend(dyUcscNotes, bdsDiffMajor ",");
+            }
+        }
+    }
+}
+
+static void checkRareRef(struct sharedProps *props, char *ref, boolean isRc,
+                         struct dyString *dyUcscNotes)
+/* If frequency counts are reported, test whether the reference has ever been observed,
+ * and if so whether it is rare, i.e. < 1%.  If so, add to dyUcscNotes. */
+{
+double rareThreshold = 0.01;
+boolean haveFrequencies = FALSE, refWasObserved = FALSE;
+double maxRefAF = 0.0;
+int ix;
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    if (props->freqSourceObs[ix] != NULL)
+        haveFrequencies = TRUE;
+    boolean freqIsRc = props->freqIsRc[ix];
+    int refLen = strlen(ref);
+    char refCpy[refLen+1];
+    safecpy(refCpy, sizeof refCpy, ref);
+    if (isRc != freqIsRc)
+        reverseComplement(refCpy, refLen);
+    struct spdiBed *spdiB;
+    struct alObs *obs;
+    for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
+         spdiB != NULL && obs != NULL;
+         spdiB = spdiB->next, obs = obs->next)
+        {
+        if (sameString(refCpy, spdiB->ins))
+            {
+            if (obs->obsCount > 0)
+                {
+                refWasObserved = TRUE;
+                double refAF = (double)obs->obsCount / (double)obs->totalCount;
+                if (refAF > maxRefAF)
+                    maxRefAF = refAF;
+                }
+            break;
+            }
+        }
+    if (maxRefAF >= rareThreshold)
+        break;
+    }
+if (haveFrequencies && maxRefAF < rareThreshold)
+    {
+    dyStringAppend(dyUcscNotes, bdsRefIsRare ",");
+    if (!refWasObserved)
+        dyStringAppend(dyUcscNotes, bdsRefIsSingleton ",");
+    }
+}
+
+static enum bigDbSnpClass bigDbSnpClassFromAlleles(struct bigDbSnp *bds)
+/* Return what we expect bigDbSnpClass to be based on allele lengths. */
+{
+enum bigDbSnpClass alleleClass = 0;
+if (bds->altCount == 0)
+    alleleClass = bigDbSnpIdentity;
+else
+    {
+    int refLen = strlen(bds->ref);
+    boolean altLenZero = TRUE;
+    boolean altLenLtRef = FALSE;
+    boolean altLenEqRef = FALSE;
+    boolean altLenGtRef = FALSE;
+    int i;
+    for (i = 0;  i < bds->altCount;  i++)
+        {
+        int altLen = strlen(bds->alts[i]);
+        if (altLen < refLen)
+            altLenLtRef = TRUE;
+        else if (altLen == refLen)
+            altLenEqRef = TRUE;
+        else if (altLen > refLen)
+            altLenGtRef = TRUE;
+        if (altLen > 0)
+            altLenZero = FALSE;
+        }
+    if (!altLenLtRef &&  altLenEqRef && !altLenGtRef)
+        {
+        if (refLen == 1)
+            alleleClass = bigDbSnpSnv;
+        else
+            alleleClass = bigDbSnpMnv;
+        }
+    else if (bds->altCount == 0)
+        alleleClass = bigDbSnpIdentity;
+    else if (refLen == 0)
+        alleleClass = bigDbSnpIns;
+    else if (altLenZero)
+        alleleClass = bigDbSnpDel;
+    else
+        alleleClass = bigDbSnpDelins;
+    }
+return alleleClass;
+}
+
+static void checkIndelPlacement(struct bigDbSnp *bds, struct seqWindow *seqWin,
+                                struct lm *lm)
+/* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that
+ * bds's SPDI-based coords cover the whole repetitive region as expected. */
+{
+int i;
+for (i = 0;  i < bds->altCount;  i++)
+    {
+    char *alt = bds->alts[i];
+    if (isEmpty(bds->ref) || isEmpty(alt) ||
+        stringIn(bds->ref, alt) || stringIn(alt, bds->ref))
+        {
+        // First trim ref and alt to their minimal version for indelShift to work with.
+        // There may be nothing to trim because they're usually minimal already and
+        // stringIn catches false positive like A/TAAC, and that's fine.
+        struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm);
+        struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
+
+        maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm);
+        if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd)
+            errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') "
+                     "could be expanded to %s|%d|%d\n",
+                     bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt,
+                     bds->chrom, spdiB->chromStart, spdiB->chromEnd);
+        }
+    }
+}
+
+static boolean delMismatchesGenome(struct bigDbSnp *bds, struct seqWindow *seqWin)
+/* Return TRUE if bds->ref (spdi del) does not match assembly sequence at bds coords.
+ * Sometimes the genome has an N and dbSNP has a more specific IUPAC character.
+ * errAbort if there's some other kind of inconsistency. */
+{
+int refLen = strlen(bds->ref);
+uint refStart = bds->chromStart;
+uint refEnd = bds->chromEnd;
+if (refLen != refEnd - refStart)
+    errAbort("Inconsistent ref and coords for %s: ref is '%s' (%d bases) "
+             "but position is %s|%d|%d (%d bases)",
+             bds->name, bds->ref, refLen, bds->chrom, refStart, refEnd, refEnd - refStart);
+char genomeRef[refLen+1];
+seqWindowCopy(seqWin, refStart, refLen, genomeRef, sizeof genomeRef);
+if (differentString(genomeRef, bds->ref))
+    {
+    if (strchr(genomeRef, 'N'))
+        return TRUE;
+    else
+        errAbort("bds->ref '%s' mismatches genome sequence '%s'", bds->ref, genomeRef);
+    }
+return FALSE;
+}
+
+static boolean hasAmbiguousFreqAllele(struct sharedProps *props)
+/* Return TRUE if any allele from any frequency source contains an IUPAC ambiguous character. */
+{
+int sIx;
+for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
+    {
+    struct alObs *obs;
+    for (obs = props->freqSourceObs[sIx];  obs != NULL;  obs = obs->next)
+        if (anyIupac(obs->allele))
+            return TRUE;
+    }
+return FALSE;
+}
+
+static boolean freqHasNonRefAltAllele(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds)
+/* Return TRUE if any allele from any frequency source does not match any ref/alt (SPDI) allele. */
+{
+int sIx;
+for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
+    {
+    boolean freqIsRc = props->freqIsRc[sIx];
+    struct alObs *obs;
+    for (obs = props->freqSourceObs[sIx];  obs != NULL;  obs = obs->next)
+        {
+        int alLen = strlen(obs->allele);
+        char alCpy[alLen+1];
+        safecpy(alCpy, sizeof alCpy, obs->allele);
+        if (freqIsRc != isRc)
+            reverseComplement(alCpy, alLen);
+        boolean foundIt = FALSE;
+        if (sameString(alCpy, bds->ref))
+            foundIt = TRUE;
+        else
+            {
+            int i;
+            for (i = 0;  i < bds->altCount;  i++)
+                if (sameString(alCpy, bds->alts[i]))
+                    {
+                    foundIt = TRUE;
+                    break;
+                    }
+            }
+        if (!foundIt)
+            return TRUE;
+        }
+    }
+return FALSE;
+}
+
+static void addUcscNotes(struct sharedProps *props, boolean isRc, boolean isMultiMapper,
+                         struct bigDbSnp *bds, struct seqWindow *seqWin,
+                         struct dyString *dyUcscNotes, struct lm *lm)
+/* Record interesting conditions if applicable. */
+{
+if (props->class != bigDbSnpClassFromAlleles(bds))
+    dyStringAppend(dyUcscNotes, bdsClassMismatch ",");
+if (props->clinVarAccs != NULL)
+    dyStringAppend(dyUcscNotes, bdsClinvar ",");
+if (props->commonCount > 0)
+    {
+    dyStringAppend(dyUcscNotes, bdsCommonSome ",");
+    if (props->rareCount == 0)
+        dyStringAppend(dyUcscNotes, bdsCommonAll ",");
+    }
+if (isRc)
+    dyStringAppend(dyUcscNotes, bdsRevStrand ",");
+if (isMultiMapper)
+    dyStringAppend(dyUcscNotes, bdsMultiMap ",");
+checkRareRef(props, bds->ref, isRc, dyUcscNotes);
+if (delMismatchesGenome(bds, seqWin))
+    dyStringAppend(dyUcscNotes, bdsRefMismatch ",");
+if (anyIupac(bds->ref))
+    dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ",");
+int i;
+for (i = 0;  i < bds->altCount;  i++)
+    if (anyIupac(bds->alts[i]))
+        dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ",");
+if (hasAmbiguousFreqAllele(props))
+    dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ",");
+if (freqHasNonRefAltAllele(props, isRc, bds))
+    dyStringAppend(dyUcscNotes, bdsFreqNotRefAlt ",");
+bds->ucscNotes = lmCloneString(lm, dyUcscNotes->string);
+}
+
+static void calcShiftBases(struct bigDbSnp *bds)
+/* Return the max length of uncertain placement from all alt alleles. */
+{
+int maxBasesTrimmed = 0;
+int refLen = bds->chromEnd - bds->chromStart;
+char refTrim[refLen+1];
+int i;
+for (i = 0;  i < bds->altCount;  i++)
+    {
+    // Make copies of ref and alt, trim the copies, measure the change.
+    int refTrimLen = refLen;
+    safecpy(refTrim, sizeof refTrim, bds->ref);
+    int altTrimLen = strlen(bds->alts[i]);
+    char altTrim[altTrimLen+1];
+    safecpy(altTrim, sizeof altTrim, bds->alts[i]);
+    uint start = bds->chromStart, end = bds->chromEnd;
+    trimRefAlt(refTrim, altTrim, &start, &end, &refTrimLen, &altTrimLen);
+    int basesTrimmed = refLen - refTrimLen;
+    if (basesTrimmed > maxBasesTrimmed)
+        maxBasesTrimmed = basesTrimmed;
+    if (maxBasesTrimmed >= refLen)
+        break;
+    }
+bds->shiftBases = min(refLen, maxBasesTrimmed);
+}
+
+static struct bigDbSnp *parsePlacement(struct jsonElement *pl, struct sharedProps *props,
+                                       boolean isMultiMapper, struct dyString *dy, struct lm *lm)
+/* Return a bigDbSnp with information about one placement (mapping). */
+{
+struct bigDbSnp *bds;
+lmAllocVar(lm, bds);
+struct slRef *allSpdiRef = jsonQueryElement(pl, "placement", "alleles[*].allele.spdi", lm);
+bds->chrom = jsonQueryString(pl, "placement", "seq_id", lm);
+if (bds->chrom == NULL)
+    errAbort("Missing or NULL seq_id for %s", props->name);
+struct spdiBed *spdiBList = parseSpdis(allSpdiRef, props->name, lm);
+if (!sameString(bds->chrom, spdiBList->chrom))
+    errAbort("seq_id mismatch: placement has '%s' but spdi has '%s for %s'",
+             bds->chrom, spdiBList->chrom, props->name);
+bds->chromStart = spdiBList->chromStart;
+bds->chromEnd = spdiBList->chromEnd;
+bds->name = props->name;
+bds->ref = spdiBList->del;
+// spdiBList may or may not include no-change allele (del == ins)
+lmAllocArray(lm, bds->alts, slCount(spdiBList));
+struct spdiBed *spdiB;
+for (spdiB = spdiBList;  spdiB != NULL;  spdiB = spdiB->next)
+    {
+    if (!sameString(bds->ref, spdiB->ins))
+        bds->alts[bds->altCount++] = spdiB->ins;
+    }
+calcShiftBases(bds);
+boolean isRc = jsonQueryBoolean(pl, "placement",
+                                "placement_annot.is_aln_opposite_orientation", FALSE, lm);
+dyStringClear(dy);
+setBdsFreqData(props, isRc, bds, dy, lm);
+bds->maxFuncImpact = props->maxFuncImpact;
+bds->class = props->class;
+struct seqWindow *seqWin = getChromSeq(ncToSeqWin, bds->chrom, ncToTwoBitChrom);
+checkIndelPlacement(bds, seqWin, lm);
+addUcscNotes(props, isRc, isMultiMapper, bds, seqWin, dy, lm);
+return bds;
+}
+
+static void equivRegionAdd(struct hash *equivRegions, struct bed3 *region, struct bed3 *regionList)
+/* Associate regionList with region by looking up the binKeeper for region->chrom in equivRegions
+ * and adding regionList to the binKeeper for region start & end. */
+{
+struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
+if (bk == NULL)
+    {
+    bk = binKeeperNew(0, BINRANGE_MAXEND_512M);
+    hashAdd(equivRegions, region->chrom, bk);
+    }
+binKeeperAdd(bk, region->chromStart, region->chromEnd, regionList);
+}
+
+static struct hash *getEquivRegions()
+/* If not done already, read -equivRegions file if specified and parse it into data structure
+ * of equivalent regions for detecting legitimate cases of multiple mappings. */
+{
+static struct hash *equivRegions = NULL;
+static boolean alreadyInited = FALSE;
+if (!alreadyInited)
+    {
+    char *equivRegionsFile = optionVal("equivRegions", NULL);
+    if (isNotEmpty(equivRegionsFile))
+        {
+        equivRegions = hashNew(0);
+        struct lineFile *lf = lineFileOpen(equivRegionsFile, TRUE);
+        char *line;
+        while (lineFileNextReal(lf, &line))
+            {
+            // Build a list of equivalent regions; hash each region->chrom to list.
+            char *words[128];
+            int wordCount = chopLine(line, words);
+            if (wordCount >= ArraySize(words))
+                lineFileAbort(lf, "getEquivRegions: too many words per line (%d)", wordCount);
+            struct bed3 *regionList = NULL;
+            int i;
+            for (i = 0;  i < wordCount;  i++)
+                {
+                char *chrom = cloneString(words[i]);
+                char *colon = strchr(chrom, ':');
+                if (colon)
+                    {
+                    char *startStr = colon+1;
+                    char *nextColon = strchr(startStr, ':');
+                    if (! nextColon)
+                        lineFileAbort(lf, "getEquivRegions: expected each word to have "
+                                      "0 or 2 colons, but got '%s'", words[i]);
+                    *colon = '\0';
+                    *nextColon = '\0';
+                    char *endStr = nextColon+1;
+                    if (!isAllDigits(startStr) || !isAllDigits(endStr))
+                        lineFileAbort(lf, "getEquivRegions: expected a number following each colon, "
+                                      "but got '%s'", words[i]);
+                    int start = atoi(startStr);
+                    int end = atoi(endStr);
+                    slAddHead(&regionList, bed3New(chrom, start, end));
+                    }
+                else
+                    slAddHead(&regionList, bed3New(chrom, 0, 0));
+                }
+            // Sort so that a main chromosome goes before alts/fixes.
+            slSort(&regionList, bedCmp);
+            // Associate all regions with the same list.  we can use pointer equality later.
+            struct bed3 *region;
+            for (region = regionList;  region != NULL;  region = region->next)
+                equivRegionAdd(equivRegions, region, regionList);
+            }
+        lineFileClose(&lf);
+        }
+    else
+        warn("-equivRegions=FILE not specified; will not be able to detect variants "
+             "that are mapped to multiple non-equivalent regions.");
+    alreadyInited = TRUE;
+    }
+return equivRegions;
+}
+
+static struct slRef *equivRegionFind(struct hash *equivRegions, struct bed3 *region,
+                                     struct lm *lm)
+/* Return ref list of equivalent region lists that overlap region. */
+{
+struct binElement *beList = NULL;
+struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
+if (bk)
+    {
+    int start = region->chromStart;
+    int end = region->chromEnd;
+    // binKeeperFind can't find overlap with 0-length insertions (start == end) so expand those:
+    if (start == end)
+        {
+        start--;
+        end++;
+        }
+    beList = binKeeperFind(bk, start, end);
+    }
+struct slRef *refList = NULL;
+struct binElement *binEl;
+for (binEl = beList;  binEl != NULL;  binEl = binEl->next)
+    lmRefAdd(lm, &refList, binEl->val);
+slReverse(&refList);
+slFreeList(&beList);
+return refList;
+}
+
+static struct bed3 *bed3FromPlRef(struct slRef *plRef, struct lm *lm)
+/* Extract a bed3 from the first SPDI found in the first placement in plRef. */
+{
+struct jsonElement *pl = plRef->val;
+struct slRef *spdiRef = jsonQueryElement(pl, "placement", "alleles[0].allele.spdi", lm);
+struct jsonElement *spdiEl = spdiRef->val;
+struct bed3 *region;
+lmAllocVar(lm, region);
+region->chrom = jsonQueryString(spdiEl, "first spdi", "seq_id", lm);
+region->chromStart = jsonQueryInt(spdiEl, "first spdi", "position", -1, lm);
+char *ref = jsonQueryString(spdiEl, "first spdi", "deleted_sequence", lm);
+region->chromEnd = region->chromStart + strlen(ref);
+return region;
+}
+
+static struct bed3 *bed3ListFromPlRef(struct slRef *plRefList, struct lm *lm)
+/* Return a list of bed3 derived from placement SPDIs in plRef. */
+{
+struct bed3 *list = NULL;
+struct slRef *plRef;
+for (plRef = plRefList;  plRef != NULL;  plRef = plRef->next)
+    slAddHead(&list, bed3FromPlRef(plRef, lm));
+slReverse(&list);
+return list;
+}
+
+static boolean uniqChroms(struct bed3 *bedList)
+/* Return TRUE if each item of *sorted* bedList has a different chrom. */
+{
+boolean uniq = TRUE;
+if (bedList != NULL)
+    {
+    struct bed3 *bed;
+    for (bed = bedList;  bed->next != NULL;  bed = bed->next)
+        if (sameString(bed->chrom, bed->next->chrom))
+            {
+            uniq = FALSE;
+            break;
+            }
+    }
+return uniq;
+}
+
+static boolean allMappingsHaveMatch(struct slRef *equivRegionsA[], struct slRef *equivRegionsB[],
+                                    int plCount)
+// Return true if each placement's list of equivRegion sets in A has at least one set that
+// matches at least one set from some other placement's list in B.  A and B may/may not be the same.
+{
+int i;
+for (i = 0;  i < plCount;  i++)
+    {
+    struct slRef *plEquivList = equivRegionsA[i];
+    boolean foundMatch = FALSE;
+    int j;
+    for (j = 0;  j < plCount;  j++)
+        {
+        if (j == i)
+            continue;
+        struct slRef *otherEquivList = equivRegionsB[j];
+        struct slRef *plEquiv;
+        for (plEquiv = plEquivList;  plEquiv != NULL;  plEquiv = plEquiv->next)
+            {
+            struct slRef *otherEquiv;
+            for (otherEquiv = otherEquivList;  otherEquiv;  otherEquiv = otherEquiv->next)
+                {
+                // We can use pointer equality because getEquivRegions assigns same list
+                // pointer to each region in list.
+                if (plEquiv->val == otherEquiv->val)
+                    {
+                    foundMatch = TRUE;
+                    break;
+                    }
+                }
+            }
+        }
+    if (! foundMatch)
+        return FALSE;
+    }
+return TRUE;
+}
+
+static boolean detectMultiMappers(struct slRef *placements, struct lm *lm)
+/* Return TRUE if placements map to more than one genomic locus, aside from legitimate cases
+ * like the PAR regions on X and Y, or alt/fix sequences and their corresponding chromosomal
+ * locations. */
+{
+struct hash *equivRegions = getEquivRegions();
+int plCount = slCount(placements);
+if (equivRegions && plCount > 1)
+    {
+    // Get a bed3 list of placements and make sure there aren't multiple mappings on the same chrom.
+    struct bed3 *bedList = bed3ListFromPlRef(placements, lm);
+    slSort(&bedList, bedCmp);
+    if (!uniqChroms(bedList))
+        return TRUE;
+    // Gather up equivRegions lists for all placements.  If any placement has no equivRegions,
+    // then we're done.
+    struct slRef *plEquivRegions[plCount];
+    struct bed3 *bed;
+    int i;
+    for (i = 0, bed = bedList;  bed != NULL;  bed = bed->next, i++)
+        {
+        struct slRef *listOfLists = equivRegionFind(equivRegions, bed, lm);
+        if (listOfLists == NULL)
+            return TRUE;
+        plEquivRegions[i] = listOfLists;
+        }
+    if (!allMappingsHaveMatch(plEquivRegions, plEquivRegions, plCount))
+        {
+        // Sometimes the test for equivalent regions fails because all regions point to a
+        // common location -- but that location does not have a mapping, so its suite of
+        // regions are not in the mix.  For example, a variant is mapped to 4 alts of the
+        // same chrom -- but not to the chrom itself, so all regions point to the main chrom,
+        // but none of the 4 mappings' regions are in common with each other.
+        // So try harder before giving up: add equivalent regions that overlap original equivs.
+        struct slRef *plExpandedEquivs[plCount];
+        for (i = 0, bed = bedList;  bed != NULL;  bed = bed->next, i++)
+            {
+            struct slRef *expandedEquivs = NULL;
+            struct slRef *plEquiv;
+            for (plEquiv = plEquivRegions[i];  plEquiv != NULL;  plEquiv = plEquiv->next)
+                {
+                struct slRef *setRef;
+                for (setRef = plEquiv;  setRef != NULL;  setRef = setRef->next)
+                    {
+                    struct bed3 *set = setRef->val;
+                    struct bed3 *region;
+                    for (region = set;  region != NULL;  region = region->next)
+                        {
+                        if (differentString(bed->chrom, region->chrom))
+                            expandedEquivs = slCat(equivRegionFind(equivRegions, region, lm),
+                                                   expandedEquivs);
+                        }
+                    }
+                }
+            plExpandedEquivs[i] = expandedEquivs;
+            }
+        if (!allMappingsHaveMatch(plEquivRegions, plExpandedEquivs, plCount))
+            return TRUE;
+        }
+    }
+return FALSE;
+}
+
+static void dyStringPrintSlNameList(struct dyString *dy, struct slName *list, char *sep)
+/* Append list of names with separator to dy; separator appears after every item. Skip empty names. */
+{
+struct slName *item;
+for (item = list;  item != NULL;  item = item->next)
+    {
+    if (isNotEmpty(item->name))
+        {
+        dyStringAppend(dy, item->name);
+        dyStringAppend(dy, sep);
+        }
+    }
+}
+
+static void dyStringPrintSlIntList(struct dyString *dy, struct slInt *list, char *sep)
+/* Append list of names with separator to dy; separator appears after every item. */
+{
+struct slInt *item;
+for (item = list;  item != NULL;  item = item->next)
+    {
+    dyStringPrintf(dy, "%d", item->val);
+    dyStringAppend(dy, sep);
+    }
+}
+
+static void appendFrequencies(struct sharedProps *props, struct dyString *dy)
+/* Condense the frequency data in props into an autoSql array of compact strings and append to dy.
+ * Also add autoSql int array of total_count because those may vary between variants. */
+{
+dyStringPrintf(dy, "%d\t", props->freqSourceCount);
+int ix;
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    boolean freqIsRc = props->freqIsRc[ix];
+    struct alObs *obs;
+    for (obs = props->freqSourceObs[ix];  obs != NULL;  obs = obs->next)
+        {
+        if (obs != props->freqSourceObs[ix])
+            dyStringAppendC(dy, '|');
+        if (freqIsRc)
+            // Store all frequencies on + strand of "preferred top level placement"
+            reverseComplement(obs->allele, strlen(obs->allele));
+        dyStringPrintf(dy, "%s:%d", obs->allele, obs->obsCount);
+        if (freqIsRc)
+            // rc again to restore original values
+            reverseComplement(obs->allele, strlen(obs->allele));
+        }
+    dyStringAppendC(dy, ',');
+    }
+dyStringAppendC(dy, '\t');
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    if (props->freqSourceObs[ix] != NULL)
+        dyStringPrintf(dy, "%d,", props->freqSourceObs[ix]->totalCount);
+    else
+        dyStringAppend(dy, "0,");
+    }
+}
+
+static void writeDetails(struct sharedProps *props, struct dyString *dy, FILE *outPropsF)
+/* Write props out as tab-sep line to outPropsF. */
+{
+dyStringClear(dy);
+dyStringPrintf(dy, "%s\t", props->name);
+appendFrequencies(props, dy);
+dyStringPrintf(dy, "\t%d\t", slCount(props->soTerms));
+dyStringPrintSlIntList(dy, props->soTerms, ",");
+dyStringPrintf(dy, "\t%d\t", slCount(props->clinVarAccs));
+dyStringPrintSlNameList(dy, props->clinVarAccs, ",");
+dyStringAppendC(dy, '\t');
+dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
+dyStringPrintf(dy, "\t%d\t", slCount(props->submitters));
+dyStringPrintSlNameList(dy, props->submitters, ",");
+dyStringPrintf(dy, "\t%d\t", slCount(props->pubMedIds));
+dyStringPrintSlIntList(dy, props->pubMedIds, ",");
+dyStringAppendC(dy, '\n');
+fputs(dy->string, outPropsF);
+}
+
+static void dumpLineToFile(char *line, FILE *jsonF)
+/* Write line to jsonF, making sure it ends in \n. */
+{
+fputs(line, jsonF);
+if (line[strlen(line)-1] != '\n')
+    fputc('\n', jsonF);
+}
+
+static void updateSeqIsRc(struct slRef *placements, struct lm *lm)
+/* Note is_aln_opposite_orientation flag for each mapped sequence, regardless of whether it's
+ * in the assembly that we're working on; later we can look up sequences from freq reports. */
+{
+hashIntReset(seqIsRc);
+struct slRef *plRef;
+for (plRef = placements;  plRef != NULL;  plRef = plRef->next)
+    {
+    char *seqId = jsonQueryString(plRef->val, "placement", "seq_id", lm);
+    boolean isRc = jsonQueryBoolean(plRef->val, "placement",
+                                    "placement_annot.is_aln_opposite_orientation", FALSE, lm);
+    struct hashEl *hel = hashLookup(seqIsRc, seqId);
+    if (hel)
+        {
+        if (isRc != ptToInt(hel->val))
+            errAbort("updateSeqIsRc: sequence '%s' has both true and false for "
+                     "placement_annot.is_aln_opposite_orientation", seqId);
+        }
+    else
+        hashAddInt(seqIsRc, seqId, isRc);
+    }
+}
+
+struct perAssembly
+// For each assembly whose mappings we're extracting from JSON, we need the name,
+// a jsonQuery path to extract just the mappings for that assembly, and an output file.
+    {
+    struct perAssembly *next;
+    char *name;                  // assembly_name e.g. GRCh38.p12
+    char *placementPath;         // jsonQuery path to placements on assembly
+    char *plIsTopLevelPath;      // jsonQuery path to find whether placement is top level in assembly
+    FILE *outF;                  // bigDbSnp output file
+    };
+
+static void perAssemblyFree(struct perAssembly **pPa)
+/* Close *pPa's outF and free mem. */
+{
+if (pPa)
+    {
+    struct perAssembly *pa = *pPa;
+    freeMem(pa->name);
+    carefulClose(&pa->outF);
+    freeMem(pa->placementPath);
+    freeMem(pa->plIsTopLevelPath);
+    freez(pPa);
+    }
+}
+
+static void perAssemblyFreeList(struct perAssembly **pPaList)
+/* Close & free a list of perAssembly. */
+{
+if (pPaList)
+    {
+    struct perAssembly *el, *next;
+    for (el = *pPaList;  el != NULL;  el = next)
+        {
+        next = el->next;
+        perAssemblyFree(&el);
+        }
+    }
+}
+
+static void writeMerged(struct jsonElement *top, FILE *outMergedF, struct lm *lm)
+/* If any merged IDs are given, write the mappings out to outMergedF. */
+{
+struct slName *mergedIds = jsonQueryStrings(top, "top", "dbsnp1_merges[*].merged_rsid", lm);
+if (mergedIds)
+    {
+    char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
+    struct slName *mergedId;
+    for (mergedId = mergedIds;  mergedId != NULL;  mergedId = mergedId->next)
+        fprintf(outMergedF, "rs%s\trs%s\n", mergedId->name, rsId);
+    }
+}
+
+static char *getChrom(char *refSeqAcc, char *assembly)
+/* Look up "refSeqAcc assembly" in ncGrcToChrom; if found, return chrom, otherwise return
+ * refSeqAcc for translation later (e.g. NC_012920.1 liftOver to hg19 chrM).  Don't free ret. */
+{
+char ncGrc[strlen(refSeqAcc) + 1 + strlen(assembly) + 1];
+safef(ncGrc, sizeof ncGrc, "%s %s", refSeqAcc, assembly);
+char *chrom = hashFindVal(ncGrcToChrom, ncGrc);
+return chrom ? chrom : refSeqAcc;
+}
+
+static void parseDbSnpJson(char *line, struct perAssembly *assemblyProps,
+                           struct outStreams *outStreams)
+/* Each line of dbSNP's file contains one JSON blob describing an rs# variant.  Parse JSON,
+ * extract the bits that we want to keep, print out BED+ and accompanying files. */
+{
+struct lm *lm = lmInit(1<<16);
+struct jsonElement *top = jsonParseLm(line, lm);
+struct sharedProps *props = NULL;
+struct bigDbSnp *bds = NULL;
+struct slRef *allPlacements = jsonQueryElement(top, "top",
+                                               "primary_snapshot_data.placements_with_allele[*]",
+                                               lm);
+updateSeqIsRc(allPlacements, lm);
+struct dyString *dyScratch = dyStringNew(0);
+struct perAssembly *ap;
+for (ap = assemblyProps;  ap != NULL;  ap = ap->next)
+    {
+    struct slRef *placements = jsonQueryElement(top, "top", ap->placementPath, lm);
+    // Filter out scaffolds that are independent in some other assembly, but part of a regular
+    // chrom in the current assembly
+    struct slRef *plRef, *plRefNext, *filtered = NULL;
+    for (plRef = placements;  plRef != NULL;  plRef = plRefNext)
+        {
+        plRefNext = plRef->next;
+        struct jsonElement *pl = plRef->val;
+        boolean isTopLevel = jsonQueryBoolean(pl, "placement", ap->plIsTopLevelPath, FALSE, lm);
+        if (isTopLevel)
+            slAddHead(&filtered, plRef);
+        }
+    slReverse(&filtered);
+    placements = filtered;
+    boolean multiMapper = detectMultiMappers(placements, lm);
+    struct errCatch *errCatch = errCatchNew();
+//#*** NOTE: currently in some cases the alleles are incorrect on alt sequences where the
+//#*** alt has a different allele of the same variant (or just different sequence).
+//#*** We detect at least some of those by inconsistent pos/del in alt placement spdis,
+//#*** but who knows, there could be more.  Ultimately we will need to compare assembly
+//#*** sequences... unless/until dbSNP fixes their projection onto alts (JIRA VR-176).
+    if (errCatchStart(errCatch))
+        {
+        for (plRef = placements;  plRef != NULL;  plRef = plRef->next)
+            {
+            struct jsonElement *pl = plRef->val;
+            if (props == NULL)
+                {
+                // Write props only if we have at least one placement.
+                props = extractProps(top, lm);
+                writeDetails(props, dyScratch, outStreams->details);
+                writeMerged(top, outStreams->merged, lm);
+                }
+            bds = parsePlacement(pl, props, multiMapper, dyScratch, lm);
+            bds->chrom = getChrom(bds->chrom, ap->name);
+            bigDbSnpTabOut(bds, ap->outF);
+            // Prevent stale bds in errCatch below:
+            bds = NULL;
+            }
+        }
+    errCatchEnd(errCatch);
+    if (errCatch->gotError)
+        {
+        dumpLineToFile(line, outStreams->failJson);
+        char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
+        char *seqId = bds ? bds->chrom : "NOSEQ";
+        fprintf(outStreams->err, "%s\t%s\t%s", rsId, seqId, errCatch->message->string);
+        }
+    else if (isNotEmpty(errCatch->message->string))
+        warn("%s", errCatch->message->string);
+    errCatchFree(&errCatch);
+    }
+dyStringFree(&dyScratch);
+lmCleanup(&lm);
+}
+
+static struct slName *initFreqSourceOrder()
+/* If -freqSourceOrder option was given, extract source names from the comma-sep list
+ * into globals freqSourceCount and freqSourceOrder[]. */
+{
+char *freqSourceOrderStr = optionVal("freqSourceOrder", NULL);
+struct slName *sources = NULL;
+if (freqSourceOrderStr)
+    {
+    sources = slNameListFromComma(freqSourceOrderStr);
+    freqSourceCount = slCount(sources);
+    AllocArray(freqSourceOrder, freqSourceCount);
+    struct slName *source;
+    int i;
+    for (i = 0, source = sources;  i < freqSourceCount;  i++, source = source->next)
+        freqSourceOrder[i] = source->name;
+    }
+return sources;
+}
+
+static struct perAssembly *getAssemblyProps(char *assemblyList, char *outRoot)
+/* Split comma-sep assemblyList and make placement path and bigDbSnp output file handle for each. */
+{
+struct perAssembly *assemblyProps = NULL;
+struct slName *assembly, *assemblies = slNameListFromComma(assemblyList);
+for (assembly = assemblies;  assembly != NULL;  assembly = assembly->next)
+    {
+    struct perAssembly *ap;
+    AllocVar(ap);
+    ap->name = cloneString(assembly->name);
+    // Fancy path to get all placements that are on the desired assembly.
+    struct dyString *dy = dyStringCreate("primary_snapshot_data.placements_with_allele"
+                                  "[placement_annot.seq_id_traits_by_assembly[*].assembly_name=%s]",
+                                         assembly->name);
+    ap->placementPath = dyStringCannibalize(&dy);
+    dy = dyStringCreate("placement_annot.seq_id_traits_by_assembly[assembly_name=%s].is_top_level",
+                        assembly->name);
+    ap->plIsTopLevelPath = dyStringCannibalize(&dy);
+    char prefix[2048];
+    safef(prefix, sizeof prefix, "%s.%s", outRoot, assembly->name);
+    ap->outF = openOutFile("%s.bigDbSnp", prefix);
+    slAddHead(&assemblyProps, ap);
+    }
+return assemblyProps;
+}
+
+static void parseRefSeqToUcsc(char *refSeqToUcsc, struct hash *ncGrcToChrom,
+                              struct hash *ncToTwoBitChrom)
+/* Extract mappings of {NC_*, GRC*} to chrom and NC_* to {twoBitFile, chrom}. */
+{
+struct lineFile *lf = lineFileOpen(refSeqToUcsc, TRUE);
+char *line;
+while (lineFileNextReal(lf, &line))
+    {
+    char *row[5];
+    int wordCount = chopTabs(line, row);
+    lineFileExpectWords(lf, 4, wordCount);
+    char *acc = row[0];
+    char *assembly = row[1];
+    char *twoBitFile = row[2];
+    char *chrom = row[3];
+    if (!regexMatch(acc, "^N[A-Z]_[0-9]+\\.[0-9]+$"))
+        lineFileAbort(lf, "'%s' does not look like a RefSeq sequence accession "
+                      "(expecting N*_number.number in first column)", acc);
+    if (!startsWith("GRCh", assembly))
+        lineFileAbort(lf, "'%s' does not look like a GRCh* assembly name "
+                      "(expecting GRCh* in second column", assembly);
+    if (!endsWith(twoBitFile, ".2bit"))
+        lineFileAbort(lf, "'%s' does not look like a twoBit file "
+                      "(expecting *.2bit in third column)", twoBitFile);
+    if (!regexMatch(chrom, "^chr[0-9A-Za-z_]+$"))
+        lineFileAbort(lf, "'%s' does not look like an hg* chrom name "
+                      "(expecting 'chr' + alphanumeric/_ in fourth column)", chrom);
+    char accAssembly[strlen(acc) + 1 + strlen(assembly) + 1];
+    safef(accAssembly, sizeof accAssembly, "%s %s", acc, assembly);
+    hashAdd(ncGrcToChrom, accAssembly, cloneString(chrom));
+    char twoBitChrom[strlen(twoBitFile) + 1 + strlen(chrom) + 1];
+    safef(twoBitChrom, sizeof twoBitChrom, "%s:%s", twoBitFile, chrom);
+    hashAdd(ncToTwoBitChrom, acc, cloneString(twoBitChrom));
+    }
+lineFileClose(&lf);
+}
+
+void dbSnpJsonToTab(char *assemblyList, char *refSeqToUcsc, char *jsonFile, char *outRoot)
+/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
+{
+struct slName *sources = initFreqSourceOrder();
+struct lineFile *lf = lineFileOpen(jsonFile, TRUE);
+struct outStreams *outStreams = outStreamsOpen(outRoot);
+struct perAssembly *assemblyProps = getAssemblyProps(assemblyList, outRoot);
+seqIsRc = hashNew(0);
+ncGrcToChrom = hashNew(0);
+ncToTwoBitChrom = hashNew(0);
+ncToSeqWin = hashNew(0);
+parseRefSeqToUcsc(refSeqToUcsc, ncGrcToChrom, ncToTwoBitChrom);
+char *line = NULL;
+while (lineFileNext(lf, &line, NULL))
+    parseDbSnpJson(line, assemblyProps, outStreams);
+lineFileClose(&lf);
+perAssemblyFreeList(&assemblyProps);
+outStreamsClose(&outStreams);
+// Free these up for squeaky-clean valgrind --leak-check
+// [well, except for parseOptions and errCatch getStack/getThreadVars, but whatever]
+hashFree(&seqIsRc);
+hashFree(&ncGrcToChrom);
+hashFree(&ncToTwoBitChrom);
+hashFree(&ncToSeqWin);
+slNameFreeList(&sources);
+}
+
+int main(int argc, char *argv[])
+/* Process command line. */
+{
+optionInit(&argc, argv, options);
+if (argc != 5)
+    usage();
+dbSnpJsonToTab(argv[1], argv[2], argv[3], argv[4]);
+return 0;
+}