19da7bcf56ec86a80554a364d191d31b00d95f3e
angie
  Fri Nov 1 13:51:49 2019 -0700
dbSnpJsonToTab was including refIsMinor multiple times for same variant - fixed.  refs #23283

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index de03bd2..c10f156 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -1,1780 +1,1786 @@
 /* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
 
 /* Copyright (C) 2019 The Regents of the University of California */
 
 #include "common.h"
 #include "basicBed.h"
 #include "bigDbSnp.h"
 #include "binRange.h"
 #include "dnaseq.h"
 #include "dnautil.h"
 #include "errCatch.h"
 #include "hash.h"
 #include "indelShift.h"
 #include "iupac.h"
 #include "jsonQuery.h"
 #include "linefile.h"
 #include "obscure.h"
 #include "options.h"
 #include "regexHelper.h"
 #include "soTerm.h"
 #include "twoBit.h"
 
 void usage()
 /* Explain usage and exit. */
 {
 errAbort(
   "dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated track files\n"
   "usage:\n"
   "   dbSnpJsonToTab assemblyList refSeqToUcsc refsnp-chrN.json[.gz] outRoot\n"
   "assemblyList is a comma-separated set of assembly_name values in JSON, e.g. 'GRCh37.p13,GRCh38.p12'\n"
   "refSeqToUcsc is tab-sep, 4 columns: RefSeq acc (NC_...), assembly_name, twoBitFile, ucscName (chr...)\n"
   "Track files starting with outRoot will be created:\n"
   "  outRoot.<assembly>.bigDbSnp: BED4+ for main dbSNP track in each assembly in assemblyList\n"
   "                               (final columns need to be added later by bedJoinTabOffset)\n"
   "  outRootDetails.tab: tab-separated allele frequency counts and other details (for bedJoinTabOffset)\n"
   "Additional files are written out for error reporting:\n"
   "  outRootFailed.json: lines of JSON input that encountered an error, for debugging/reprocessing\n"
   "  outRootMerged.tab: two columns, obsolete dbSNP rs# ID and current rs# ID\n"
   "  outRootErrors.tab: lines of error info\n"
   "\n"
   "\n"
   "options:\n"
   "   -freqSourceOrder=LIST      LIST is an ordered, comma-sep list of project names (no spaces)\n"
   "                              for frequency data submitted to dbSNP, e.g. '1000genomes,GnomAD'\n"
   "                              If given, then the bigDbSnp minorAlleleFreq array uses the order.\n"
   "                              Otherwise minorAlleleFreq contains only one frequency, from the\n"
   "                              project with the highest sample count reported for that variant.\n"
   "   -equivRegions=FILE         Each line of FILE contains space-separated acc:start:end regions\n"
   "                              that are equivalent, for example PAR1 or PAR2 on X and Y,\n"
   "                              or an alt sequence and the corresponding portion of its chrom.\n"
   "                              These use RefSeq accesions not UCSC chr names, so that multiple\n"
   "                              assemblies can be described in the same file.\n"
   );
 }
 
 // Globals
 int freqSourceCount = 0;
 char **freqSourceOrder = NULL;
 struct hash *seqIsRc = NULL;
 struct hash *ncGrcToChrom = NULL;
 struct hash *ncToTwoBitChrom = NULL;
 struct hash *ncToSeqWin = NULL;
 
 /* Command line validation table. */
 static struct optionSpec options[] = {
     {"freqSourceOrder", OPTION_STRING},
     {"equivRegions", OPTION_STRING},
     {NULL, 0},
 };
 
 struct outStreams
 /* Container for various output files that are not assembly-specific */
     {
     FILE *details;    // outRootDetails.tab
     FILE *failJson;   // outRootFailed.json
     FILE *merged;     // outRootMerged.tab
     FILE *err;        // outRootErrors.tab
     };
 
 static FILE *openOutFile(char *format, char *outRoot)
 /* Open a file for writing, named by format (which must have exactly one %s) and outRoot. */
 {
 struct dyString *name = dyStringCreate(format, outRoot);
 FILE *outF = mustOpen(name->string, "w");
 dyStringFree(&name);
 return outF;
 }
 
 static struct outStreams *outStreamsOpen(char *outRoot)
 /* Open output file handles */
 {
 struct outStreams *outStreams;
 AllocVar(outStreams);
 outStreams->details = openOutFile("%sDetails.tab", outRoot);
 outStreams->failJson = openOutFile("%sFailed.json", outRoot);
 outStreams->merged = openOutFile("%sMerged.tab", outRoot);
 outStreams->err = openOutFile("%sErrors.tab", outRoot);
 return outStreams;
 }
 
 static void outStreamsClose(struct outStreams **pOutStreams)
 /* Close file handles and free struct. */
 {
 if (*pOutStreams != NULL)
     {
     struct outStreams *outStreams = *pOutStreams;
     carefulClose(&outStreams->details);
     carefulClose(&outStreams->failJson);
     carefulClose(&outStreams->merged);
     carefulClose(&outStreams->err);
     freez(pOutStreams);
     }
 }
 
 struct spdiBed
 /* dbSNP 2.0 variant representation: {Sequence, Position, Deletion, Insertion}, expanded to
  * cover ambiguous placement region for indels when applicable. Add chromEnd to get bed3+. */
 {
     struct spdiBed *next;
     char *chrom;           // Sequence ID
     uint chromStart;       // Position: 0-based start
     uint chromEnd;         // Position + number of deleted bases
     char *del;             // Deleted sequence (starting at pos) -- or number of deleted bases
     char *ins;             // Inserted sequence
 };
 
 struct alObs
 // Unit of variant allele frequency observation
     {
     struct alObs *next;
     char *allele;   // Variant allele sequence
     int obsCount;   // Number of chromosomes on which allele was observed by a project
     int totalCount; // Number of chromosomes on which project observed some allele of this variant
     };
 
 struct sharedProps
 /* Properties of a refsnp that are shared by all mappings to all assemblies. */
     {
     // Extracted from JSON:
     char *name;                       // rs# ID
     int freqSourceCount;              // Number of projects reporting allele frequency observations
     struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs
     struct alObs **freqSourceObs;     // Array of per-project observation lists
     char **freqSourceMajorAl;         // Array of per-project major allele
     char **freqSourceMinorAl;         // Array of per-project minor allele
     double *freqSourceMaf;            // Array of per-project minor allele frequencies
     int biggestSourceIx;              // index of freqSourceObs with highest totalCount
     boolean *freqIsRc;                // Per-project flag for sequence isRc relative to ptlp
     struct slInt *soTerms;            // Sequence Ontology functional effect terms
     struct slName *submitters;        // dbSNP submitter handles
     struct slInt *pubMedIds;          // PubMed article citations
     struct slName *clinVarAccs;       // Accessions of ClinVar annotations, i.e. ClinVar RCV*
     struct slName *clinVarSigs;       // Reported ClinVar significance
     enum bigDbSnpClass class;         // {snv, mnv, ins, del, delins, identity}
     enum soTerm maxFuncImpact;        // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic}
     int commonCount;                  // Number of projects reporting minor allele freq >= 1%
     int rareCount;                    // Number of projects reporting minor allele freq < 1%
     };
 
 struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm)
 /* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm.
  * Checking validity of alleles is up to caller, but this does check whether del is a number
  * instead of sequence. */
 {
 struct spdiBed *spdi;
 lmAllocVar(lm, spdi);
 spdi->chrom = seq;
 spdi->chromStart = pos;
 spdi->del = del;
 spdi->ins = ins;
 if (isAllDigits(del))
     spdi->chromEnd = pos + atoi(del);
 else
     spdi->chromEnd = spdi->chromStart + strlen(del);
 return spdi;
 }
 
 static boolean seqIsNucleotide(const char *seq)
 /* Return TRUE unless seq contains a character that is not in the IUPAC nucleotide set. */
 {
 if (seq == NULL)
     return FALSE;
 const char *p;
 for (p = seq;  *p != '\0';  p++)
     {
     if (!isIupac(*p))
         return FALSE;
     }
 return TRUE;
 }
 
 static struct spdiBed *parseSpdis(struct slRef *spdiList, char *name, struct lm *lm)
 /* Make sure that all SPDI objects have the consistent seq_id, deleted_sequence and position. */
 {
 struct spdiBed *spdiBList = NULL;
 char *firstSeq = NULL;
 int firstPos = -1;
 char *firstDel = NULL;
 struct slRef *spdiRef;
 for (spdiRef = spdiList;  spdiRef != NULL;  spdiRef = spdiRef->next)
     {
     struct jsonElement *spdiEl = spdiRef->val;
     char *seq = jsonQueryString(spdiEl, "spdi", "seq_id", lm);
     int pos = jsonQueryInt(spdiEl, "spdi", "position", -1, lm);
     char *del = jsonQueryString(spdiEl, "spdi", "deleted_sequence", lm);
     char *ins = jsonQueryString(spdiEl, "spdi", "inserted_sequence", lm);
     if (seq == NULL)
         errAbort("Missing or null seq_id in spdi for %s", name);
     if (pos < 0)
         errAbort("Invalid or missing position in spdi for %s", name);
     if (del == NULL)
         errAbort("Missing or null deleted_sequence in spdi for %s", name);
     if (ins == NULL)
         errAbort("Missing or null inserted_sequence in spdi for %s", name);
     if (! seqIsNucleotide(del))
         errAbort("Invalid deleted_sequence '%s' in spdi for %s", del, name);
     if (! seqIsNucleotide(ins))
         errAbort("Invalid inserted_sequence '%s' in spdi for %s", ins, name);
     if (firstSeq == NULL)
         {
         // First spdi
         firstSeq = seq;
         firstDel = del;
         firstPos = pos;
         }
     else
         {
         if (differentString(firstSeq, seq))
             errAbort("Mismatching seq_id ('%s' vs '%s') in spdis for %s",
                      firstSeq, seq, name);
         if (differentString(firstDel, del))
             errAbort("Mismatching deleted sequence ('%s' vs '%s') in %s spdis for %s",
                      firstDel, del, seq, name);
         if (firstPos != pos)
             errAbort("Mismatching position (%d vs %d) in %s spdis for %s",
                      firstPos, pos, seq, name);
         }
     slAddHead(&spdiBList, spdiBedNewLm(seq, pos, del, ins, lm));
     }
 slReverse(&spdiBList);
 return spdiBList;
 }
 
 static enum bigDbSnpClass varTypeToClass(char *varType)
 /* Convert JSON variant_type to bigDbSnp enum form. */
 {
 if (sameString(varType, "snv"))
     return bigDbSnpSnv;
 else if (sameString(varType, "mnv"))
     return bigDbSnpMnv;
 else if (sameString(varType, "ins"))
     return bigDbSnpIns;
 else if (sameString(varType, "del"))
     return bigDbSnpDel;
 else if (sameString(varType, "delins"))
     return bigDbSnpDelins;
 else if (sameString(varType, "identity"))
     return bigDbSnpIdentity;
 else
     errAbort("varTypeToClass: unrecognized variant_type '%s'.", varType);
 return bigDbSnpSnv;  // we never get here; appease gcc
 }
 
 static int freqSourceToIx(char *source, struct slName **pSourceList, struct lm *lm)
 /* Convert source name to index using freqSourceOrder if defined, otherwise sourceList
  * (expand sourceList if source is not already in the list). */
 {
 int ix = -1;
 if (freqSourceOrder)
     {
     ix = stringArrayIx(source, freqSourceOrder, freqSourceCount);
     if (ix < 0)
         errAbort("freqSourceOrder does not contains '%s'", source);
     }
 else
     {
     ix = slNameFindIx(*pSourceList, source);
     if (ix < 0)
         {
         ix = slCount(*pSourceList);
         slAddTail(pSourceList, lmSlName(lm, source));
         }
     }
 return ix;
 }
 
 static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource,
                                 struct spdiBed **retSpdiB, struct lm *lm)
 /* Parse one allele observation. */
 {
 struct alObs *obs = NULL;
 lmAllocVar(lm, obs);
 char *source = jsonQueryString(obsEl, "alObs", "study_name", lm);
 if (source == NULL)
     errAbort("parseAlObs: Missing or null study_name");
 struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm);
 struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm);
 obs->allele = spdiB->ins;
 obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm);
 obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm);
 if (obs->obsCount < 0)
     errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)",
              source, obs->allele);
 if (obs->totalCount < 0)
     errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)",
              source, obs->allele);
 if (obs->totalCount == 0)
     errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)",
              source, obs->allele);
 *retSource = source;
 *retSpdiB = spdiB;
 return obs;
 }
 
 static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm)
 /* If the reference allele was omitted from frequency reports (dbSNP JIRA VR-178), infer the count
  * and add the reference allele to lists of spdis and obs. */
 {
 int ix;
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     if (props->freqSourceSpdis[ix] == NULL)
         continue;
     boolean gotRef = FALSE;
     int sumCounts = 0;
     struct spdiBed *spdiB;
     struct alObs *obs;
     for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
          spdiB != NULL && obs != NULL;
          spdiB = spdiB->next, obs = obs->next)
         {
         if (sameString(spdiB->del, spdiB->ins))
             {
             gotRef = TRUE;
             break;
             }
         sumCounts += obs->obsCount;
         }
     if (!gotRef)
         {
         struct spdiBed *firstSpdiB = props->freqSourceSpdis[ix];
         struct alObs *firstObs = props->freqSourceObs[ix];
         if (sumCounts > firstObs->totalCount)
             errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref "
                      "counts %d", rsId, ix, firstObs->totalCount, sumCounts);
         struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart,
                                              firstSpdiB->del, firstSpdiB->del, lm);
         struct alObs *obs;
         lmAllocVar(lm, obs);
         obs->allele = spdiB->del;
         obs->obsCount = firstObs->totalCount - sumCounts;
         obs->totalCount = firstObs->totalCount;
         slAddHead(&props->freqSourceSpdis[ix], spdiB);
         slAddHead(&props->freqSourceObs[ix], obs);
         }
     }
 }
 
 static void checkFreqSourceObs(struct sharedProps *props, char *rsId)
 /* After frequency allele observations have been sorted into per-project lists,
  * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent.
  * The sum of obsCounts may be less than total_count (no-calls), but not greater. */
 {
 int ix;
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     int totalCount = 0;
     struct alObs *obsList = props->freqSourceObs[ix], *obs;
     for (obs = obsList;  obs != NULL;  obs = obs->next)
         {
         if (! seqIsNucleotide(obs->allele))
             errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId);
         if (obs->obsCount < 0)
             errAbort("checkFreqSourceObs: invalid obsCount %d for %s", obs->obsCount, rsId);
         if (obs->obsCount > obs->totalCount)
             errAbort("checkFreqSourceObs: obsCount %d > totalCount %d for %s",
                      obs->obsCount, obs->totalCount, rsId);
         totalCount += obs->obsCount;
         }
     for (obs = obsList;  obs != NULL;  obs = obs->next)
         {
         if (obs->totalCount < totalCount)
             errAbort("checkFreqSourceObs: obsCounts sum to %d, but total_count is %d for %s",
                      totalCount, obs->totalCount, rsId);
         else if (obs->totalCount != obsList->totalCount)
             errAbort("checkFreqSourceObs: inconsistent totalCount (%d for '%s' != %d for '%s' "
                      "for %s", obs->totalCount, obs->allele, obsList->totalCount, obsList->allele,
                      rsId);
         }
     }
 }
 
 static void parseFrequencies(struct slRef *annotationsRef, struct sharedProps *props,
                              char *rsId, struct lm *lm)
 /* Extract per-project allele frequency count data from JSON.  Note: the alleles are from the
  * reference assembly that the project used; current reference might be different. */
 {
 struct slRef *obsList = jsonQueryElementList(annotationsRef, "annotations", "frequency[*]", lm);
 if (obsList != NULL)
     {
     if (freqSourceOrder)
         {
         props->freqSourceCount = freqSourceCount;
         lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount);
         lmAllocArray(lm, props->freqSourceObs, freqSourceCount);
         lmAllocArray(lm, props->freqIsRc, freqSourceCount);
         }
     else
         {
         props->freqSourceCount = 0;
         int allocSize = slCount(obsList);
         lmAllocArray(lm, props->freqSourceSpdis, allocSize);
         lmAllocArray(lm, props->freqSourceObs, allocSize);
         lmAllocArray(lm, props->freqIsRc, allocSize);
         }
     struct slName *unorderedSourceList = NULL;
     int maxSampleCount = 0;
     int biggestSourceIx = 0;
     struct slRef *obsRef;
     for (obsRef = obsList;  obsRef != NULL;  obsRef = obsRef->next)
         {
         struct jsonElement *obsEl = obsRef->val;
         char *source = NULL;
         struct spdiBed *spdiB = NULL;
         struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm);
         int ix = freqSourceToIx(source, &unorderedSourceList, lm);
         slAddHead(&props->freqSourceSpdis[ix], spdiB);
         slAddHead(&props->freqSourceObs[ix], obs);
         props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1);
         if (props->freqIsRc[ix] < 0)
             errAbort("parseFrequencies: Unknown orientation for %s seq_id %s",
                      rsId, spdiB->chrom);
         if (obs->totalCount > maxSampleCount)
             {
             maxSampleCount = obs->totalCount;
             biggestSourceIx = ix;
             }
         }
     if (!freqSourceOrder)
         props->freqSourceCount = slCount(unorderedSourceList);
     int ix;
     for (ix = 0;  ix < props->freqSourceCount; ix++)
         {
         slReverse(&props->freqSourceSpdis[ix]);
         slReverse(&props->freqSourceObs[ix]);
         }
     addMissingRefAllele(props, rsId, lm);
     checkFreqSourceObs(props, rsId);
     props->biggestSourceIx = biggestSourceIx;
     }
 }
 
 static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm)
 /* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional
  * impact, discard duplicates and return as slInt list with highest impact first. */
 {
 struct slInt *intList = NULL;
 if (soTermNames)
     {
     int termCount = slCount(soTermNames);
     enum soTerm termArray[termCount];
     struct slName *term;
     int i;
     for (i = 0, term = soTermNames;  term != NULL;  i++, term = term->next)
         termArray[i] = soTermStringIdToId(term->name);
     assert(i == termCount);
     qsort(termArray, termCount, sizeof termArray[0], soTermCmp);
     for (i = 0;  i < termCount; i++)
         {
         if (i == 0 || termArray[i] != termArray[i-1])
             {
             struct slInt *intEl;
             lmAllocVar(lm, intEl);
             intEl->val = termArray[i];
             slAddHead(&intList, intEl);
             }
         }
     slReverse(&intList);
     }
 return intList;
 }
 
 struct seqWindow *getChromSeq(struct hash *ncToSeqWin, char *nc, struct hash *ncToTwoBitChrom)
 /* Look up nc in ncToSeqWin.  If not found, look it up in ncToTwoBitChrom, make a seqWin,
  * store it in ncToSeqWin and return it.  Return NULL if neither hash has nc. */
 {
 struct seqWindow *seqWin = hashFindVal(ncToSeqWin, nc);
 if (seqWin == NULL)
     {
     char *twoBitChrom = hashFindVal(ncToTwoBitChrom, nc);
     if (twoBitChrom != NULL)
         {
         struct dnaSeq *dnaSeq = twoBitLoadAll(twoBitChrom);
         if (dnaSeq != NULL)
             {
             // Use memSeqWindow instead of twoBitSeqWindow to avoid keeping a bunch of open
             // twoBitFile handles.
             // Hmm, a dnaSeqWindow would be more efficient, no cloning and strlen...
             // or at the least, memSeqWindowNew could take a size arg.
             seqWin = memSeqWindowNew(twoBitChrom, dnaSeq->dna);
             dnaSeqFree(&dnaSeq);
             hashAdd(ncToSeqWin, nc, seqWin);
             }
         }
     }
 return seqWin;
 }
 
 static boolean maybeExpandRange(struct spdiBed *spdiBTrim, struct seqWindow *seqWin,
                                 struct spdiBed *spdiB, struct alObs *obs, struct lm *lm)
 /* If spdiBTrim is a pure insertion or deletion within a repetitive region,
  * then expand spdiB (and obs if non-NULL) to the full repetitive range and return TRUE.
  * Trim spdiBTrim to minimal representation (see maybeTrimSpdi) before calling this. */
 {
 boolean changed = FALSE;
 uint refTrimStart = spdiBTrim->chromStart;
 uint refTrimEnd = spdiBTrim->chromEnd;
 char *altTrim = spdiBTrim->ins;
 int refTrimLen = refTrimEnd - refTrimStart;
 int altTrimLen = strlen(altTrim);
 if (indelShiftIsApplicable(refTrimLen, altTrimLen))
     {
     // Now shift the minimal representation left and right and compare with spdiB coords.
     uint leftStart = refTrimStart, leftEnd = refTrimEnd;
     char leftAlt[altTrimLen+1];
     safecpy(leftAlt, sizeof leftAlt, altTrim);
     int leftShifted = indelShift(seqWin, &leftStart, &leftEnd, leftAlt,
                                  INDEL_SHIFT_NO_MAX, isdLeft);
     uint rightStart = refTrimStart, rightEnd = refTrimEnd;
     char rightAlt[altTrimLen+1];
     safecpy(rightAlt, sizeof rightAlt, altTrim);
     int rightShifted = indelShift(seqWin, &rightStart, &rightEnd, rightAlt,
                                   INDEL_SHIFT_NO_MAX, isdRight);
     if (leftShifted || rightShifted)
         {
         changed = TRUE;
         // Expand spdiB->del to cover the whole range.,
         int newRefLen = rightEnd - leftStart;
         spdiB->del = lmAlloc(lm, newRefLen+1);
         seqWindowCopy(seqWin, leftStart, newRefLen, spdiB->del, newRefLen+1);
         // Concatenate ref bases to the left, altTrim, ref bases to the right -> new alt
         char refLeft[leftShifted+1];
         seqWindowCopy(seqWin, leftStart, leftShifted, refLeft, sizeof refLeft);
         char refRight[rightShifted+1];
         seqWindowCopy(seqWin, refTrimEnd, rightShifted, refRight, sizeof refRight);
         int newAltLen = leftShifted + altTrimLen + rightShifted;
         spdiB->ins = lmAlloc(lm, newAltLen+1);
         safef(spdiB->ins, newAltLen+1, "%s%s%s", refLeft, altTrim, refRight);
         if (obs)
             obs->allele = spdiB->ins;
         spdiB->chromStart = leftStart;
         spdiB->chromEnd = rightEnd;
         }
     }
 return changed;
 }
 
 static void checkSpdisAfterExpansion(struct spdiBed *spdiBList, struct alObs *obsList)
 /* Check spdiBList and obsList for inconsistencies; if the reference allele spdiB+obs missed out
  * on the expansion then expand them. */
 {
 boolean alreadyExpandedRefAllele = FALSE;
 struct spdiBed *spdiB;
 struct alObs *obs;
 for (spdiB = spdiBList, obs = obsList;
      spdiB != NULL;
      spdiB = spdiB->next, obs = obs->next)
     {
     if (strlen(spdiB->del) != spdiB->chromEnd - spdiB->chromStart)
         errAbort("Inconsistent freq spdi coords & del: %s|%d|%d %s/%s",
                  spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, spdiB->del, spdiB->ins);
     if (spdiB != spdiBList)
         {
         if (differentString(spdiB->chrom, spdiBList->chrom))
             errAbort("Differing freq spdiB chrom: '%s' vs. '%s'",
                      spdiB->chrom, spdiBList->chrom);
         if (differentString(spdiB->del, spdiBList->del))
             {
             // Ref allele can't be shifted, so expand it to alt's shifted range... but only once,
             // so we'll catch differences between alt alleles' expanded ranges.
             struct spdiBed *spdiBRef = NULL, *spdiBAlt = NULL;
             struct alObs *obsRef = NULL;
             if (sameString(spdiBList->del, spdiBList->ins))
                 {
                 spdiBRef = spdiBList;
                 spdiBAlt = spdiB;
                 obsRef = obsList;
                 }
             else if (sameString(spdiB->del, spdiB->ins))
                 {
                 spdiBRef = spdiB;
                 spdiBAlt = spdiBList;
                 obsRef = obs;
                 }
             if (spdiBRef != NULL && !alreadyExpandedRefAllele)
                 {
                 // Copy spdiBAlt's expanded coords and del into spdiBRef
                 spdiBRef->chromStart = spdiBAlt->chromStart;
                 spdiBRef->chromEnd = spdiBAlt->chromEnd;
                 spdiBRef->del = spdiBAlt->del;
                 // The reference allele's ins is the same as its del, and the corresponding
                 // obs->allele must match it.
                 spdiBRef->ins = spdiBRef->del;
                 obsRef->allele = spdiBRef->ins;
                 alreadyExpandedRefAllele = TRUE;
                 }
             else
                 errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s",
                          spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd,
                          spdiBList->del, spdiBList->ins,
                          spdiB->chrom, spdiB->chromStart, spdiB->chromEnd,
                          spdiB->del, spdiB->ins);
             }
         }
     }
 }
 
 static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm)
 /* Return a new spdiBed that is the minimal representation of spdiB. */
 {
 struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart,
                                          lmCloneString(lm, spdiB->del),
                                          lmCloneString(lm, spdiB->ins), lm);
 int refTrimLen = 0, altTrimLen = 0;
 trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd,
            &refTrimLen, &altTrimLen);
 return spdiBTrim;
 }
 
 static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom,
                               struct hash *ncToSeqWin, struct lm *lm)
 /* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do
  * not cover the entire repetitive range.  Expand variant alleles on the sequence on which they
  * were originally reported; if there is an indel difference between the reporting assembly and
  * the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt
  * may be different for different assemblies). */
 {
 int sIx;
 for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
     {
     struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx];
     struct alObs *obs, *obsList = props->freqSourceObs[sIx];
     assert(slCount(spdiBList) == slCount(obsList));
     boolean changed = FALSE;
     // For each allele, see if it can be slid around.
     for (spdiB = spdiBList, obs = obsList;
          spdiB != NULL;
          spdiB = spdiB->next, obs = obs->next)
         {
         assert(sameString(obs->allele, spdiB->ins));
         // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly.
         if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) ||
             stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del))
             {
             // First trim ref and alt to their minimal version for indelShift to work with.
             // There may be nothing to trim because they're usually minimal already and
             // stringIn catches false positive like A/TAAC, and that's fine.
             struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
             // If possible, expand the minimal representation to full SPDI range.
             struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom);
             if (seqWin == NULL)
                 errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom);
             changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm);
             }
         }
     if (changed)
         checkSpdisAfterExpansion(spdiBList, obsList);
     }
 }
 
 static double mafFromSource(struct sharedProps *props, int sourceIx,
                             char **retMajorAllele, char **retMinorAllele, struct lm *lm)
 /* Compare allele counts from source and return minor allele frequency (2nd-highest).
  * Set *retMajorAllele to allele with highest count (for comparison with mapped ref alleles).
  * If there are no observations from source, return NAN (divide by zero). */
 {
 char *majorAllele = NULL, *minorAllele = NULL;
 int majorAlleleCount = -1, minorAlleleCount = -1;
 double totalCount = 0;
 struct alObs *obs;
 for (obs = props->freqSourceObs[sourceIx];  obs != NULL;  obs = obs->next)
     {
     if (obs->obsCount > majorAlleleCount)
         {
         minorAllele = majorAllele;
         minorAlleleCount = majorAlleleCount;
         majorAlleleCount = obs->obsCount;
         majorAllele = obs->allele;
         }
     else if (obs->obsCount > minorAlleleCount)
         {
         minorAllele = obs->allele;
         minorAlleleCount = obs->obsCount;
         }
     totalCount = obs->totalCount;
     }
 if (totalCount > 0)
     {
     if (majorAlleleCount < 0)
         errAbort("no major allele observed despite nonzero totalCount...?");
     if (minorAlleleCount < 0)
         errAbort("no minor allele observed... what should the minorAllele value be??");
     }
 *retMajorAllele = lmCloneString(lm, majorAllele);
 *retMinorAllele = lmCloneString(lm, minorAllele);
 // If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test)
 return (double)minorAlleleCount / totalCount;
 }
 
 static void sortFrequencies(struct sharedProps *props, struct lm *lm)
 /* Determine major/minor allele and per-project MAF (if given).
  * If -freqOrder option was given, then fill in props->freqSource* arrays in that order.
  * Otherwise, choose the MAF from the project with the largest sample count as a fallback. */
 {
 if (props->freqSourceCount > 0)
     {
     if (freqSourceOrder != NULL)
         {
         lmAllocArray(lm, props->freqSourceMajorAl, freqSourceCount);
         lmAllocArray(lm, props->freqSourceMinorAl, freqSourceCount);
         lmAllocArray(lm, props->freqSourceMaf, freqSourceCount);
         int sIx;
         for (sIx = 0;  sIx < freqSourceCount;  sIx++)
             {
             props->freqSourceMaf[sIx] = mafFromSource(props, sIx,
                                                       &props->freqSourceMajorAl[sIx],
                                                       &props->freqSourceMinorAl[sIx], lm);
             if (isfinite(props->freqSourceMaf[sIx]))
                 {
                 if (props->freqSourceMaf[sIx] >= 0.01)
                     props->commonCount++;
                 else
                     props->rareCount++;
                 }
             }
         }
     else
         {
         // No freqSourceOrder -- just use source with biggest totalCount for this variant.
         props->freqSourceCount = 1;
         lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount);
         lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount);
         lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount);
         props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx,
                                                 &props->freqSourceMajorAl[0],
                                                 &props->freqSourceMinorAl[0], lm);
         if (isfinite(props->freqSourceMaf[0]))
             {
             if (props->freqSourceMaf[0] >= 0.01)
                 props->commonCount++;
             else
                 props->rareCount++;
             }
         }
     }
 }
 
 static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm)
 /* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */
 {
 struct sharedProps *props = NULL;
 lmAllocVar(lm, props);
 char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm);
 char rsName[strlen(rsNumber)+3];
 safef(rsName, sizeof rsName, "rs%s", rsNumber);
 props->name = lmCloneString(lm, rsName);
 char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm);
 props->class = varTypeToClass(varType);
 struct slRef *annotationsRef = jsonQueryElement(top, "top",
                                                  "primary_snapshot_data.allele_annotations[*]", lm);
 parseFrequencies(annotationsRef, props, rsName, lm);
 spdiNormalizeFreq(props, ncToTwoBitChrom, ncToSeqWin, lm);
 sortFrequencies(props, lm);
 struct slName *soTermNames = jsonQueryStringList(annotationsRef, "annotations",
                                                  "assembly_annotation[*].genes[*].rnas[*]"
                                                  ".sequence_ontology[*].accession", lm);
 soTermNames = slCat(soTermNames,
                     jsonQueryStringList(annotationsRef, "annotations",
                                         "assembly_annotation[*].genes[*].rnas[*]"
                                         ".protein.sequence_ontology[*].accession", lm));
 props->soTerms = soTermStringIdToIdList(soTermNames, lm);
 props->maxFuncImpact = props->soTerms ? props->soTerms->val : soUnknown;
 props->submitters = jsonQueryStrings(top, "top",
                                     "primary_snapshot_data.support[id.type=subsnp].submitter_handle",
                                         lm);
 slUniqify(&props->submitters, slNameCmp, NULL);
 props->pubMedIds = jsonQueryInts(top, "top", "citations[*]", lm);
 props->clinVarAccs = jsonQueryStringList(annotationsRef, "annotations",
                                           "clinical[*].accession_version", lm);
 props->clinVarSigs = jsonQueryStringList(annotationsRef, "annotations",
                                           "clinical[*].clinical_significances[*]", lm);
 return props;
 }
 
 static void setBdsFreqData(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds,
                            struct dyString *dyUcscNotes, struct lm *lm)
 /* Use isRc, props to fill in bds->majorAllele, ->minorAllele, determine whether ref is maj/min.
  * Add notes to dyUcscNotes if applicable. */
 {
 if (props->freqSourceCount > 0)
     {
     bds->freqSourceCount = props->freqSourceCount;
     lmAllocArray(lm, bds->majorAllele, props->freqSourceCount);
     lmAllocArray(lm, bds->minorAllele, props->freqSourceCount);
     lmAllocArray(lm, bds->minorAlleleFreq, props->freqSourceCount);
     char *firstMajorAllele = NULL;
+    boolean refIsMinor = FALSE;
+    boolean diffMajor = FALSE;
     int sIx;
     for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
         {
         boolean freqIsRc = props->freqIsRc[sIx];
         bds->minorAlleleFreq[sIx] = props->freqSourceMaf[sIx];
         bds->majorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMajorAl[sIx]));
         bds->minorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMinorAl[sIx]));
         if (isfinite(bds->minorAlleleFreq[sIx]))
             {
             if (isRc != freqIsRc)
                 {
                 reverseComplement(bds->majorAllele[sIx], strlen(bds->majorAllele[sIx]));
                 reverseComplement(bds->minorAllele[sIx], strlen(bds->minorAllele[sIx]));
                 }
             char *majorAllele = bds->majorAllele[sIx];
             if (differentString(bds->ref, majorAllele))
-                dyStringAppend(dyUcscNotes, bdsRefIsMinor ",");
+                refIsMinor = TRUE;
             if (firstMajorAllele == NULL)
                 firstMajorAllele = majorAllele;
             else if (majorAllele != NULL &&
                      differentString(firstMajorAllele, majorAllele))
-                dyStringAppend(dyUcscNotes, bdsDiffMajor ",");
+                diffMajor = TRUE;
             }
         }
+    if (refIsMinor)
+        dyStringAppend(dyUcscNotes, bdsRefIsMinor ",");
+    if (diffMajor)
+        dyStringAppend(dyUcscNotes, bdsDiffMajor ",");
     }
 }
 
 static void checkRareRef(struct sharedProps *props, char *ref, boolean isRc,
                          struct dyString *dyUcscNotes)
 /* If frequency counts are reported, test whether the reference has ever been observed,
  * and if so whether it is rare, i.e. < 1%.  If so, add to dyUcscNotes. */
 {
 double rareThreshold = 0.01;
 boolean haveFrequencies = FALSE, refWasObserved = FALSE;
 double maxRefAF = 0.0;
 int ix;
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     if (props->freqSourceObs[ix] != NULL)
         haveFrequencies = TRUE;
     boolean freqIsRc = props->freqIsRc[ix];
     int refLen = strlen(ref);
     char refCpy[refLen+1];
     safecpy(refCpy, sizeof refCpy, ref);
     if (isRc != freqIsRc)
         reverseComplement(refCpy, refLen);
     struct spdiBed *spdiB;
     struct alObs *obs;
     for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
          spdiB != NULL && obs != NULL;
          spdiB = spdiB->next, obs = obs->next)
         {
         if (sameString(refCpy, spdiB->ins))
             {
             if (obs->obsCount > 0)
                 {
                 refWasObserved = TRUE;
                 double refAF = (double)obs->obsCount / (double)obs->totalCount;
                 if (refAF > maxRefAF)
                     maxRefAF = refAF;
                 }
             break;
             }
         }
     if (maxRefAF >= rareThreshold)
         break;
     }
 if (haveFrequencies && maxRefAF < rareThreshold)
     {
     dyStringAppend(dyUcscNotes, bdsRefIsRare ",");
     if (!refWasObserved)
         dyStringAppend(dyUcscNotes, bdsRefIsSingleton ",");
     }
 }
 
 static enum bigDbSnpClass bigDbSnpClassFromAlleles(struct bigDbSnp *bds)
 /* Return what we expect bigDbSnpClass to be based on allele lengths. */
 {
 enum bigDbSnpClass alleleClass = 0;
 if (bds->altCount == 0)
     alleleClass = bigDbSnpIdentity;
 else
     {
     int refLen = strlen(bds->ref);
     boolean altLenZero = TRUE;
     boolean altLenLtRef = FALSE;
     boolean altLenEqRef = FALSE;
     boolean altLenGtRef = FALSE;
     int i;
     for (i = 0;  i < bds->altCount;  i++)
         {
         int altLen = strlen(bds->alts[i]);
         if (altLen < refLen)
             altLenLtRef = TRUE;
         else if (altLen == refLen)
             altLenEqRef = TRUE;
         else if (altLen > refLen)
             altLenGtRef = TRUE;
         if (altLen > 0)
             altLenZero = FALSE;
         }
     if (!altLenLtRef &&  altLenEqRef && !altLenGtRef)
         {
         if (refLen == 1)
             alleleClass = bigDbSnpSnv;
         else
             alleleClass = bigDbSnpMnv;
         }
     else if (bds->altCount == 0)
         alleleClass = bigDbSnpIdentity;
     else if (refLen == 0)
         alleleClass = bigDbSnpIns;
     else if (altLenZero)
         alleleClass = bigDbSnpDel;
     else
         alleleClass = bigDbSnpDelins;
     }
 return alleleClass;
 }
 
 static void checkIndelPlacement(struct bigDbSnp *bds, struct seqWindow *seqWin,
                                 struct lm *lm)
 /* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that
  * bds's SPDI-based coords cover the whole repetitive region as expected. */
 {
 int i;
 for (i = 0;  i < bds->altCount;  i++)
     {
     char *alt = bds->alts[i];
     if (isEmpty(bds->ref) || isEmpty(alt) ||
         stringIn(bds->ref, alt) || stringIn(alt, bds->ref))
         {
         // First trim ref and alt to their minimal version for indelShift to work with.
         // There may be nothing to trim because they're usually minimal already and
         // stringIn catches false positive like A/TAAC, and that's fine.
         struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm);
         struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
 
         maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm);
         if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd)
             errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') "
                      "could be expanded to %s|%d|%d\n",
                      bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt,
                      bds->chrom, spdiB->chromStart, spdiB->chromEnd);
         }
     }
 }
 
 static void addClinVarSigs(struct dyString *dyUcscNotes, struct sharedProps *props)
 /* If clinVarSigs indicate benign, pathogenic, or both (conflicting), add ucscNote. */
 {
 boolean isBenign = FALSE, isPathogenic = FALSE;
 struct slName *sig;
 for (sig = props->clinVarSigs;  sig != NULL;  sig = sig->next)
     {
     if (sameString(sig->name, "likely-benign") ||
              sameString(sig->name, "benign") ||
              sameString(sig->name, "benign-likely-benign"))
         {
         isBenign = TRUE;
         }
     else if (sameString(sig->name, "pathogenic") ||
              sameString(sig->name, "likely-pathogenic") ||
              sameString(sig->name, "pathogenic-likely-pathogenic"))
         {
         isPathogenic = TRUE;
         }
     else if (sameString(sig->name, "conflicting-interpretations-of-pathogenicity"))
         {
         isBenign = TRUE;
         isPathogenic = TRUE;
         break;
         }
     }
 if (isBenign && isPathogenic)
     dyStringAppend(dyUcscNotes, bdsClinvarConflicting ",");
 else if (isBenign)
     dyStringAppend(dyUcscNotes, bdsClinvarBenign ",");
 else if (isPathogenic)
     dyStringAppend(dyUcscNotes, bdsClinvarPathogenic ",");
 }
 
 static boolean delMismatchesGenome(struct bigDbSnp *bds, struct seqWindow *seqWin)
 /* Return TRUE if bds->ref (spdi del) does not match assembly sequence at bds coords.
  * Sometimes the genome has an N and dbSNP has a more specific IUPAC character.
  * errAbort if there's some other kind of inconsistency. */
 {
 int refLen = strlen(bds->ref);
 uint refStart = bds->chromStart;
 uint refEnd = bds->chromEnd;
 if (refLen != refEnd - refStart)
     errAbort("Inconsistent ref and coords for %s: ref is '%s' (%d bases) "
              "but position is %s|%d|%d (%d bases)",
              bds->name, bds->ref, refLen, bds->chrom, refStart, refEnd, refEnd - refStart);
 char genomeRef[refLen+1];
 seqWindowCopy(seqWin, refStart, refLen, genomeRef, sizeof genomeRef);
 if (differentString(genomeRef, bds->ref))
     {
     if (strchr(genomeRef, 'N'))
         return TRUE;
     else
         errAbort("bds->ref '%s' mismatches genome sequence '%s'", bds->ref, genomeRef);
     }
 return FALSE;
 }
 
 static boolean hasAmbiguousFreqAllele(struct sharedProps *props)
 /* Return TRUE if any allele from any frequency source contains an IUPAC ambiguous character. */
 {
 int sIx;
 for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
     {
     struct alObs *obs;
     for (obs = props->freqSourceObs[sIx];  obs != NULL;  obs = obs->next)
         if (anyIupac(obs->allele))
             return TRUE;
     }
 return FALSE;
 }
 
 static boolean freqHasNonRefAltAllele(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds)
 /* Return TRUE if any allele from any frequency source does not match any ref/alt (SPDI) allele. */
 {
 int sIx;
 for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
     {
     boolean freqIsRc = props->freqIsRc[sIx];
     struct alObs *obs;
     for (obs = props->freqSourceObs[sIx];  obs != NULL;  obs = obs->next)
         {
         int alLen = strlen(obs->allele);
         char alCpy[alLen+1];
         safecpy(alCpy, sizeof alCpy, obs->allele);
         if (freqIsRc != isRc)
             reverseComplement(alCpy, alLen);
         boolean foundIt = FALSE;
         if (sameString(alCpy, bds->ref))
             foundIt = TRUE;
         else
             {
             int i;
             for (i = 0;  i < bds->altCount;  i++)
                 if (sameString(alCpy, bds->alts[i]))
                     {
                     foundIt = TRUE;
                     break;
                     }
             }
         if (!foundIt)
             return TRUE;
         }
     }
 return FALSE;
 }
 
 static void addUcscNotes(struct sharedProps *props, boolean isRc, boolean isMultiMapper,
                          struct bigDbSnp *bds, struct seqWindow *seqWin,
                          struct dyString *dyUcscNotes, struct lm *lm)
 /* Record interesting conditions if applicable. */
 {
 if (props->class != bigDbSnpClassFromAlleles(bds))
     dyStringAppend(dyUcscNotes, bdsClassMismatch ",");
 if (props->clinVarAccs != NULL)
     dyStringAppend(dyUcscNotes, bdsClinvar ",");
 addClinVarSigs(dyUcscNotes, props);
 if (props->commonCount > 0)
     {
     dyStringAppend(dyUcscNotes, bdsCommonSome ",");
     if (props->rareCount == 0)
         dyStringAppend(dyUcscNotes, bdsCommonAll ",");
     else
         dyStringAppend(dyUcscNotes, bdsRareSome ",");
     }
 else if (props->rareCount > 0 || props->freqSourceCount == 0)
     {
     dyStringAppend(dyUcscNotes, bdsRareSome ",");
     dyStringAppend(dyUcscNotes, bdsRareAll ",");
     }
 if (isRc)
     dyStringAppend(dyUcscNotes, bdsRevStrand ",");
 if (isMultiMapper)
     dyStringAppend(dyUcscNotes, bdsMultiMap ",");
 checkRareRef(props, bds->ref, isRc, dyUcscNotes);
 if (delMismatchesGenome(bds, seqWin))
     dyStringAppend(dyUcscNotes, bdsRefMismatch ",");
 if (anyIupac(bds->ref))
     dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ",");
 int i;
 for (i = 0;  i < bds->altCount;  i++)
     if (anyIupac(bds->alts[i]))
         dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ",");
 if (hasAmbiguousFreqAllele(props))
     dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ",");
 if (freqHasNonRefAltAllele(props, isRc, bds))
     dyStringAppend(dyUcscNotes, bdsFreqNotRefAlt ",");
 bds->ucscNotes = lmCloneString(lm, dyUcscNotes->string);
 }
 
 static void calcShiftBases(struct bigDbSnp *bds)
 /* Return the max length of uncertain placement from all alt alleles. */
 {
 int maxBasesTrimmed = 0;
 int refLen = bds->chromEnd - bds->chromStart;
 char refTrim[refLen+1];
 int i;
 for (i = 0;  i < bds->altCount;  i++)
     {
     // Make copies of ref and alt, trim the copies, measure the change.
     int refTrimLen = refLen;
     safecpy(refTrim, sizeof refTrim, bds->ref);
     int altTrimLen = strlen(bds->alts[i]);
     char altTrim[altTrimLen+1];
     safecpy(altTrim, sizeof altTrim, bds->alts[i]);
     uint start = bds->chromStart, end = bds->chromEnd;
     trimRefAlt(refTrim, altTrim, &start, &end, &refTrimLen, &altTrimLen);
     int basesTrimmed = refLen - refTrimLen;
     if (basesTrimmed > maxBasesTrimmed)
         maxBasesTrimmed = basesTrimmed;
     if (maxBasesTrimmed >= refLen)
         break;
     }
 bds->shiftBases = min(refLen, maxBasesTrimmed);
 }
 
 static struct bigDbSnp *parsePlacement(struct jsonElement *pl, struct sharedProps *props,
                                        boolean isMultiMapper, struct dyString *dy, struct lm *lm)
 /* Return a bigDbSnp with information about one placement (mapping). */
 {
 struct bigDbSnp *bds;
 lmAllocVar(lm, bds);
 struct slRef *allSpdiRef = jsonQueryElement(pl, "placement", "alleles[*].allele.spdi", lm);
 bds->chrom = jsonQueryString(pl, "placement", "seq_id", lm);
 if (bds->chrom == NULL)
     errAbort("Missing or NULL seq_id for %s", props->name);
 struct spdiBed *spdiBList = parseSpdis(allSpdiRef, props->name, lm);
 if (!sameString(bds->chrom, spdiBList->chrom))
     errAbort("seq_id mismatch: placement has '%s' but spdi has '%s for %s'",
              bds->chrom, spdiBList->chrom, props->name);
 bds->chromStart = spdiBList->chromStart;
 bds->chromEnd = spdiBList->chromEnd;
 bds->name = props->name;
 bds->ref = spdiBList->del;
 // spdiBList may or may not include no-change allele (del == ins)
 lmAllocArray(lm, bds->alts, slCount(spdiBList));
 struct spdiBed *spdiB;
 for (spdiB = spdiBList;  spdiB != NULL;  spdiB = spdiB->next)
     {
     if (!sameString(bds->ref, spdiB->ins))
         bds->alts[bds->altCount++] = spdiB->ins;
     }
 calcShiftBases(bds);
 boolean isRc = jsonQueryBoolean(pl, "placement",
                                 "placement_annot.is_aln_opposite_orientation", FALSE, lm);
 dyStringClear(dy);
 setBdsFreqData(props, isRc, bds, dy, lm);
 bds->maxFuncImpact = props->maxFuncImpact;
 bds->class = props->class;
 struct seqWindow *seqWin = getChromSeq(ncToSeqWin, bds->chrom, ncToTwoBitChrom);
 checkIndelPlacement(bds, seqWin, lm);
 addUcscNotes(props, isRc, isMultiMapper, bds, seqWin, dy, lm);
 return bds;
 }
 
 static void equivRegionAdd(struct hash *equivRegions, struct bed3 *region, struct bed3 *regionList)
 /* Associate regionList with region by looking up the binKeeper for region->chrom in equivRegions
  * and adding regionList to the binKeeper for region start & end. */
 {
 struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
 if (bk == NULL)
     {
     bk = binKeeperNew(0, BINRANGE_MAXEND_512M);
     hashAdd(equivRegions, region->chrom, bk);
     }
 binKeeperAdd(bk, region->chromStart, region->chromEnd, regionList);
 }
 
 static struct hash *getEquivRegions()
 /* If not done already, read -equivRegions file if specified and parse it into data structure
  * of equivalent regions for detecting legitimate cases of multiple mappings. */
 {
 static struct hash *equivRegions = NULL;
 static boolean alreadyInited = FALSE;
 if (!alreadyInited)
     {
     char *equivRegionsFile = optionVal("equivRegions", NULL);
     if (isNotEmpty(equivRegionsFile))
         {
         equivRegions = hashNew(0);
         struct lineFile *lf = lineFileOpen(equivRegionsFile, TRUE);
         char *line;
         while (lineFileNextReal(lf, &line))
             {
             // Build a list of equivalent regions; hash each region->chrom to list.
             char *words[128];
             int wordCount = chopLine(line, words);
             if (wordCount >= ArraySize(words))
                 lineFileAbort(lf, "getEquivRegions: too many words per line (%d)", wordCount);
             struct bed3 *regionList = NULL;
             int i;
             for (i = 0;  i < wordCount;  i++)
                 {
                 char *chrom = cloneString(words[i]);
                 char *colon = strchr(chrom, ':');
                 if (colon)
                     {
                     char *startStr = colon+1;
                     char *nextColon = strchr(startStr, ':');
                     if (! nextColon)
                         lineFileAbort(lf, "getEquivRegions: expected each word to have "
                                       "0 or 2 colons, but got '%s'", words[i]);
                     *colon = '\0';
                     *nextColon = '\0';
                     char *endStr = nextColon+1;
                     if (!isAllDigits(startStr) || !isAllDigits(endStr))
                         lineFileAbort(lf, "getEquivRegions: expected a number following each colon, "
                                       "but got '%s'", words[i]);
                     int start = atoi(startStr);
                     int end = atoi(endStr);
                     slAddHead(&regionList, bed3New(chrom, start, end));
                     }
                 else
                     slAddHead(&regionList, bed3New(chrom, 0, 0));
                 }
             // Sort so that a main chromosome goes before alts/fixes.
             slSort(&regionList, bedCmp);
             // Associate all regions with the same list.  we can use pointer equality later.
             struct bed3 *region;
             for (region = regionList;  region != NULL;  region = region->next)
                 equivRegionAdd(equivRegions, region, regionList);
             }
         lineFileClose(&lf);
         }
     else
         warn("-equivRegions=FILE not specified; will not be able to detect variants "
              "that are mapped to multiple non-equivalent regions.");
     alreadyInited = TRUE;
     }
 return equivRegions;
 }
 
 static struct slRef *equivRegionFind(struct hash *equivRegions, struct bed3 *region,
                                      struct lm *lm)
 /* Return ref list of equivalent region lists that overlap region. */
 {
 struct binElement *beList = NULL;
 struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
 if (bk)
     {
     int start = region->chromStart;
     int end = region->chromEnd;
     // binKeeperFind can't find overlap with 0-length insertions (start == end) so expand those:
     if (start == end)
         {
         start--;
         end++;
         }
     beList = binKeeperFind(bk, start, end);
     }
 struct slRef *refList = NULL;
 struct binElement *binEl;
 for (binEl = beList;  binEl != NULL;  binEl = binEl->next)
     lmRefAdd(lm, &refList, binEl->val);
 slReverse(&refList);
 slFreeList(&beList);
 return refList;
 }
 
 static struct bed3 *bed3FromPlRef(struct slRef *plRef, struct lm *lm)
 /* Extract a bed3 from the first SPDI found in the first placement in plRef. */
 {
 struct jsonElement *pl = plRef->val;
 struct slRef *spdiRef = jsonQueryElement(pl, "placement", "alleles[0].allele.spdi", lm);
 struct jsonElement *spdiEl = spdiRef->val;
 struct bed3 *region;
 lmAllocVar(lm, region);
 region->chrom = jsonQueryString(spdiEl, "first spdi", "seq_id", lm);
 region->chromStart = jsonQueryInt(spdiEl, "first spdi", "position", -1, lm);
 char *ref = jsonQueryString(spdiEl, "first spdi", "deleted_sequence", lm);
 region->chromEnd = region->chromStart + strlen(ref);
 return region;
 }
 
 static struct bed3 *bed3ListFromPlRef(struct slRef *plRefList, struct lm *lm)
 /* Return a list of bed3 derived from placement SPDIs in plRef. */
 {
 struct bed3 *list = NULL;
 struct slRef *plRef;
 for (plRef = plRefList;  plRef != NULL;  plRef = plRef->next)
     slAddHead(&list, bed3FromPlRef(plRef, lm));
 slReverse(&list);
 return list;
 }
 
 static boolean uniqChroms(struct bed3 *bedList)
 /* Return TRUE if each item of *sorted* bedList has a different chrom. */
 {
 boolean uniq = TRUE;
 if (bedList != NULL)
     {
     struct bed3 *bed;
     for (bed = bedList;  bed->next != NULL;  bed = bed->next)
         if (sameString(bed->chrom, bed->next->chrom))
             {
             uniq = FALSE;
             break;
             }
     }
 return uniq;
 }
 
 static boolean allMappingsHaveMatch(struct slRef *equivRegionsA[], struct slRef *equivRegionsB[],
                                     int plCount)
 // Return true if each placement's list of equivRegion sets in A has at least one set that
 // matches at least one set from some other placement's list in B.  A and B may/may not be the same.
 {
 int i;
 for (i = 0;  i < plCount;  i++)
     {
     struct slRef *plEquivList = equivRegionsA[i];
     boolean foundMatch = FALSE;
     int j;
     for (j = 0;  j < plCount;  j++)
         {
         if (j == i)
             continue;
         struct slRef *otherEquivList = equivRegionsB[j];
         struct slRef *plEquiv;
         for (plEquiv = plEquivList;  plEquiv != NULL;  plEquiv = plEquiv->next)
             {
             struct slRef *otherEquiv;
             for (otherEquiv = otherEquivList;  otherEquiv;  otherEquiv = otherEquiv->next)
                 {
                 // We can use pointer equality because getEquivRegions assigns same list
                 // pointer to each region in list.
                 if (plEquiv->val == otherEquiv->val)
                     {
                     foundMatch = TRUE;
                     break;
                     }
                 }
             }
         }
     if (! foundMatch)
         return FALSE;
     }
 return TRUE;
 }
 
 static boolean detectMultiMappers(struct slRef *placements, struct lm *lm)
 /* Return TRUE if placements map to more than one genomic locus, aside from legitimate cases
  * like the PAR regions on X and Y, or alt/fix sequences and their corresponding chromosomal
  * locations. */
 {
 struct hash *equivRegions = getEquivRegions();
 int plCount = slCount(placements);
 if (equivRegions && plCount > 1)
     {
     // Get a bed3 list of placements and make sure there aren't multiple mappings on the same chrom.
     struct bed3 *bedList = bed3ListFromPlRef(placements, lm);
     slSort(&bedList, bedCmp);
     if (!uniqChroms(bedList))
         return TRUE;
     // Gather up equivRegions lists for all placements.  If any placement has no equivRegions,
     // then we're done.
     struct slRef *plEquivRegions[plCount];
     struct bed3 *bed;
     int i;
     for (i = 0, bed = bedList;  bed != NULL;  bed = bed->next, i++)
         {
         struct slRef *listOfLists = equivRegionFind(equivRegions, bed, lm);
         if (listOfLists == NULL)
             return TRUE;
         plEquivRegions[i] = listOfLists;
         }
     if (!allMappingsHaveMatch(plEquivRegions, plEquivRegions, plCount))
         {
         // Sometimes the test for equivalent regions fails because all regions point to a
         // common location -- but that location does not have a mapping, so its suite of
         // regions are not in the mix.  For example, a variant is mapped to 4 alts of the
         // same chrom -- but not to the chrom itself, so all regions point to the main chrom,
         // but none of the 4 mappings' regions are in common with each other.
         // So try harder before giving up: add equivalent regions that overlap original equivs.
         struct slRef *plExpandedEquivs[plCount];
         for (i = 0, bed = bedList;  bed != NULL;  bed = bed->next, i++)
             {
             struct slRef *expandedEquivs = NULL;
             struct slRef *plEquiv;
             for (plEquiv = plEquivRegions[i];  plEquiv != NULL;  plEquiv = plEquiv->next)
                 {
                 struct slRef *setRef;
                 for (setRef = plEquiv;  setRef != NULL;  setRef = setRef->next)
                     {
                     struct bed3 *set = setRef->val;
                     struct bed3 *region;
                     for (region = set;  region != NULL;  region = region->next)
                         {
                         if (differentString(bed->chrom, region->chrom))
                             expandedEquivs = slCat(equivRegionFind(equivRegions, region, lm),
                                                    expandedEquivs);
                         }
                     }
                 }
             plExpandedEquivs[i] = expandedEquivs;
             }
         if (!allMappingsHaveMatch(plEquivRegions, plExpandedEquivs, plCount))
             return TRUE;
         }
     }
 return FALSE;
 }
 
 static void dyStringPrintSlNameList(struct dyString *dy, struct slName *list, char *sep)
 /* Append list of names with separator to dy; separator appears after every item. Skip empty names. */
 {
 struct slName *item;
 for (item = list;  item != NULL;  item = item->next)
     {
     if (isNotEmpty(item->name))
         {
         dyStringAppend(dy, item->name);
         dyStringAppend(dy, sep);
         }
     }
 }
 
 static void dyStringPrintSlIntList(struct dyString *dy, struct slInt *list, char *sep)
 /* Append list of names with separator to dy; separator appears after every item. */
 {
 struct slInt *item;
 for (item = list;  item != NULL;  item = item->next)
     {
     dyStringPrintf(dy, "%d", item->val);
     dyStringAppend(dy, sep);
     }
 }
 
 static void appendFrequencies(struct sharedProps *props, struct dyString *dy)
 /* Condense the frequency data in props into an autoSql array of compact strings and append to dy.
  * Also add autoSql int array of total_count because those may vary between variants. */
 {
 dyStringPrintf(dy, "%d\t", props->freqSourceCount);
 int ix;
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     boolean freqIsRc = props->freqIsRc[ix];
     struct alObs *obs;
     for (obs = props->freqSourceObs[ix];  obs != NULL;  obs = obs->next)
         {
         if (obs != props->freqSourceObs[ix])
             dyStringAppendC(dy, '|');
         if (freqIsRc)
             // Store all frequencies on + strand of "preferred top level placement"
             reverseComplement(obs->allele, strlen(obs->allele));
         dyStringPrintf(dy, "%s:%d", obs->allele, obs->obsCount);
         if (freqIsRc)
             // rc again to restore original values
             reverseComplement(obs->allele, strlen(obs->allele));
         }
     dyStringAppendC(dy, ',');
     }
 dyStringAppendC(dy, '\t');
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     if (props->freqSourceObs[ix] != NULL)
         dyStringPrintf(dy, "%d,", props->freqSourceObs[ix]->totalCount);
     else
         dyStringAppend(dy, "0,");
     }
 }
 
 static void writeDetails(struct sharedProps *props, struct dyString *dy, FILE *outPropsF)
 /* Write props out as tab-sep line to outPropsF. */
 {
 dyStringClear(dy);
 dyStringPrintf(dy, "%s\t", props->name);
 appendFrequencies(props, dy);
 dyStringPrintf(dy, "\t%d\t", slCount(props->soTerms));
 dyStringPrintSlIntList(dy, props->soTerms, ",");
 dyStringPrintf(dy, "\t%d\t", slCount(props->clinVarAccs));
 dyStringPrintSlNameList(dy, props->clinVarAccs, ",");
 dyStringAppendC(dy, '\t');
 dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
 dyStringPrintf(dy, "\t%d\t", slCount(props->submitters));
 dyStringPrintSlNameList(dy, props->submitters, ",");
 dyStringPrintf(dy, "\t%d\t", slCount(props->pubMedIds));
 dyStringPrintSlIntList(dy, props->pubMedIds, ",");
 dyStringAppendC(dy, '\n');
 fputs(dy->string, outPropsF);
 }
 
 static void dumpLineToFile(char *line, FILE *jsonF)
 /* Write line to jsonF, making sure it ends in \n. */
 {
 fputs(line, jsonF);
 if (line[strlen(line)-1] != '\n')
     fputc('\n', jsonF);
 }
 
 static void updateSeqIsRc(struct slRef *placements, struct lm *lm)
 /* Note is_aln_opposite_orientation flag for each mapped sequence, regardless of whether it's
  * in the assembly that we're working on; later we can look up sequences from freq reports. */
 {
 hashIntReset(seqIsRc);
 struct slRef *plRef;
 for (plRef = placements;  plRef != NULL;  plRef = plRef->next)
     {
     char *seqId = jsonQueryString(plRef->val, "placement", "seq_id", lm);
     boolean isRc = jsonQueryBoolean(plRef->val, "placement",
                                     "placement_annot.is_aln_opposite_orientation", FALSE, lm);
     struct hashEl *hel = hashLookup(seqIsRc, seqId);
     if (hel)
         {
         if (isRc != ptToInt(hel->val))
             errAbort("updateSeqIsRc: sequence '%s' has both true and false for "
                      "placement_annot.is_aln_opposite_orientation", seqId);
         }
     else
         hashAddInt(seqIsRc, seqId, isRc);
     }
 }
 
 struct perAssembly
 // For each assembly whose mappings we're extracting from JSON, we need the name,
 // a jsonQuery path to extract just the mappings for that assembly, and an output file.
     {
     struct perAssembly *next;
     char *name;                  // assembly_name e.g. GRCh38.p12
     char *placementPath;         // jsonQuery path to placements on assembly
     char *plIsTopLevelPath;      // jsonQuery path to find whether placement is top level in assembly
     FILE *outF;                  // bigDbSnp output file
     };
 
 static void perAssemblyFree(struct perAssembly **pPa)
 /* Close *pPa's outF and free mem. */
 {
 if (pPa)
     {
     struct perAssembly *pa = *pPa;
     freeMem(pa->name);
     carefulClose(&pa->outF);
     freeMem(pa->placementPath);
     freeMem(pa->plIsTopLevelPath);
     freez(pPa);
     }
 }
 
 static void perAssemblyFreeList(struct perAssembly **pPaList)
 /* Close & free a list of perAssembly. */
 {
 if (pPaList)
     {
     struct perAssembly *el, *next;
     for (el = *pPaList;  el != NULL;  el = next)
         {
         next = el->next;
         perAssemblyFree(&el);
         }
     }
 }
 
 static void writeMerged(struct jsonElement *top, FILE *outMergedF, struct lm *lm)
 /* If any merged IDs are given, write the mappings out to outMergedF. */
 {
 struct slName *mergedIds = jsonQueryStrings(top, "top", "dbsnp1_merges[*].merged_rsid", lm);
 if (mergedIds)
     {
     char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
     struct slName *mergedId;
     for (mergedId = mergedIds;  mergedId != NULL;  mergedId = mergedId->next)
         fprintf(outMergedF, "rs%s\trs%s\n", mergedId->name, rsId);
     }
 }
 
 static char *getChrom(char *refSeqAcc, char *assembly)
 /* Look up "refSeqAcc assembly" in ncGrcToChrom; if found, return chrom, otherwise return
  * refSeqAcc for translation later (e.g. NC_012920.1 liftOver to hg19 chrM).  Don't free ret. */
 {
 char ncGrc[strlen(refSeqAcc) + 1 + strlen(assembly) + 1];
 safef(ncGrc, sizeof ncGrc, "%s %s", refSeqAcc, assembly);
 char *chrom = hashFindVal(ncGrcToChrom, ncGrc);
 return chrom ? chrom : refSeqAcc;
 }
 
 static void parseDbSnpJson(char *line, struct perAssembly *assemblyProps,
                            struct outStreams *outStreams)
 /* Each line of dbSNP's file contains one JSON blob describing an rs# variant.  Parse JSON,
  * extract the bits that we want to keep, print out BED+ and accompanying files. */
 {
 struct lm *lm = lmInit(1<<16);
 struct jsonElement *top = jsonParseLm(line, lm);
 struct sharedProps *props = NULL;
 struct bigDbSnp *bds = NULL;
 struct slRef *allPlacements = jsonQueryElement(top, "top",
                                                "primary_snapshot_data.placements_with_allele[*]",
                                                lm);
 updateSeqIsRc(allPlacements, lm);
 struct dyString *dyScratch = dyStringNew(0);
 struct perAssembly *ap;
 for (ap = assemblyProps;  ap != NULL;  ap = ap->next)
     {
     struct slRef *placements = jsonQueryElement(top, "top", ap->placementPath, lm);
     // Filter out scaffolds that are independent in some other assembly, but part of a regular
     // chrom in the current assembly
     struct slRef *plRef, *plRefNext, *filtered = NULL;
     for (plRef = placements;  plRef != NULL;  plRef = plRefNext)
         {
         plRefNext = plRef->next;
         struct jsonElement *pl = plRef->val;
         boolean isTopLevel = jsonQueryBoolean(pl, "placement", ap->plIsTopLevelPath, FALSE, lm);
         if (isTopLevel)
             slAddHead(&filtered, plRef);
         }
     slReverse(&filtered);
     placements = filtered;
     boolean multiMapper = detectMultiMappers(placements, lm);
     struct errCatch *errCatch = errCatchNew();
 //#*** NOTE: currently in some cases the alleles are incorrect on alt sequences where the
 //#*** alt has a different allele of the same variant (or just different sequence).
 //#*** We detect at least some of those by inconsistent pos/del in alt placement spdis,
 //#*** but who knows, there could be more.  Ultimately we will need to compare assembly
 //#*** sequences... unless/until dbSNP fixes their projection onto alts (JIRA VR-176).
     if (errCatchStart(errCatch))
         {
         for (plRef = placements;  plRef != NULL;  plRef = plRef->next)
             {
             struct jsonElement *pl = plRef->val;
             if (props == NULL)
                 {
                 // Write props only if we have at least one placement.
                 props = extractProps(top, lm);
                 writeDetails(props, dyScratch, outStreams->details);
                 writeMerged(top, outStreams->merged, lm);
                 }
             bds = parsePlacement(pl, props, multiMapper, dyScratch, lm);
             bds->chrom = getChrom(bds->chrom, ap->name);
             bigDbSnpTabOut(bds, ap->outF);
             // Prevent stale bds in errCatch below:
             bds = NULL;
             }
         }
     errCatchEnd(errCatch);
     if (errCatch->gotError)
         {
         dumpLineToFile(line, outStreams->failJson);
         char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
         char *seqId = bds ? bds->chrom : "NOSEQ";
         fprintf(outStreams->err, "%s\t%s\t%s", rsId, seqId, errCatch->message->string);
         }
     else if (isNotEmpty(errCatch->message->string))
         warn("%s", errCatch->message->string);
     errCatchFree(&errCatch);
     }
 dyStringFree(&dyScratch);
 lmCleanup(&lm);
 }
 
 static struct slName *initFreqSourceOrder()
 /* If -freqSourceOrder option was given, extract source names from the comma-sep list
  * into globals freqSourceCount and freqSourceOrder[]. */
 {
 char *freqSourceOrderStr = optionVal("freqSourceOrder", NULL);
 struct slName *sources = NULL;
 if (freqSourceOrderStr)
     {
     sources = slNameListFromComma(freqSourceOrderStr);
     freqSourceCount = slCount(sources);
     AllocArray(freqSourceOrder, freqSourceCount);
     struct slName *source;
     int i;
     for (i = 0, source = sources;  i < freqSourceCount;  i++, source = source->next)
         freqSourceOrder[i] = source->name;
     }
 return sources;
 }
 
 static struct perAssembly *getAssemblyProps(char *assemblyList, char *outRoot)
 /* Split comma-sep assemblyList and make placement path and bigDbSnp output file handle for each. */
 {
 struct perAssembly *assemblyProps = NULL;
 struct slName *assembly, *assemblies = slNameListFromComma(assemblyList);
 for (assembly = assemblies;  assembly != NULL;  assembly = assembly->next)
     {
     struct perAssembly *ap;
     AllocVar(ap);
     ap->name = cloneString(assembly->name);
     // Fancy path to get all placements that are on the desired assembly.
     struct dyString *dy = dyStringCreate("primary_snapshot_data.placements_with_allele"
                                   "[placement_annot.seq_id_traits_by_assembly[*].assembly_name=%s]",
                                          assembly->name);
     ap->placementPath = dyStringCannibalize(&dy);
     dy = dyStringCreate("placement_annot.seq_id_traits_by_assembly[assembly_name=%s].is_top_level",
                         assembly->name);
     ap->plIsTopLevelPath = dyStringCannibalize(&dy);
     char prefix[2048];
     safef(prefix, sizeof prefix, "%s.%s", outRoot, assembly->name);
     ap->outF = openOutFile("%s.bigDbSnp", prefix);
     slAddHead(&assemblyProps, ap);
     }
 return assemblyProps;
 }
 
 static void parseRefSeqToUcsc(char *refSeqToUcsc, struct hash *ncGrcToChrom,
                               struct hash *ncToTwoBitChrom)
 /* Extract mappings of {NC_*, GRC*} to chrom and NC_* to {twoBitFile, chrom}. */
 {
 struct lineFile *lf = lineFileOpen(refSeqToUcsc, TRUE);
 char *line;
 while (lineFileNextReal(lf, &line))
     {
     char *row[5];
     int wordCount = chopTabs(line, row);
     lineFileExpectWords(lf, 4, wordCount);
     char *acc = row[0];
     char *assembly = row[1];
     char *twoBitFile = row[2];
     char *chrom = row[3];
     if (!regexMatch(acc, "^N[A-Z]_[0-9]+\\.[0-9]+$"))
         lineFileAbort(lf, "'%s' does not look like a RefSeq sequence accession "
                       "(expecting N*_number.number in first column)", acc);
     if (!startsWith("GRCh", assembly))
         lineFileAbort(lf, "'%s' does not look like a GRCh* assembly name "
                       "(expecting GRCh* in second column", assembly);
     if (!endsWith(twoBitFile, ".2bit"))
         lineFileAbort(lf, "'%s' does not look like a twoBit file "
                       "(expecting *.2bit in third column)", twoBitFile);
     if (!regexMatch(chrom, "^chr[0-9A-Za-z_]+$"))
         lineFileAbort(lf, "'%s' does not look like an hg* chrom name "
                       "(expecting 'chr' + alphanumeric/_ in fourth column)", chrom);
     char accAssembly[strlen(acc) + 1 + strlen(assembly) + 1];
     safef(accAssembly, sizeof accAssembly, "%s %s", acc, assembly);
     hashAdd(ncGrcToChrom, accAssembly, cloneString(chrom));
     char twoBitChrom[strlen(twoBitFile) + 1 + strlen(chrom) + 1];
     safef(twoBitChrom, sizeof twoBitChrom, "%s:%s", twoBitFile, chrom);
     hashAdd(ncToTwoBitChrom, acc, cloneString(twoBitChrom));
     }
 lineFileClose(&lf);
 }
 
 void dbSnpJsonToTab(char *assemblyList, char *refSeqToUcsc, char *jsonFile, char *outRoot)
 /* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
 {
 struct slName *sources = initFreqSourceOrder();
 struct lineFile *lf = lineFileOpen(jsonFile, TRUE);
 struct outStreams *outStreams = outStreamsOpen(outRoot);
 struct perAssembly *assemblyProps = getAssemblyProps(assemblyList, outRoot);
 seqIsRc = hashNew(0);
 ncGrcToChrom = hashNew(0);
 ncToTwoBitChrom = hashNew(0);
 ncToSeqWin = hashNew(0);
 parseRefSeqToUcsc(refSeqToUcsc, ncGrcToChrom, ncToTwoBitChrom);
 char *line = NULL;
 while (lineFileNext(lf, &line, NULL))
     parseDbSnpJson(line, assemblyProps, outStreams);
 lineFileClose(&lf);
 perAssemblyFreeList(&assemblyProps);
 outStreamsClose(&outStreams);
 // Free these up for squeaky-clean valgrind --leak-check
 // [well, except for parseOptions and errCatch getStack/getThreadVars, but whatever]
 hashFree(&seqIsRc);
 hashFree(&ncGrcToChrom);
 hashFree(&ncToTwoBitChrom);
 hashFree(&ncToSeqWin);
 slNameFreeList(&sources);
 }
 
 int main(int argc, char *argv[])
 /* Process command line. */
 {
 optionInit(&argc, argv, options);
 if (argc != 5)
     usage();
 dbSnpJsonToTab(argv[1], argv[2], argv[3], argv[4]);
 return 0;
 }