428a47dfc975f02f3c3a97b3273d7b95f9789ff4 angie Tue Oct 1 13:12:08 2019 -0700 New util dbSnpJsonToTab converts JSON to bigDbSnp and dbSnpDetails files; checkBigDbSnp flags clustering anomalies. refs #23283 diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c new file mode 100644 index 0000000..d8ea261 --- /dev/null +++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c @@ -0,0 +1,1738 @@ +/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */ + +/* Copyright (C) 2019 The Regents of the University of California */ + +#include "common.h" +#include "basicBed.h" +#include "bigDbSnp.h" +#include "binRange.h" +#include "dnaseq.h" +#include "dnautil.h" +#include "errCatch.h" +#include "hash.h" +#include "indelShift.h" +#include "iupac.h" +#include "jsonQuery.h" +#include "linefile.h" +#include "obscure.h" +#include "options.h" +#include "regexHelper.h" +#include "soTerm.h" +#include "twoBit.h" + +void usage() +/* Explain usage and exit. */ +{ +errAbort( + "dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated track files\n" + "usage:\n" + " dbSnpJsonToTab assemblyList refSeqToUcsc refsnp-chrN.json[.gz] outRoot\n" + "assemblyList is a comma-separated set of assembly_name values in JSON, e.g. 'GRCh37.p13,GRCh38.p12'\n" + "refSeqToUcsc is tab-sep, 4 columns: RefSeq acc (NC_...), assembly_name, twoBitFile, ucscName (chr...)\n" + "Track files starting with outRoot will be created:\n" + " outRoot.<assembly>.bigDbSnp: BED4+ for main dbSNP track in each assembly in assemblyList\n" + " (final columns need to be added later by bedJoinTabOffset)\n" + " outRootDetails.tab: tab-separated allele frequency counts and other details (for bedJoinTabOffset)\n" + "Additional files are written out for error reporting:\n" + " outRootFailed.json: lines of JSON input that encountered an error, for debugging/reprocessing\n" + " outRootMerged.tab: two columns, obsolete dbSNP rs# ID and current rs# ID\n" + " outRootErrors.tab: lines of error info\n" + "\n" + "\n" + "options:\n" + " -freqSourceOrder=LIST LIST is an ordered, comma-sep list of project names (no spaces)\n" + " for frequency data submitted to dbSNP, e.g. '1000genomes,GnomAD'\n" + " If given, then the bigDbSnp minorAlleleFreq array uses the order.\n" + " Otherwise minorAlleleFreq contains only one frequency, from the\n" + " project with the highest sample count reported for that variant.\n" + " -equivRegions=FILE Each line of FILE contains space-separated acc:start:end regions\n" + " that are equivalent, for example PAR1 or PAR2 on X and Y,\n" + " or an alt sequence and the corresponding portion of its chrom.\n" + " These use RefSeq accesions not UCSC chr names, so that multiple\n" + " assemblies can be described in the same file.\n" + ); +} + +// Globals +int freqSourceCount = 0; +char **freqSourceOrder = NULL; +struct hash *seqIsRc = NULL; +struct hash *ncGrcToChrom = NULL; +struct hash *ncToTwoBitChrom = NULL; +struct hash *ncToSeqWin = NULL; + +/* Command line validation table. */ +static struct optionSpec options[] = { + {"freqSourceOrder", OPTION_STRING}, + {"equivRegions", OPTION_STRING}, + {NULL, 0}, +}; + +struct outStreams +/* Container for various output files that are not assembly-specific */ + { + FILE *details; // outRootDetails.tab + FILE *failJson; // outRootFailed.json + FILE *merged; // outRootMerged.tab + FILE *err; // outRootErrors.tab + }; + +static FILE *openOutFile(char *format, char *outRoot) +/* Open a file for writing, named by format (which must have exactly one %s) and outRoot. */ +{ +struct dyString *name = dyStringCreate(format, outRoot); +FILE *outF = mustOpen(name->string, "w"); +dyStringFree(&name); +return outF; +} + +static struct outStreams *outStreamsOpen(char *outRoot) +/* Open output file handles */ +{ +struct outStreams *outStreams; +AllocVar(outStreams); +outStreams->details = openOutFile("%sDetails.tab", outRoot); +outStreams->failJson = openOutFile("%sFailed.json", outRoot); +outStreams->merged = openOutFile("%sMerged.tab", outRoot); +outStreams->err = openOutFile("%sErrors.tab", outRoot); +return outStreams; +} + +static void outStreamsClose(struct outStreams **pOutStreams) +/* Close file handles and free struct. */ +{ +if (*pOutStreams != NULL) + { + struct outStreams *outStreams = *pOutStreams; + carefulClose(&outStreams->details); + carefulClose(&outStreams->failJson); + carefulClose(&outStreams->merged); + carefulClose(&outStreams->err); + freez(pOutStreams); + } +} + +struct spdiBed +/* dbSNP 2.0 variant representation: {Sequence, Position, Deletion, Insertion}, expanded to + * cover ambiguous placement region for indels when applicable. Add chromEnd to get bed3+. */ +{ + struct spdiBed *next; + char *chrom; // Sequence ID + uint chromStart; // Position: 0-based start + uint chromEnd; // Position + number of deleted bases + char *del; // Deleted sequence (starting at pos) -- or number of deleted bases + char *ins; // Inserted sequence +}; + +struct alObs +// Unit of variant allele frequency observation + { + struct alObs *next; + char *allele; // Variant allele sequence + int obsCount; // Number of chromosomes on which allele was observed by a project + int totalCount; // Number of chromosomes on which project observed some allele of this variant + }; + +struct sharedProps +/* Properties of a refsnp that are shared by all mappings to all assemblies. */ + { + // Extracted from JSON: + char *name; // rs# ID + int freqSourceCount; // Number of projects reporting allele frequency observations + struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs + struct alObs **freqSourceObs; // Array of per-project observation lists + char **freqSourceMajorAl; // Array of per-project major allele + char **freqSourceMinorAl; // Array of per-project minor allele + double *freqSourceMaf; // Array of per-project minor allele frequencies + int biggestSourceIx; // index of freqSourceObs with highest totalCount + boolean *freqIsRc; // Per-project flag for sequence isRc relative to ptlp + struct slInt *soTerms; // Sequence Ontology functional effect terms + struct slName *submitters; // dbSNP submitter handles + struct slInt *pubMedIds; // PubMed article citations + struct slName *clinVarAccs; // Accessions of ClinVar annotations, i.e. ClinVar RCV* + struct slName *clinVarSigs; // Reported ClinVar significance + enum bigDbSnpClass class; // {snv, mnv, ins, del, delins, identity} + enum soTerm maxFuncImpact; // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic} + int commonCount; // Number of projects reporting minor allele freq >= 1% + int rareCount; // Number of projects reporting minor allele freq < 1% + }; + +struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm) +/* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm. + * Checking validity of alleles is up to caller, but this does check whether del is a number + * instead of sequence. */ +{ +struct spdiBed *spdi; +lmAllocVar(lm, spdi); +spdi->chrom = seq; +spdi->chromStart = pos; +spdi->del = del; +spdi->ins = ins; +if (isAllDigits(del)) + spdi->chromEnd = pos + atoi(del); +else + spdi->chromEnd = spdi->chromStart + strlen(del); +return spdi; +} + +static boolean seqIsNucleotide(const char *seq) +/* Return TRUE unless seq contains a character that is not in the IUPAC nucleotide set. */ +{ +if (seq == NULL) + return FALSE; +const char *p; +for (p = seq; *p != '\0'; p++) + { + if (!isIupac(*p)) + return FALSE; + } +return TRUE; +} + +static struct spdiBed *parseSpdis(struct slRef *spdiList, char *name, struct lm *lm) +/* Make sure that all SPDI objects have the consistent seq_id, deleted_sequence and position. */ +{ +struct spdiBed *spdiBList = NULL; +char *firstSeq = NULL; +int firstPos = -1; +char *firstDel = NULL; +struct slRef *spdiRef; +for (spdiRef = spdiList; spdiRef != NULL; spdiRef = spdiRef->next) + { + struct jsonElement *spdiEl = spdiRef->val; + char *seq = jsonQueryString(spdiEl, "spdi", "seq_id", lm); + int pos = jsonQueryInt(spdiEl, "spdi", "position", -1, lm); + char *del = jsonQueryString(spdiEl, "spdi", "deleted_sequence", lm); + char *ins = jsonQueryString(spdiEl, "spdi", "inserted_sequence", lm); + if (seq == NULL) + errAbort("Missing or null seq_id in spdi for %s", name); + if (pos < 0) + errAbort("Invalid or missing position in spdi for %s", name); + if (del == NULL) + errAbort("Missing or null deleted_sequence in spdi for %s", name); + if (ins == NULL) + errAbort("Missing or null inserted_sequence in spdi for %s", name); + if (! seqIsNucleotide(del)) + errAbort("Invalid deleted_sequence '%s' in spdi for %s", del, name); + if (! seqIsNucleotide(ins)) + errAbort("Invalid inserted_sequence '%s' in spdi for %s", ins, name); + if (firstSeq == NULL) + { + // First spdi + firstSeq = seq; + firstDel = del; + firstPos = pos; + } + else + { + if (differentString(firstSeq, seq)) + errAbort("Mismatching seq_id ('%s' vs '%s') in spdis for %s", + firstSeq, seq, name); + if (differentString(firstDel, del)) + errAbort("Mismatching deleted sequence ('%s' vs '%s') in %s spdis for %s", + firstDel, del, seq, name); + if (firstPos != pos) + errAbort("Mismatching position (%d vs %d) in %s spdis for %s", + firstPos, pos, seq, name); + } + slAddHead(&spdiBList, spdiBedNewLm(seq, pos, del, ins, lm)); + } +slReverse(&spdiBList); +return spdiBList; +} + +static enum bigDbSnpClass varTypeToClass(char *varType) +/* Convert JSON variant_type to bigDbSnp enum form. */ +{ +if (sameString(varType, "snv")) + return bigDbSnpSnv; +else if (sameString(varType, "mnv")) + return bigDbSnpMnv; +else if (sameString(varType, "ins")) + return bigDbSnpIns; +else if (sameString(varType, "del")) + return bigDbSnpDel; +else if (sameString(varType, "delins")) + return bigDbSnpDelins; +else if (sameString(varType, "identity")) + return bigDbSnpIdentity; +else + errAbort("varTypeToClass: unrecognized variant_type '%s'.", varType); +return bigDbSnpSnv; // we never get here; appease gcc +} + +static int freqSourceToIx(char *source, struct slName **pSourceList, struct lm *lm) +/* Convert source name to index using freqSourceOrder if defined, otherwise sourceList + * (expand sourceList if source is not already in the list). */ +{ +int ix = -1; +if (freqSourceOrder) + { + ix = stringArrayIx(source, freqSourceOrder, freqSourceCount); + if (ix < 0) + errAbort("freqSourceOrder does not contains '%s'", source); + } +else + { + ix = slNameFindIx(*pSourceList, source); + if (ix < 0) + { + ix = slCount(*pSourceList); + slAddTail(pSourceList, lmSlName(lm, source)); + } + } +return ix; +} + +static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource, + struct spdiBed **retSpdiB, struct lm *lm) +/* Parse one allele observation. */ +{ +struct alObs *obs = NULL; +lmAllocVar(lm, obs); +char *source = jsonQueryString(obsEl, "alObs", "study_name", lm); +if (source == NULL) + errAbort("parseAlObs: Missing or null study_name"); +struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm); +struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm); +obs->allele = spdiB->ins; +obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm); +obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm); +if (obs->obsCount < 0) + errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)", + source, obs->allele); +if (obs->totalCount < 0) + errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)", + source, obs->allele); +if (obs->totalCount == 0) + errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)", + source, obs->allele); +*retSource = source; +*retSpdiB = spdiB; +return obs; +} + +static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm) +/* If the reference allele was omitted from frequency reports (dbSNP JIRA VR-178), infer the count + * and add the reference allele to lists of spdis and obs. */ +{ +int ix; +for (ix = 0; ix < props->freqSourceCount; ix++) + { + if (props->freqSourceSpdis[ix] == NULL) + continue; + boolean gotRef = FALSE; + int sumCounts = 0; + struct spdiBed *spdiB; + struct alObs *obs; + for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix]; + spdiB != NULL && obs != NULL; + spdiB = spdiB->next, obs = obs->next) + { + if (sameString(spdiB->del, spdiB->ins)) + { + gotRef = TRUE; + break; + } + sumCounts += obs->obsCount; + } + if (!gotRef) + { + struct spdiBed *firstSpdiB = props->freqSourceSpdis[ix]; + struct alObs *firstObs = props->freqSourceObs[ix]; + if (sumCounts > firstObs->totalCount) + errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref " + "counts %d", rsId, ix, firstObs->totalCount, sumCounts); + struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart, + firstSpdiB->del, firstSpdiB->del, lm); + struct alObs *obs; + lmAllocVar(lm, obs); + obs->allele = spdiB->del; + obs->obsCount = firstObs->totalCount - sumCounts; + obs->totalCount = firstObs->totalCount; + slAddHead(&props->freqSourceSpdis[ix], spdiB); + slAddHead(&props->freqSourceObs[ix], obs); + } + } +} + +static void checkFreqSourceObs(struct sharedProps *props, char *rsId) +/* After frequency allele observations have been sorted into per-project lists, + * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent. + * The sum of obsCounts may be less than total_count (no-calls), but not greater. */ +{ +int ix; +for (ix = 0; ix < props->freqSourceCount; ix++) + { + int totalCount = 0; + struct alObs *obsList = props->freqSourceObs[ix], *obs; + for (obs = obsList; obs != NULL; obs = obs->next) + { + if (! seqIsNucleotide(obs->allele)) + errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId); + if (obs->obsCount < 0) + errAbort("checkFreqSourceObs: invalid obsCount %d for %s", obs->obsCount, rsId); + if (obs->obsCount > obs->totalCount) + errAbort("checkFreqSourceObs: obsCount %d > totalCount %d for %s", + obs->obsCount, obs->totalCount, rsId); + totalCount += obs->obsCount; + } + for (obs = obsList; obs != NULL; obs = obs->next) + { + if (obs->totalCount < totalCount) + errAbort("checkFreqSourceObs: obsCounts sum to %d, but total_count is %d for %s", + totalCount, obs->totalCount, rsId); + else if (obs->totalCount != obsList->totalCount) + errAbort("checkFreqSourceObs: inconsistent totalCount (%d for '%s' != %d for '%s' " + "for %s", obs->totalCount, obs->allele, obsList->totalCount, obsList->allele, + rsId); + } + } +} + +static void parseFrequencies(struct slRef *annotationsRef, struct sharedProps *props, + char *rsId, struct lm *lm) +/* Extract per-project allele frequency count data from JSON. Note: the alleles are from the + * reference assembly that the project used; current reference might be different. */ +{ +struct slRef *obsList = jsonQueryElementList(annotationsRef, "annotations", "frequency[*]", lm); +if (obsList != NULL) + { + if (freqSourceOrder) + { + props->freqSourceCount = freqSourceCount; + lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount); + lmAllocArray(lm, props->freqSourceObs, freqSourceCount); + lmAllocArray(lm, props->freqIsRc, freqSourceCount); + } + else + { + props->freqSourceCount = 0; + int allocSize = slCount(obsList); + lmAllocArray(lm, props->freqSourceSpdis, allocSize); + lmAllocArray(lm, props->freqSourceObs, allocSize); + lmAllocArray(lm, props->freqIsRc, allocSize); + } + struct slName *unorderedSourceList = NULL; + int maxSampleCount = 0; + int biggestSourceIx = 0; + struct slRef *obsRef; + for (obsRef = obsList; obsRef != NULL; obsRef = obsRef->next) + { + struct jsonElement *obsEl = obsRef->val; + char *source = NULL; + struct spdiBed *spdiB = NULL; + struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm); + int ix = freqSourceToIx(source, &unorderedSourceList, lm); + slAddHead(&props->freqSourceSpdis[ix], spdiB); + slAddHead(&props->freqSourceObs[ix], obs); + props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1); + if (props->freqIsRc[ix] < 0) + errAbort("parseFrequencies: Unknown orientation for %s seq_id %s", + rsId, spdiB->chrom); + if (obs->totalCount > maxSampleCount) + { + maxSampleCount = obs->totalCount; + biggestSourceIx = ix; + } + } + if (!freqSourceOrder) + props->freqSourceCount = slCount(unorderedSourceList); + int ix; + for (ix = 0; ix < props->freqSourceCount; ix++) + { + slReverse(&props->freqSourceSpdis[ix]); + slReverse(&props->freqSourceObs[ix]); + } + addMissingRefAllele(props, rsId, lm); + checkFreqSourceObs(props, rsId); + props->biggestSourceIx = biggestSourceIx; + } +} + +static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm) +/* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional + * impact, discard duplicates and return as slInt list with highest impact first. */ +{ +struct slInt *intList = NULL; +if (soTermNames) + { + int termCount = slCount(soTermNames); + enum soTerm termArray[termCount]; + struct slName *term; + int i; + for (i = 0, term = soTermNames; term != NULL; i++, term = term->next) + termArray[i] = soTermStringIdToId(term->name); + assert(i == termCount); + qsort(termArray, termCount, sizeof termArray[0], soTermCmp); + for (i = 0; i < termCount; i++) + { + if (i == 0 || termArray[i] != termArray[i-1]) + { + struct slInt *intEl; + lmAllocVar(lm, intEl); + intEl->val = termArray[i]; + slAddHead(&intList, intEl); + } + } + slReverse(&intList); + } +return intList; +} + +struct seqWindow *getChromSeq(struct hash *ncToSeqWin, char *nc, struct hash *ncToTwoBitChrom) +/* Look up nc in ncToSeqWin. If not found, look it up in ncToTwoBitChrom, make a seqWin, + * store it in ncToSeqWin and return it. Return NULL if neither hash has nc. */ +{ +struct seqWindow *seqWin = hashFindVal(ncToSeqWin, nc); +if (seqWin == NULL) + { + char *twoBitChrom = hashFindVal(ncToTwoBitChrom, nc); + if (twoBitChrom != NULL) + { + struct dnaSeq *dnaSeq = twoBitLoadAll(twoBitChrom); + if (dnaSeq != NULL) + { + // Use memSeqWindow instead of twoBitSeqWindow to avoid keeping a bunch of open + // twoBitFile handles. + // Hmm, a dnaSeqWindow would be more efficient, no cloning and strlen... + // or at the least, memSeqWindowNew could take a size arg. + seqWin = memSeqWindowNew(twoBitChrom, dnaSeq->dna); + dnaSeqFree(&dnaSeq); + hashAdd(ncToSeqWin, nc, seqWin); + } + } + } +return seqWin; +} + +static boolean maybeExpandRange(struct spdiBed *spdiBTrim, struct seqWindow *seqWin, + struct spdiBed *spdiB, struct alObs *obs, struct lm *lm) +/* If spdiBTrim is a pure insertion or deletion within a repetitive region, + * then expand spdiB (and obs if non-NULL) to the full repetitive range and return TRUE. + * Trim spdiBTrim to minimal representation (see maybeTrimSpdi) before calling this. */ +{ +boolean changed = FALSE; +uint refTrimStart = spdiBTrim->chromStart; +uint refTrimEnd = spdiBTrim->chromEnd; +char *altTrim = spdiBTrim->ins; +int refTrimLen = refTrimEnd - refTrimStart; +int altTrimLen = strlen(altTrim); +if (indelShiftIsApplicable(refTrimLen, altTrimLen)) + { + // Now shift the minimal representation left and right and compare with spdiB coords. + uint leftStart = refTrimStart, leftEnd = refTrimEnd; + char leftAlt[altTrimLen+1]; + safecpy(leftAlt, sizeof leftAlt, altTrim); + int leftShifted = indelShift(seqWin, &leftStart, &leftEnd, leftAlt, + INDEL_SHIFT_NO_MAX, isdLeft); + uint rightStart = refTrimStart, rightEnd = refTrimEnd; + char rightAlt[altTrimLen+1]; + safecpy(rightAlt, sizeof rightAlt, altTrim); + int rightShifted = indelShift(seqWin, &rightStart, &rightEnd, rightAlt, + INDEL_SHIFT_NO_MAX, isdRight); + if (leftShifted || rightShifted) + { + changed = TRUE; + // Expand spdiB->del to cover the whole range., + int newRefLen = rightEnd - leftStart; + spdiB->del = lmAlloc(lm, newRefLen+1); + seqWindowCopy(seqWin, leftStart, newRefLen, spdiB->del, newRefLen+1); + // Concatenate ref bases to the left, altTrim, ref bases to the right -> new alt + char refLeft[leftShifted+1]; + seqWindowCopy(seqWin, leftStart, leftShifted, refLeft, sizeof refLeft); + char refRight[rightShifted+1]; + seqWindowCopy(seqWin, refTrimEnd, rightShifted, refRight, sizeof refRight); + int newAltLen = leftShifted + altTrimLen + rightShifted; + spdiB->ins = lmAlloc(lm, newAltLen+1); + safef(spdiB->ins, newAltLen+1, "%s%s%s", refLeft, altTrim, refRight); + if (obs) + obs->allele = spdiB->ins; + spdiB->chromStart = leftStart; + spdiB->chromEnd = rightEnd; + } + } +return changed; +} + +static void checkSpdisAfterExpansion(struct spdiBed *spdiBList, struct alObs *obsList) +/* Check spdiBList and obsList for inconsistencies; if the reference allele spdiB+obs missed out + * on the expansion then expand them. */ +{ +boolean alreadyExpandedRefAllele = FALSE; +struct spdiBed *spdiB; +struct alObs *obs; +for (spdiB = spdiBList, obs = obsList; + spdiB != NULL; + spdiB = spdiB->next, obs = obs->next) + { + if (strlen(spdiB->del) != spdiB->chromEnd - spdiB->chromStart) + errAbort("Inconsistent freq spdi coords & del: %s|%d|%d %s/%s", + spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, spdiB->del, spdiB->ins); + if (spdiB != spdiBList) + { + if (differentString(spdiB->chrom, spdiBList->chrom)) + errAbort("Differing freq spdiB chrom: '%s' vs. '%s'", + spdiB->chrom, spdiBList->chrom); + if (differentString(spdiB->del, spdiBList->del)) + { + // Ref allele can't be shifted, so expand it to alt's shifted range... but only once, + // so we'll catch differences between alt alleles' expanded ranges. + struct spdiBed *spdiBRef = NULL, *spdiBAlt = NULL; + struct alObs *obsRef = NULL; + if (sameString(spdiBList->del, spdiBList->ins)) + { + spdiBRef = spdiBList; + spdiBAlt = spdiB; + obsRef = obsList; + } + else if (sameString(spdiB->del, spdiB->ins)) + { + spdiBRef = spdiB; + spdiBAlt = spdiBList; + obsRef = obs; + } + if (spdiBRef != NULL && !alreadyExpandedRefAllele) + { + // Copy spdiBAlt's expanded coords and del into spdiBRef + spdiBRef->chromStart = spdiBAlt->chromStart; + spdiBRef->chromEnd = spdiBAlt->chromEnd; + spdiBRef->del = spdiBAlt->del; + // The reference allele's ins is the same as its del, and the corresponding + // obs->allele must match it. + spdiBRef->ins = spdiBRef->del; + obsRef->allele = spdiBRef->ins; + alreadyExpandedRefAllele = TRUE; + } + else + errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s", + spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd, + spdiBList->del, spdiBList->ins, + spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, + spdiB->del, spdiB->ins); + } + } + } +} + +static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm) +/* Return a new spdiBed that is the minimal representation of spdiB. */ +{ +struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart, + lmCloneString(lm, spdiB->del), + lmCloneString(lm, spdiB->ins), lm); +int refTrimLen = 0, altTrimLen = 0; +trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd, + &refTrimLen, &altTrimLen); +return spdiBTrim; +} + +static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom, + struct hash *ncToSeqWin, struct lm *lm) +/* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do + * not cover the entire repetitive range. Expand variant alleles on the sequence on which they + * were originally reported; if there is an indel difference between the reporting assembly and + * the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt + * may be different for different assemblies). */ +{ +int sIx; +for (sIx = 0; sIx < props->freqSourceCount; sIx++) + { + struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx]; + struct alObs *obs, *obsList = props->freqSourceObs[sIx]; + assert(slCount(spdiBList) == slCount(obsList)); + boolean changed = FALSE; + // For each allele, see if it can be slid around. + for (spdiB = spdiBList, obs = obsList; + spdiB != NULL; + spdiB = spdiB->next, obs = obs->next) + { + assert(sameString(obs->allele, spdiB->ins)); + // If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly. + if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) || + stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del)) + { + // First trim ref and alt to their minimal version for indelShift to work with. + // There may be nothing to trim because they're usually minimal already and + // stringIn catches false positive like A/TAAC, and that's fine. + struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm); + // If possible, expand the minimal representation to full SPDI range. + struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom); + if (seqWin == NULL) + errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom); + changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm); + } + } + if (changed) + checkSpdisAfterExpansion(spdiBList, obsList); + } +} + +static double mafFromSource(struct sharedProps *props, int sourceIx, + char **retMajorAllele, char **retMinorAllele, struct lm *lm) +/* Compare allele counts from source and return minor allele frequency (2nd-highest). + * Set *retMajorAllele to allele with highest count (for comparison with mapped ref alleles). + * If there are no observations from source, return NAN (divide by zero). */ +{ +char *majorAllele = NULL, *minorAllele = NULL; +int majorAlleleCount = -1, minorAlleleCount = -1; +double totalCount = 0; +struct alObs *obs; +for (obs = props->freqSourceObs[sourceIx]; obs != NULL; obs = obs->next) + { + if (obs->obsCount > majorAlleleCount) + { + minorAllele = majorAllele; + minorAlleleCount = majorAlleleCount; + majorAlleleCount = obs->obsCount; + majorAllele = obs->allele; + } + else if (obs->obsCount > minorAlleleCount) + { + minorAllele = obs->allele; + minorAlleleCount = obs->obsCount; + } + totalCount = obs->totalCount; + } +if (totalCount > 0) + { + if (majorAlleleCount < 0) + errAbort("no major allele observed despite nonzero totalCount...?"); + if (minorAlleleCount < 0) + errAbort("no minor allele observed... what should the minorAllele value be??"); + } +*retMajorAllele = lmCloneString(lm, majorAllele); +*retMinorAllele = lmCloneString(lm, minorAllele); +// If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test) +return (double)minorAlleleCount / totalCount; +} + +static void sortFrequencies(struct sharedProps *props, struct lm *lm) +/* Determine major/minor allele and per-project MAF (if given). + * If -freqOrder option was given, then fill in props->freqSource* arrays in that order. + * Otherwise, choose the MAF from the project with the largest sample count as a fallback. */ +{ +if (props->freqSourceCount > 0) + { + if (freqSourceOrder != NULL) + { + lmAllocArray(lm, props->freqSourceMajorAl, freqSourceCount); + lmAllocArray(lm, props->freqSourceMinorAl, freqSourceCount); + lmAllocArray(lm, props->freqSourceMaf, freqSourceCount); + int sIx; + for (sIx = 0; sIx < freqSourceCount; sIx++) + { + props->freqSourceMaf[sIx] = mafFromSource(props, sIx, + &props->freqSourceMajorAl[sIx], + &props->freqSourceMinorAl[sIx], lm); + if (isfinite(props->freqSourceMaf[sIx])) + { + if (props->freqSourceMaf[sIx] >= 0.01) + props->commonCount++; + else + props->rareCount++; + } + } + } + else + { + // No freqSourceOrder -- just use source with biggest totalCount for this variant. + props->freqSourceCount = 1; + lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount); + lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount); + lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount); + props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx, + &props->freqSourceMajorAl[0], + &props->freqSourceMinorAl[0], lm); + if (isfinite(props->freqSourceMaf[0])) + { + if (props->freqSourceMaf[0] >= 0.01) + props->commonCount++; + else + props->rareCount++; + } + } + } +} + +static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm) +/* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */ +{ +struct sharedProps *props = NULL; +lmAllocVar(lm, props); +char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm); +char rsName[strlen(rsNumber)+3]; +safef(rsName, sizeof rsName, "rs%s", rsNumber); +props->name = lmCloneString(lm, rsName); +char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm); +props->class = varTypeToClass(varType); +struct slRef *annotationsRef = jsonQueryElement(top, "top", + "primary_snapshot_data.allele_annotations[*]", lm); +parseFrequencies(annotationsRef, props, rsName, lm); +spdiNormalizeFreq(props, ncToTwoBitChrom, ncToSeqWin, lm); +sortFrequencies(props, lm); +struct slName *soTermNames = jsonQueryStringList(annotationsRef, "annotations", + "assembly_annotation[*].genes[*].rnas[*]" + ".sequence_ontology[*].accession", lm); +soTermNames = slCat(soTermNames, + jsonQueryStringList(annotationsRef, "annotations", + "assembly_annotation[*].genes[*].rnas[*]" + ".protein.sequence_ontology[*].accession", lm)); +props->soTerms = soTermStringIdToIdList(soTermNames, lm); +props->maxFuncImpact = props->soTerms ? props->soTerms->val : soUnknown; +props->submitters = jsonQueryStrings(top, "top", + "primary_snapshot_data.support[id.type=subsnp].submitter_handle", + lm); +slUniqify(&props->submitters, slNameCmp, NULL); +props->pubMedIds = jsonQueryInts(top, "top", "citations[*]", lm); +props->clinVarAccs = jsonQueryStringList(annotationsRef, "annotations", + "clinical[*].accession_version", lm); +props->clinVarSigs = jsonQueryStringList(annotationsRef, "annotations", + "clinical[*].clinical_significances[*]", lm); +return props; +} + +static void setBdsFreqData(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds, + struct dyString *dyUcscNotes, struct lm *lm) +/* Use isRc, props to fill in bds->majorAllele, ->minorAllele, determine whether ref is maj/min. + * Add notes to dyUcscNotes if applicable. */ +{ +if (props->freqSourceCount > 0) + { + bds->freqSourceCount = props->freqSourceCount; + lmAllocArray(lm, bds->majorAllele, props->freqSourceCount); + lmAllocArray(lm, bds->minorAllele, props->freqSourceCount); + lmAllocArray(lm, bds->minorAlleleFreq, props->freqSourceCount); + char *firstMajorAllele = NULL; + int sIx; + for (sIx = 0; sIx < props->freqSourceCount; sIx++) + { + boolean freqIsRc = props->freqIsRc[sIx]; + bds->minorAlleleFreq[sIx] = props->freqSourceMaf[sIx]; + bds->majorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMajorAl[sIx])); + bds->minorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMinorAl[sIx])); + if (isfinite(bds->minorAlleleFreq[sIx])) + { + if (isRc != freqIsRc) + { + reverseComplement(bds->majorAllele[sIx], strlen(bds->majorAllele[sIx])); + reverseComplement(bds->minorAllele[sIx], strlen(bds->minorAllele[sIx])); + } + char *majorAllele = bds->majorAllele[sIx]; + if (differentString(bds->ref, majorAllele)) + dyStringAppend(dyUcscNotes, bdsRefIsMinor ","); + if (firstMajorAllele == NULL) + firstMajorAllele = majorAllele; + else if (majorAllele != NULL && + differentString(firstMajorAllele, majorAllele)) + dyStringAppend(dyUcscNotes, bdsDiffMajor ","); + } + } + } +} + +static void checkRareRef(struct sharedProps *props, char *ref, boolean isRc, + struct dyString *dyUcscNotes) +/* If frequency counts are reported, test whether the reference has ever been observed, + * and if so whether it is rare, i.e. < 1%. If so, add to dyUcscNotes. */ +{ +double rareThreshold = 0.01; +boolean haveFrequencies = FALSE, refWasObserved = FALSE; +double maxRefAF = 0.0; +int ix; +for (ix = 0; ix < props->freqSourceCount; ix++) + { + if (props->freqSourceObs[ix] != NULL) + haveFrequencies = TRUE; + boolean freqIsRc = props->freqIsRc[ix]; + int refLen = strlen(ref); + char refCpy[refLen+1]; + safecpy(refCpy, sizeof refCpy, ref); + if (isRc != freqIsRc) + reverseComplement(refCpy, refLen); + struct spdiBed *spdiB; + struct alObs *obs; + for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix]; + spdiB != NULL && obs != NULL; + spdiB = spdiB->next, obs = obs->next) + { + if (sameString(refCpy, spdiB->ins)) + { + if (obs->obsCount > 0) + { + refWasObserved = TRUE; + double refAF = (double)obs->obsCount / (double)obs->totalCount; + if (refAF > maxRefAF) + maxRefAF = refAF; + } + break; + } + } + if (maxRefAF >= rareThreshold) + break; + } +if (haveFrequencies && maxRefAF < rareThreshold) + { + dyStringAppend(dyUcscNotes, bdsRefIsRare ","); + if (!refWasObserved) + dyStringAppend(dyUcscNotes, bdsRefIsSingleton ","); + } +} + +static enum bigDbSnpClass bigDbSnpClassFromAlleles(struct bigDbSnp *bds) +/* Return what we expect bigDbSnpClass to be based on allele lengths. */ +{ +enum bigDbSnpClass alleleClass = 0; +if (bds->altCount == 0) + alleleClass = bigDbSnpIdentity; +else + { + int refLen = strlen(bds->ref); + boolean altLenZero = TRUE; + boolean altLenLtRef = FALSE; + boolean altLenEqRef = FALSE; + boolean altLenGtRef = FALSE; + int i; + for (i = 0; i < bds->altCount; i++) + { + int altLen = strlen(bds->alts[i]); + if (altLen < refLen) + altLenLtRef = TRUE; + else if (altLen == refLen) + altLenEqRef = TRUE; + else if (altLen > refLen) + altLenGtRef = TRUE; + if (altLen > 0) + altLenZero = FALSE; + } + if (!altLenLtRef && altLenEqRef && !altLenGtRef) + { + if (refLen == 1) + alleleClass = bigDbSnpSnv; + else + alleleClass = bigDbSnpMnv; + } + else if (bds->altCount == 0) + alleleClass = bigDbSnpIdentity; + else if (refLen == 0) + alleleClass = bigDbSnpIns; + else if (altLenZero) + alleleClass = bigDbSnpDel; + else + alleleClass = bigDbSnpDelins; + } +return alleleClass; +} + +static void checkIndelPlacement(struct bigDbSnp *bds, struct seqWindow *seqWin, + struct lm *lm) +/* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that + * bds's SPDI-based coords cover the whole repetitive region as expected. */ +{ +int i; +for (i = 0; i < bds->altCount; i++) + { + char *alt = bds->alts[i]; + if (isEmpty(bds->ref) || isEmpty(alt) || + stringIn(bds->ref, alt) || stringIn(alt, bds->ref)) + { + // First trim ref and alt to their minimal version for indelShift to work with. + // There may be nothing to trim because they're usually minimal already and + // stringIn catches false positive like A/TAAC, and that's fine. + struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm); + struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm); + + maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm); + if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd) + errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') " + "could be expanded to %s|%d|%d\n", + bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt, + bds->chrom, spdiB->chromStart, spdiB->chromEnd); + } + } +} + +static boolean delMismatchesGenome(struct bigDbSnp *bds, struct seqWindow *seqWin) +/* Return TRUE if bds->ref (spdi del) does not match assembly sequence at bds coords. + * Sometimes the genome has an N and dbSNP has a more specific IUPAC character. + * errAbort if there's some other kind of inconsistency. */ +{ +int refLen = strlen(bds->ref); +uint refStart = bds->chromStart; +uint refEnd = bds->chromEnd; +if (refLen != refEnd - refStart) + errAbort("Inconsistent ref and coords for %s: ref is '%s' (%d bases) " + "but position is %s|%d|%d (%d bases)", + bds->name, bds->ref, refLen, bds->chrom, refStart, refEnd, refEnd - refStart); +char genomeRef[refLen+1]; +seqWindowCopy(seqWin, refStart, refLen, genomeRef, sizeof genomeRef); +if (differentString(genomeRef, bds->ref)) + { + if (strchr(genomeRef, 'N')) + return TRUE; + else + errAbort("bds->ref '%s' mismatches genome sequence '%s'", bds->ref, genomeRef); + } +return FALSE; +} + +static boolean hasAmbiguousFreqAllele(struct sharedProps *props) +/* Return TRUE if any allele from any frequency source contains an IUPAC ambiguous character. */ +{ +int sIx; +for (sIx = 0; sIx < props->freqSourceCount; sIx++) + { + struct alObs *obs; + for (obs = props->freqSourceObs[sIx]; obs != NULL; obs = obs->next) + if (anyIupac(obs->allele)) + return TRUE; + } +return FALSE; +} + +static boolean freqHasNonRefAltAllele(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds) +/* Return TRUE if any allele from any frequency source does not match any ref/alt (SPDI) allele. */ +{ +int sIx; +for (sIx = 0; sIx < props->freqSourceCount; sIx++) + { + boolean freqIsRc = props->freqIsRc[sIx]; + struct alObs *obs; + for (obs = props->freqSourceObs[sIx]; obs != NULL; obs = obs->next) + { + int alLen = strlen(obs->allele); + char alCpy[alLen+1]; + safecpy(alCpy, sizeof alCpy, obs->allele); + if (freqIsRc != isRc) + reverseComplement(alCpy, alLen); + boolean foundIt = FALSE; + if (sameString(alCpy, bds->ref)) + foundIt = TRUE; + else + { + int i; + for (i = 0; i < bds->altCount; i++) + if (sameString(alCpy, bds->alts[i])) + { + foundIt = TRUE; + break; + } + } + if (!foundIt) + return TRUE; + } + } +return FALSE; +} + +static void addUcscNotes(struct sharedProps *props, boolean isRc, boolean isMultiMapper, + struct bigDbSnp *bds, struct seqWindow *seqWin, + struct dyString *dyUcscNotes, struct lm *lm) +/* Record interesting conditions if applicable. */ +{ +if (props->class != bigDbSnpClassFromAlleles(bds)) + dyStringAppend(dyUcscNotes, bdsClassMismatch ","); +if (props->clinVarAccs != NULL) + dyStringAppend(dyUcscNotes, bdsClinvar ","); +if (props->commonCount > 0) + { + dyStringAppend(dyUcscNotes, bdsCommonSome ","); + if (props->rareCount == 0) + dyStringAppend(dyUcscNotes, bdsCommonAll ","); + } +if (isRc) + dyStringAppend(dyUcscNotes, bdsRevStrand ","); +if (isMultiMapper) + dyStringAppend(dyUcscNotes, bdsMultiMap ","); +checkRareRef(props, bds->ref, isRc, dyUcscNotes); +if (delMismatchesGenome(bds, seqWin)) + dyStringAppend(dyUcscNotes, bdsRefMismatch ","); +if (anyIupac(bds->ref)) + dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ","); +int i; +for (i = 0; i < bds->altCount; i++) + if (anyIupac(bds->alts[i])) + dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ","); +if (hasAmbiguousFreqAllele(props)) + dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ","); +if (freqHasNonRefAltAllele(props, isRc, bds)) + dyStringAppend(dyUcscNotes, bdsFreqNotRefAlt ","); +bds->ucscNotes = lmCloneString(lm, dyUcscNotes->string); +} + +static void calcShiftBases(struct bigDbSnp *bds) +/* Return the max length of uncertain placement from all alt alleles. */ +{ +int maxBasesTrimmed = 0; +int refLen = bds->chromEnd - bds->chromStart; +char refTrim[refLen+1]; +int i; +for (i = 0; i < bds->altCount; i++) + { + // Make copies of ref and alt, trim the copies, measure the change. + int refTrimLen = refLen; + safecpy(refTrim, sizeof refTrim, bds->ref); + int altTrimLen = strlen(bds->alts[i]); + char altTrim[altTrimLen+1]; + safecpy(altTrim, sizeof altTrim, bds->alts[i]); + uint start = bds->chromStart, end = bds->chromEnd; + trimRefAlt(refTrim, altTrim, &start, &end, &refTrimLen, &altTrimLen); + int basesTrimmed = refLen - refTrimLen; + if (basesTrimmed > maxBasesTrimmed) + maxBasesTrimmed = basesTrimmed; + if (maxBasesTrimmed >= refLen) + break; + } +bds->shiftBases = min(refLen, maxBasesTrimmed); +} + +static struct bigDbSnp *parsePlacement(struct jsonElement *pl, struct sharedProps *props, + boolean isMultiMapper, struct dyString *dy, struct lm *lm) +/* Return a bigDbSnp with information about one placement (mapping). */ +{ +struct bigDbSnp *bds; +lmAllocVar(lm, bds); +struct slRef *allSpdiRef = jsonQueryElement(pl, "placement", "alleles[*].allele.spdi", lm); +bds->chrom = jsonQueryString(pl, "placement", "seq_id", lm); +if (bds->chrom == NULL) + errAbort("Missing or NULL seq_id for %s", props->name); +struct spdiBed *spdiBList = parseSpdis(allSpdiRef, props->name, lm); +if (!sameString(bds->chrom, spdiBList->chrom)) + errAbort("seq_id mismatch: placement has '%s' but spdi has '%s for %s'", + bds->chrom, spdiBList->chrom, props->name); +bds->chromStart = spdiBList->chromStart; +bds->chromEnd = spdiBList->chromEnd; +bds->name = props->name; +bds->ref = spdiBList->del; +// spdiBList may or may not include no-change allele (del == ins) +lmAllocArray(lm, bds->alts, slCount(spdiBList)); +struct spdiBed *spdiB; +for (spdiB = spdiBList; spdiB != NULL; spdiB = spdiB->next) + { + if (!sameString(bds->ref, spdiB->ins)) + bds->alts[bds->altCount++] = spdiB->ins; + } +calcShiftBases(bds); +boolean isRc = jsonQueryBoolean(pl, "placement", + "placement_annot.is_aln_opposite_orientation", FALSE, lm); +dyStringClear(dy); +setBdsFreqData(props, isRc, bds, dy, lm); +bds->maxFuncImpact = props->maxFuncImpact; +bds->class = props->class; +struct seqWindow *seqWin = getChromSeq(ncToSeqWin, bds->chrom, ncToTwoBitChrom); +checkIndelPlacement(bds, seqWin, lm); +addUcscNotes(props, isRc, isMultiMapper, bds, seqWin, dy, lm); +return bds; +} + +static void equivRegionAdd(struct hash *equivRegions, struct bed3 *region, struct bed3 *regionList) +/* Associate regionList with region by looking up the binKeeper for region->chrom in equivRegions + * and adding regionList to the binKeeper for region start & end. */ +{ +struct binKeeper *bk = hashFindVal(equivRegions, region->chrom); +if (bk == NULL) + { + bk = binKeeperNew(0, BINRANGE_MAXEND_512M); + hashAdd(equivRegions, region->chrom, bk); + } +binKeeperAdd(bk, region->chromStart, region->chromEnd, regionList); +} + +static struct hash *getEquivRegions() +/* If not done already, read -equivRegions file if specified and parse it into data structure + * of equivalent regions for detecting legitimate cases of multiple mappings. */ +{ +static struct hash *equivRegions = NULL; +static boolean alreadyInited = FALSE; +if (!alreadyInited) + { + char *equivRegionsFile = optionVal("equivRegions", NULL); + if (isNotEmpty(equivRegionsFile)) + { + equivRegions = hashNew(0); + struct lineFile *lf = lineFileOpen(equivRegionsFile, TRUE); + char *line; + while (lineFileNextReal(lf, &line)) + { + // Build a list of equivalent regions; hash each region->chrom to list. + char *words[128]; + int wordCount = chopLine(line, words); + if (wordCount >= ArraySize(words)) + lineFileAbort(lf, "getEquivRegions: too many words per line (%d)", wordCount); + struct bed3 *regionList = NULL; + int i; + for (i = 0; i < wordCount; i++) + { + char *chrom = cloneString(words[i]); + char *colon = strchr(chrom, ':'); + if (colon) + { + char *startStr = colon+1; + char *nextColon = strchr(startStr, ':'); + if (! nextColon) + lineFileAbort(lf, "getEquivRegions: expected each word to have " + "0 or 2 colons, but got '%s'", words[i]); + *colon = '\0'; + *nextColon = '\0'; + char *endStr = nextColon+1; + if (!isAllDigits(startStr) || !isAllDigits(endStr)) + lineFileAbort(lf, "getEquivRegions: expected a number following each colon, " + "but got '%s'", words[i]); + int start = atoi(startStr); + int end = atoi(endStr); + slAddHead(®ionList, bed3New(chrom, start, end)); + } + else + slAddHead(®ionList, bed3New(chrom, 0, 0)); + } + // Sort so that a main chromosome goes before alts/fixes. + slSort(®ionList, bedCmp); + // Associate all regions with the same list. we can use pointer equality later. + struct bed3 *region; + for (region = regionList; region != NULL; region = region->next) + equivRegionAdd(equivRegions, region, regionList); + } + lineFileClose(&lf); + } + else + warn("-equivRegions=FILE not specified; will not be able to detect variants " + "that are mapped to multiple non-equivalent regions."); + alreadyInited = TRUE; + } +return equivRegions; +} + +static struct slRef *equivRegionFind(struct hash *equivRegions, struct bed3 *region, + struct lm *lm) +/* Return ref list of equivalent region lists that overlap region. */ +{ +struct binElement *beList = NULL; +struct binKeeper *bk = hashFindVal(equivRegions, region->chrom); +if (bk) + { + int start = region->chromStart; + int end = region->chromEnd; + // binKeeperFind can't find overlap with 0-length insertions (start == end) so expand those: + if (start == end) + { + start--; + end++; + } + beList = binKeeperFind(bk, start, end); + } +struct slRef *refList = NULL; +struct binElement *binEl; +for (binEl = beList; binEl != NULL; binEl = binEl->next) + lmRefAdd(lm, &refList, binEl->val); +slReverse(&refList); +slFreeList(&beList); +return refList; +} + +static struct bed3 *bed3FromPlRef(struct slRef *plRef, struct lm *lm) +/* Extract a bed3 from the first SPDI found in the first placement in plRef. */ +{ +struct jsonElement *pl = plRef->val; +struct slRef *spdiRef = jsonQueryElement(pl, "placement", "alleles[0].allele.spdi", lm); +struct jsonElement *spdiEl = spdiRef->val; +struct bed3 *region; +lmAllocVar(lm, region); +region->chrom = jsonQueryString(spdiEl, "first spdi", "seq_id", lm); +region->chromStart = jsonQueryInt(spdiEl, "first spdi", "position", -1, lm); +char *ref = jsonQueryString(spdiEl, "first spdi", "deleted_sequence", lm); +region->chromEnd = region->chromStart + strlen(ref); +return region; +} + +static struct bed3 *bed3ListFromPlRef(struct slRef *plRefList, struct lm *lm) +/* Return a list of bed3 derived from placement SPDIs in plRef. */ +{ +struct bed3 *list = NULL; +struct slRef *plRef; +for (plRef = plRefList; plRef != NULL; plRef = plRef->next) + slAddHead(&list, bed3FromPlRef(plRef, lm)); +slReverse(&list); +return list; +} + +static boolean uniqChroms(struct bed3 *bedList) +/* Return TRUE if each item of *sorted* bedList has a different chrom. */ +{ +boolean uniq = TRUE; +if (bedList != NULL) + { + struct bed3 *bed; + for (bed = bedList; bed->next != NULL; bed = bed->next) + if (sameString(bed->chrom, bed->next->chrom)) + { + uniq = FALSE; + break; + } + } +return uniq; +} + +static boolean allMappingsHaveMatch(struct slRef *equivRegionsA[], struct slRef *equivRegionsB[], + int plCount) +// Return true if each placement's list of equivRegion sets in A has at least one set that +// matches at least one set from some other placement's list in B. A and B may/may not be the same. +{ +int i; +for (i = 0; i < plCount; i++) + { + struct slRef *plEquivList = equivRegionsA[i]; + boolean foundMatch = FALSE; + int j; + for (j = 0; j < plCount; j++) + { + if (j == i) + continue; + struct slRef *otherEquivList = equivRegionsB[j]; + struct slRef *plEquiv; + for (plEquiv = plEquivList; plEquiv != NULL; plEquiv = plEquiv->next) + { + struct slRef *otherEquiv; + for (otherEquiv = otherEquivList; otherEquiv; otherEquiv = otherEquiv->next) + { + // We can use pointer equality because getEquivRegions assigns same list + // pointer to each region in list. + if (plEquiv->val == otherEquiv->val) + { + foundMatch = TRUE; + break; + } + } + } + } + if (! foundMatch) + return FALSE; + } +return TRUE; +} + +static boolean detectMultiMappers(struct slRef *placements, struct lm *lm) +/* Return TRUE if placements map to more than one genomic locus, aside from legitimate cases + * like the PAR regions on X and Y, or alt/fix sequences and their corresponding chromosomal + * locations. */ +{ +struct hash *equivRegions = getEquivRegions(); +int plCount = slCount(placements); +if (equivRegions && plCount > 1) + { + // Get a bed3 list of placements and make sure there aren't multiple mappings on the same chrom. + struct bed3 *bedList = bed3ListFromPlRef(placements, lm); + slSort(&bedList, bedCmp); + if (!uniqChroms(bedList)) + return TRUE; + // Gather up equivRegions lists for all placements. If any placement has no equivRegions, + // then we're done. + struct slRef *plEquivRegions[plCount]; + struct bed3 *bed; + int i; + for (i = 0, bed = bedList; bed != NULL; bed = bed->next, i++) + { + struct slRef *listOfLists = equivRegionFind(equivRegions, bed, lm); + if (listOfLists == NULL) + return TRUE; + plEquivRegions[i] = listOfLists; + } + if (!allMappingsHaveMatch(plEquivRegions, plEquivRegions, plCount)) + { + // Sometimes the test for equivalent regions fails because all regions point to a + // common location -- but that location does not have a mapping, so its suite of + // regions are not in the mix. For example, a variant is mapped to 4 alts of the + // same chrom -- but not to the chrom itself, so all regions point to the main chrom, + // but none of the 4 mappings' regions are in common with each other. + // So try harder before giving up: add equivalent regions that overlap original equivs. + struct slRef *plExpandedEquivs[plCount]; + for (i = 0, bed = bedList; bed != NULL; bed = bed->next, i++) + { + struct slRef *expandedEquivs = NULL; + struct slRef *plEquiv; + for (plEquiv = plEquivRegions[i]; plEquiv != NULL; plEquiv = plEquiv->next) + { + struct slRef *setRef; + for (setRef = plEquiv; setRef != NULL; setRef = setRef->next) + { + struct bed3 *set = setRef->val; + struct bed3 *region; + for (region = set; region != NULL; region = region->next) + { + if (differentString(bed->chrom, region->chrom)) + expandedEquivs = slCat(equivRegionFind(equivRegions, region, lm), + expandedEquivs); + } + } + } + plExpandedEquivs[i] = expandedEquivs; + } + if (!allMappingsHaveMatch(plEquivRegions, plExpandedEquivs, plCount)) + return TRUE; + } + } +return FALSE; +} + +static void dyStringPrintSlNameList(struct dyString *dy, struct slName *list, char *sep) +/* Append list of names with separator to dy; separator appears after every item. Skip empty names. */ +{ +struct slName *item; +for (item = list; item != NULL; item = item->next) + { + if (isNotEmpty(item->name)) + { + dyStringAppend(dy, item->name); + dyStringAppend(dy, sep); + } + } +} + +static void dyStringPrintSlIntList(struct dyString *dy, struct slInt *list, char *sep) +/* Append list of names with separator to dy; separator appears after every item. */ +{ +struct slInt *item; +for (item = list; item != NULL; item = item->next) + { + dyStringPrintf(dy, "%d", item->val); + dyStringAppend(dy, sep); + } +} + +static void appendFrequencies(struct sharedProps *props, struct dyString *dy) +/* Condense the frequency data in props into an autoSql array of compact strings and append to dy. + * Also add autoSql int array of total_count because those may vary between variants. */ +{ +dyStringPrintf(dy, "%d\t", props->freqSourceCount); +int ix; +for (ix = 0; ix < props->freqSourceCount; ix++) + { + boolean freqIsRc = props->freqIsRc[ix]; + struct alObs *obs; + for (obs = props->freqSourceObs[ix]; obs != NULL; obs = obs->next) + { + if (obs != props->freqSourceObs[ix]) + dyStringAppendC(dy, '|'); + if (freqIsRc) + // Store all frequencies on + strand of "preferred top level placement" + reverseComplement(obs->allele, strlen(obs->allele)); + dyStringPrintf(dy, "%s:%d", obs->allele, obs->obsCount); + if (freqIsRc) + // rc again to restore original values + reverseComplement(obs->allele, strlen(obs->allele)); + } + dyStringAppendC(dy, ','); + } +dyStringAppendC(dy, '\t'); +for (ix = 0; ix < props->freqSourceCount; ix++) + { + if (props->freqSourceObs[ix] != NULL) + dyStringPrintf(dy, "%d,", props->freqSourceObs[ix]->totalCount); + else + dyStringAppend(dy, "0,"); + } +} + +static void writeDetails(struct sharedProps *props, struct dyString *dy, FILE *outPropsF) +/* Write props out as tab-sep line to outPropsF. */ +{ +dyStringClear(dy); +dyStringPrintf(dy, "%s\t", props->name); +appendFrequencies(props, dy); +dyStringPrintf(dy, "\t%d\t", slCount(props->soTerms)); +dyStringPrintSlIntList(dy, props->soTerms, ","); +dyStringPrintf(dy, "\t%d\t", slCount(props->clinVarAccs)); +dyStringPrintSlNameList(dy, props->clinVarAccs, ","); +dyStringAppendC(dy, '\t'); +dyStringPrintSlNameList(dy, props->clinVarSigs, ","); +dyStringPrintf(dy, "\t%d\t", slCount(props->submitters)); +dyStringPrintSlNameList(dy, props->submitters, ","); +dyStringPrintf(dy, "\t%d\t", slCount(props->pubMedIds)); +dyStringPrintSlIntList(dy, props->pubMedIds, ","); +dyStringAppendC(dy, '\n'); +fputs(dy->string, outPropsF); +} + +static void dumpLineToFile(char *line, FILE *jsonF) +/* Write line to jsonF, making sure it ends in \n. */ +{ +fputs(line, jsonF); +if (line[strlen(line)-1] != '\n') + fputc('\n', jsonF); +} + +static void updateSeqIsRc(struct slRef *placements, struct lm *lm) +/* Note is_aln_opposite_orientation flag for each mapped sequence, regardless of whether it's + * in the assembly that we're working on; later we can look up sequences from freq reports. */ +{ +hashIntReset(seqIsRc); +struct slRef *plRef; +for (plRef = placements; plRef != NULL; plRef = plRef->next) + { + char *seqId = jsonQueryString(plRef->val, "placement", "seq_id", lm); + boolean isRc = jsonQueryBoolean(plRef->val, "placement", + "placement_annot.is_aln_opposite_orientation", FALSE, lm); + struct hashEl *hel = hashLookup(seqIsRc, seqId); + if (hel) + { + if (isRc != ptToInt(hel->val)) + errAbort("updateSeqIsRc: sequence '%s' has both true and false for " + "placement_annot.is_aln_opposite_orientation", seqId); + } + else + hashAddInt(seqIsRc, seqId, isRc); + } +} + +struct perAssembly +// For each assembly whose mappings we're extracting from JSON, we need the name, +// a jsonQuery path to extract just the mappings for that assembly, and an output file. + { + struct perAssembly *next; + char *name; // assembly_name e.g. GRCh38.p12 + char *placementPath; // jsonQuery path to placements on assembly + char *plIsTopLevelPath; // jsonQuery path to find whether placement is top level in assembly + FILE *outF; // bigDbSnp output file + }; + +static void perAssemblyFree(struct perAssembly **pPa) +/* Close *pPa's outF and free mem. */ +{ +if (pPa) + { + struct perAssembly *pa = *pPa; + freeMem(pa->name); + carefulClose(&pa->outF); + freeMem(pa->placementPath); + freeMem(pa->plIsTopLevelPath); + freez(pPa); + } +} + +static void perAssemblyFreeList(struct perAssembly **pPaList) +/* Close & free a list of perAssembly. */ +{ +if (pPaList) + { + struct perAssembly *el, *next; + for (el = *pPaList; el != NULL; el = next) + { + next = el->next; + perAssemblyFree(&el); + } + } +} + +static void writeMerged(struct jsonElement *top, FILE *outMergedF, struct lm *lm) +/* If any merged IDs are given, write the mappings out to outMergedF. */ +{ +struct slName *mergedIds = jsonQueryStrings(top, "top", "dbsnp1_merges[*].merged_rsid", lm); +if (mergedIds) + { + char *rsId = jsonQueryString(top, "top", "refsnp_id", lm); + struct slName *mergedId; + for (mergedId = mergedIds; mergedId != NULL; mergedId = mergedId->next) + fprintf(outMergedF, "rs%s\trs%s\n", mergedId->name, rsId); + } +} + +static char *getChrom(char *refSeqAcc, char *assembly) +/* Look up "refSeqAcc assembly" in ncGrcToChrom; if found, return chrom, otherwise return + * refSeqAcc for translation later (e.g. NC_012920.1 liftOver to hg19 chrM). Don't free ret. */ +{ +char ncGrc[strlen(refSeqAcc) + 1 + strlen(assembly) + 1]; +safef(ncGrc, sizeof ncGrc, "%s %s", refSeqAcc, assembly); +char *chrom = hashFindVal(ncGrcToChrom, ncGrc); +return chrom ? chrom : refSeqAcc; +} + +static void parseDbSnpJson(char *line, struct perAssembly *assemblyProps, + struct outStreams *outStreams) +/* Each line of dbSNP's file contains one JSON blob describing an rs# variant. Parse JSON, + * extract the bits that we want to keep, print out BED+ and accompanying files. */ +{ +struct lm *lm = lmInit(1<<16); +struct jsonElement *top = jsonParseLm(line, lm); +struct sharedProps *props = NULL; +struct bigDbSnp *bds = NULL; +struct slRef *allPlacements = jsonQueryElement(top, "top", + "primary_snapshot_data.placements_with_allele[*]", + lm); +updateSeqIsRc(allPlacements, lm); +struct dyString *dyScratch = dyStringNew(0); +struct perAssembly *ap; +for (ap = assemblyProps; ap != NULL; ap = ap->next) + { + struct slRef *placements = jsonQueryElement(top, "top", ap->placementPath, lm); + // Filter out scaffolds that are independent in some other assembly, but part of a regular + // chrom in the current assembly + struct slRef *plRef, *plRefNext, *filtered = NULL; + for (plRef = placements; plRef != NULL; plRef = plRefNext) + { + plRefNext = plRef->next; + struct jsonElement *pl = plRef->val; + boolean isTopLevel = jsonQueryBoolean(pl, "placement", ap->plIsTopLevelPath, FALSE, lm); + if (isTopLevel) + slAddHead(&filtered, plRef); + } + slReverse(&filtered); + placements = filtered; + boolean multiMapper = detectMultiMappers(placements, lm); + struct errCatch *errCatch = errCatchNew(); +//#*** NOTE: currently in some cases the alleles are incorrect on alt sequences where the +//#*** alt has a different allele of the same variant (or just different sequence). +//#*** We detect at least some of those by inconsistent pos/del in alt placement spdis, +//#*** but who knows, there could be more. Ultimately we will need to compare assembly +//#*** sequences... unless/until dbSNP fixes their projection onto alts (JIRA VR-176). + if (errCatchStart(errCatch)) + { + for (plRef = placements; plRef != NULL; plRef = plRef->next) + { + struct jsonElement *pl = plRef->val; + if (props == NULL) + { + // Write props only if we have at least one placement. + props = extractProps(top, lm); + writeDetails(props, dyScratch, outStreams->details); + writeMerged(top, outStreams->merged, lm); + } + bds = parsePlacement(pl, props, multiMapper, dyScratch, lm); + bds->chrom = getChrom(bds->chrom, ap->name); + bigDbSnpTabOut(bds, ap->outF); + // Prevent stale bds in errCatch below: + bds = NULL; + } + } + errCatchEnd(errCatch); + if (errCatch->gotError) + { + dumpLineToFile(line, outStreams->failJson); + char *rsId = jsonQueryString(top, "top", "refsnp_id", lm); + char *seqId = bds ? bds->chrom : "NOSEQ"; + fprintf(outStreams->err, "%s\t%s\t%s", rsId, seqId, errCatch->message->string); + } + else if (isNotEmpty(errCatch->message->string)) + warn("%s", errCatch->message->string); + errCatchFree(&errCatch); + } +dyStringFree(&dyScratch); +lmCleanup(&lm); +} + +static struct slName *initFreqSourceOrder() +/* If -freqSourceOrder option was given, extract source names from the comma-sep list + * into globals freqSourceCount and freqSourceOrder[]. */ +{ +char *freqSourceOrderStr = optionVal("freqSourceOrder", NULL); +struct slName *sources = NULL; +if (freqSourceOrderStr) + { + sources = slNameListFromComma(freqSourceOrderStr); + freqSourceCount = slCount(sources); + AllocArray(freqSourceOrder, freqSourceCount); + struct slName *source; + int i; + for (i = 0, source = sources; i < freqSourceCount; i++, source = source->next) + freqSourceOrder[i] = source->name; + } +return sources; +} + +static struct perAssembly *getAssemblyProps(char *assemblyList, char *outRoot) +/* Split comma-sep assemblyList and make placement path and bigDbSnp output file handle for each. */ +{ +struct perAssembly *assemblyProps = NULL; +struct slName *assembly, *assemblies = slNameListFromComma(assemblyList); +for (assembly = assemblies; assembly != NULL; assembly = assembly->next) + { + struct perAssembly *ap; + AllocVar(ap); + ap->name = cloneString(assembly->name); + // Fancy path to get all placements that are on the desired assembly. + struct dyString *dy = dyStringCreate("primary_snapshot_data.placements_with_allele" + "[placement_annot.seq_id_traits_by_assembly[*].assembly_name=%s]", + assembly->name); + ap->placementPath = dyStringCannibalize(&dy); + dy = dyStringCreate("placement_annot.seq_id_traits_by_assembly[assembly_name=%s].is_top_level", + assembly->name); + ap->plIsTopLevelPath = dyStringCannibalize(&dy); + char prefix[2048]; + safef(prefix, sizeof prefix, "%s.%s", outRoot, assembly->name); + ap->outF = openOutFile("%s.bigDbSnp", prefix); + slAddHead(&assemblyProps, ap); + } +return assemblyProps; +} + +static void parseRefSeqToUcsc(char *refSeqToUcsc, struct hash *ncGrcToChrom, + struct hash *ncToTwoBitChrom) +/* Extract mappings of {NC_*, GRC*} to chrom and NC_* to {twoBitFile, chrom}. */ +{ +struct lineFile *lf = lineFileOpen(refSeqToUcsc, TRUE); +char *line; +while (lineFileNextReal(lf, &line)) + { + char *row[5]; + int wordCount = chopTabs(line, row); + lineFileExpectWords(lf, 4, wordCount); + char *acc = row[0]; + char *assembly = row[1]; + char *twoBitFile = row[2]; + char *chrom = row[3]; + if (!regexMatch(acc, "^N[A-Z]_[0-9]+\\.[0-9]+$")) + lineFileAbort(lf, "'%s' does not look like a RefSeq sequence accession " + "(expecting N*_number.number in first column)", acc); + if (!startsWith("GRCh", assembly)) + lineFileAbort(lf, "'%s' does not look like a GRCh* assembly name " + "(expecting GRCh* in second column", assembly); + if (!endsWith(twoBitFile, ".2bit")) + lineFileAbort(lf, "'%s' does not look like a twoBit file " + "(expecting *.2bit in third column)", twoBitFile); + if (!regexMatch(chrom, "^chr[0-9A-Za-z_]+$")) + lineFileAbort(lf, "'%s' does not look like an hg* chrom name " + "(expecting 'chr' + alphanumeric/_ in fourth column)", chrom); + char accAssembly[strlen(acc) + 1 + strlen(assembly) + 1]; + safef(accAssembly, sizeof accAssembly, "%s %s", acc, assembly); + hashAdd(ncGrcToChrom, accAssembly, cloneString(chrom)); + char twoBitChrom[strlen(twoBitFile) + 1 + strlen(chrom) + 1]; + safef(twoBitChrom, sizeof twoBitChrom, "%s:%s", twoBitFile, chrom); + hashAdd(ncToTwoBitChrom, acc, cloneString(twoBitChrom)); + } +lineFileClose(&lf); +} + +void dbSnpJsonToTab(char *assemblyList, char *refSeqToUcsc, char *jsonFile, char *outRoot) +/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */ +{ +struct slName *sources = initFreqSourceOrder(); +struct lineFile *lf = lineFileOpen(jsonFile, TRUE); +struct outStreams *outStreams = outStreamsOpen(outRoot); +struct perAssembly *assemblyProps = getAssemblyProps(assemblyList, outRoot); +seqIsRc = hashNew(0); +ncGrcToChrom = hashNew(0); +ncToTwoBitChrom = hashNew(0); +ncToSeqWin = hashNew(0); +parseRefSeqToUcsc(refSeqToUcsc, ncGrcToChrom, ncToTwoBitChrom); +char *line = NULL; +while (lineFileNext(lf, &line, NULL)) + parseDbSnpJson(line, assemblyProps, outStreams); +lineFileClose(&lf); +perAssemblyFreeList(&assemblyProps); +outStreamsClose(&outStreams); +// Free these up for squeaky-clean valgrind --leak-check +// [well, except for parseOptions and errCatch getStack/getThreadVars, but whatever] +hashFree(&seqIsRc); +hashFree(&ncGrcToChrom); +hashFree(&ncToTwoBitChrom); +hashFree(&ncToSeqWin); +slNameFreeList(&sources); +} + +int main(int argc, char *argv[]) +/* Process command line. */ +{ +optionInit(&argc, argv, options); +if (argc != 5) + usage(); +dbSnpJsonToTab(argv[1], argv[2], argv[3], argv[4]); +return 0; +}