acca3deffc05c4d8d11590a1cf3d893763254712
angie
Thu Oct 31 13:43:05 2019 -0700
dbSnp153: Adding new ucscNotes suggested by Ana Benet: clinvar{Benign,Conflicting,Pathogenic}, rareAll, rareSome. refs #23283
diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index d8ea261..de03bd2 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -1,1738 +1,1780 @@
/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
/* Copyright (C) 2019 The Regents of the University of California */
#include "common.h"
#include "basicBed.h"
#include "bigDbSnp.h"
#include "binRange.h"
#include "dnaseq.h"
#include "dnautil.h"
#include "errCatch.h"
#include "hash.h"
#include "indelShift.h"
#include "iupac.h"
#include "jsonQuery.h"
#include "linefile.h"
#include "obscure.h"
#include "options.h"
#include "regexHelper.h"
#include "soTerm.h"
#include "twoBit.h"
void usage()
/* Explain usage and exit. */
{
errAbort(
"dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated track files\n"
"usage:\n"
" dbSnpJsonToTab assemblyList refSeqToUcsc refsnp-chrN.json[.gz] outRoot\n"
"assemblyList is a comma-separated set of assembly_name values in JSON, e.g. 'GRCh37.p13,GRCh38.p12'\n"
"refSeqToUcsc is tab-sep, 4 columns: RefSeq acc (NC_...), assembly_name, twoBitFile, ucscName (chr...)\n"
"Track files starting with outRoot will be created:\n"
" outRoot.<assembly>.bigDbSnp: BED4+ for main dbSNP track in each assembly in assemblyList\n"
" (final columns need to be added later by bedJoinTabOffset)\n"
" outRootDetails.tab: tab-separated allele frequency counts and other details (for bedJoinTabOffset)\n"
"Additional files are written out for error reporting:\n"
" outRootFailed.json: lines of JSON input that encountered an error, for debugging/reprocessing\n"
" outRootMerged.tab: two columns, obsolete dbSNP rs# ID and current rs# ID\n"
" outRootErrors.tab: lines of error info\n"
"\n"
"\n"
"options:\n"
" -freqSourceOrder=LIST LIST is an ordered, comma-sep list of project names (no spaces)\n"
" for frequency data submitted to dbSNP, e.g. '1000genomes,GnomAD'\n"
" If given, then the bigDbSnp minorAlleleFreq array uses the order.\n"
" Otherwise minorAlleleFreq contains only one frequency, from the\n"
" project with the highest sample count reported for that variant.\n"
" -equivRegions=FILE Each line of FILE contains space-separated acc:start:end regions\n"
" that are equivalent, for example PAR1 or PAR2 on X and Y,\n"
" or an alt sequence and the corresponding portion of its chrom.\n"
" These use RefSeq accesions not UCSC chr names, so that multiple\n"
" assemblies can be described in the same file.\n"
);
}
// Globals
int freqSourceCount = 0;
char **freqSourceOrder = NULL;
struct hash *seqIsRc = NULL;
struct hash *ncGrcToChrom = NULL;
struct hash *ncToTwoBitChrom = NULL;
struct hash *ncToSeqWin = NULL;
/* Command line validation table. */
static struct optionSpec options[] = {
{"freqSourceOrder", OPTION_STRING},
{"equivRegions", OPTION_STRING},
{NULL, 0},
};
struct outStreams
/* Container for various output files that are not assembly-specific */
{
FILE *details; // outRootDetails.tab
FILE *failJson; // outRootFailed.json
FILE *merged; // outRootMerged.tab
FILE *err; // outRootErrors.tab
};
static FILE *openOutFile(char *format, char *outRoot)
/* Open a file for writing, named by format (which must have exactly one %s) and outRoot. */
{
struct dyString *name = dyStringCreate(format, outRoot);
FILE *outF = mustOpen(name->string, "w");
dyStringFree(&name);
return outF;
}
static struct outStreams *outStreamsOpen(char *outRoot)
/* Open output file handles */
{
struct outStreams *outStreams;
AllocVar(outStreams);
outStreams->details = openOutFile("%sDetails.tab", outRoot);
outStreams->failJson = openOutFile("%sFailed.json", outRoot);
outStreams->merged = openOutFile("%sMerged.tab", outRoot);
outStreams->err = openOutFile("%sErrors.tab", outRoot);
return outStreams;
}
static void outStreamsClose(struct outStreams **pOutStreams)
/* Close file handles and free struct. */
{
if (*pOutStreams != NULL)
{
struct outStreams *outStreams = *pOutStreams;
carefulClose(&outStreams->details);
carefulClose(&outStreams->failJson);
carefulClose(&outStreams->merged);
carefulClose(&outStreams->err);
freez(pOutStreams);
}
}
struct spdiBed
/* dbSNP 2.0 variant representation: {Sequence, Position, Deletion, Insertion}, expanded to
* cover ambiguous placement region for indels when applicable. Add chromEnd to get bed3+. */
{
struct spdiBed *next;
char *chrom; // Sequence ID
uint chromStart; // Position: 0-based start
uint chromEnd; // Position + number of deleted bases
char *del; // Deleted sequence (starting at pos) -- or number of deleted bases
char *ins; // Inserted sequence
};
struct alObs
// Unit of variant allele frequency observation
{
struct alObs *next;
char *allele; // Variant allele sequence
int obsCount; // Number of chromosomes on which allele was observed by a project
int totalCount; // Number of chromosomes on which project observed some allele of this variant
};
struct sharedProps
/* Properties of a refsnp that are shared by all mappings to all assemblies. */
{
// Extracted from JSON:
char *name; // rs# ID
int freqSourceCount; // Number of projects reporting allele frequency observations
struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs
struct alObs **freqSourceObs; // Array of per-project observation lists
char **freqSourceMajorAl; // Array of per-project major allele
char **freqSourceMinorAl; // Array of per-project minor allele
double *freqSourceMaf; // Array of per-project minor allele frequencies
int biggestSourceIx; // index of freqSourceObs with highest totalCount
boolean *freqIsRc; // Per-project flag for sequence isRc relative to ptlp
struct slInt *soTerms; // Sequence Ontology functional effect terms
struct slName *submitters; // dbSNP submitter handles
struct slInt *pubMedIds; // PubMed article citations
struct slName *clinVarAccs; // Accessions of ClinVar annotations, i.e. ClinVar RCV*
struct slName *clinVarSigs; // Reported ClinVar significance
enum bigDbSnpClass class; // {snv, mnv, ins, del, delins, identity}
enum soTerm maxFuncImpact; // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic}
int commonCount; // Number of projects reporting minor allele freq >= 1%
int rareCount; // Number of projects reporting minor allele freq < 1%
};
struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm)
/* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm.
* Checking validity of alleles is up to caller, but this does check whether del is a number
* instead of sequence. */
{
struct spdiBed *spdi;
lmAllocVar(lm, spdi);
spdi->chrom = seq;
spdi->chromStart = pos;
spdi->del = del;
spdi->ins = ins;
if (isAllDigits(del))
spdi->chromEnd = pos + atoi(del);
else
spdi->chromEnd = spdi->chromStart + strlen(del);
return spdi;
}
static boolean seqIsNucleotide(const char *seq)
/* Return TRUE unless seq contains a character that is not in the IUPAC nucleotide set. */
{
if (seq == NULL)
return FALSE;
const char *p;
for (p = seq; *p != '\0'; p++)
{
if (!isIupac(*p))
return FALSE;
}
return TRUE;
}
static struct spdiBed *parseSpdis(struct slRef *spdiList, char *name, struct lm *lm)
/* Make sure that all SPDI objects have the consistent seq_id, deleted_sequence and position. */
{
struct spdiBed *spdiBList = NULL;
char *firstSeq = NULL;
int firstPos = -1;
char *firstDel = NULL;
struct slRef *spdiRef;
for (spdiRef = spdiList; spdiRef != NULL; spdiRef = spdiRef->next)
{
struct jsonElement *spdiEl = spdiRef->val;
char *seq = jsonQueryString(spdiEl, "spdi", "seq_id", lm);
int pos = jsonQueryInt(spdiEl, "spdi", "position", -1, lm);
char *del = jsonQueryString(spdiEl, "spdi", "deleted_sequence", lm);
char *ins = jsonQueryString(spdiEl, "spdi", "inserted_sequence", lm);
if (seq == NULL)
errAbort("Missing or null seq_id in spdi for %s", name);
if (pos < 0)
errAbort("Invalid or missing position in spdi for %s", name);
if (del == NULL)
errAbort("Missing or null deleted_sequence in spdi for %s", name);
if (ins == NULL)
errAbort("Missing or null inserted_sequence in spdi for %s", name);
if (! seqIsNucleotide(del))
errAbort("Invalid deleted_sequence '%s' in spdi for %s", del, name);
if (! seqIsNucleotide(ins))
errAbort("Invalid inserted_sequence '%s' in spdi for %s", ins, name);
if (firstSeq == NULL)
{
// First spdi
firstSeq = seq;
firstDel = del;
firstPos = pos;
}
else
{
if (differentString(firstSeq, seq))
errAbort("Mismatching seq_id ('%s' vs '%s') in spdis for %s",
firstSeq, seq, name);
if (differentString(firstDel, del))
errAbort("Mismatching deleted sequence ('%s' vs '%s') in %s spdis for %s",
firstDel, del, seq, name);
if (firstPos != pos)
errAbort("Mismatching position (%d vs %d) in %s spdis for %s",
firstPos, pos, seq, name);
}
slAddHead(&spdiBList, spdiBedNewLm(seq, pos, del, ins, lm));
}
slReverse(&spdiBList);
return spdiBList;
}
static enum bigDbSnpClass varTypeToClass(char *varType)
/* Convert JSON variant_type to bigDbSnp enum form. */
{
if (sameString(varType, "snv"))
return bigDbSnpSnv;
else if (sameString(varType, "mnv"))
return bigDbSnpMnv;
else if (sameString(varType, "ins"))
return bigDbSnpIns;
else if (sameString(varType, "del"))
return bigDbSnpDel;
else if (sameString(varType, "delins"))
return bigDbSnpDelins;
else if (sameString(varType, "identity"))
return bigDbSnpIdentity;
else
errAbort("varTypeToClass: unrecognized variant_type '%s'.", varType);
return bigDbSnpSnv; // we never get here; appease gcc
}
static int freqSourceToIx(char *source, struct slName **pSourceList, struct lm *lm)
/* Convert source name to index using freqSourceOrder if defined, otherwise sourceList
* (expand sourceList if source is not already in the list). */
{
int ix = -1;
if (freqSourceOrder)
{
ix = stringArrayIx(source, freqSourceOrder, freqSourceCount);
if (ix < 0)
errAbort("freqSourceOrder does not contains '%s'", source);
}
else
{
ix = slNameFindIx(*pSourceList, source);
if (ix < 0)
{
ix = slCount(*pSourceList);
slAddTail(pSourceList, lmSlName(lm, source));
}
}
return ix;
}
static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource,
struct spdiBed **retSpdiB, struct lm *lm)
/* Parse one allele observation. */
{
struct alObs *obs = NULL;
lmAllocVar(lm, obs);
char *source = jsonQueryString(obsEl, "alObs", "study_name", lm);
if (source == NULL)
errAbort("parseAlObs: Missing or null study_name");
struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm);
struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm);
obs->allele = spdiB->ins;
obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm);
obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm);
if (obs->obsCount < 0)
errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)",
source, obs->allele);
if (obs->totalCount < 0)
errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)",
source, obs->allele);
if (obs->totalCount == 0)
errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)",
source, obs->allele);
*retSource = source;
*retSpdiB = spdiB;
return obs;
}
static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm)
/* If the reference allele was omitted from frequency reports (dbSNP JIRA VR-178), infer the count
* and add the reference allele to lists of spdis and obs. */
{
int ix;
for (ix = 0; ix < props->freqSourceCount; ix++)
{
if (props->freqSourceSpdis[ix] == NULL)
continue;
boolean gotRef = FALSE;
int sumCounts = 0;
struct spdiBed *spdiB;
struct alObs *obs;
for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
spdiB != NULL && obs != NULL;
spdiB = spdiB->next, obs = obs->next)
{
if (sameString(spdiB->del, spdiB->ins))
{
gotRef = TRUE;
break;
}
sumCounts += obs->obsCount;
}
if (!gotRef)
{
struct spdiBed *firstSpdiB = props->freqSourceSpdis[ix];
struct alObs *firstObs = props->freqSourceObs[ix];
if (sumCounts > firstObs->totalCount)
errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref "
"counts %d", rsId, ix, firstObs->totalCount, sumCounts);
struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart,
firstSpdiB->del, firstSpdiB->del, lm);
struct alObs *obs;
lmAllocVar(lm, obs);
obs->allele = spdiB->del;
obs->obsCount = firstObs->totalCount - sumCounts;
obs->totalCount = firstObs->totalCount;
slAddHead(&props->freqSourceSpdis[ix], spdiB);
slAddHead(&props->freqSourceObs[ix], obs);
}
}
}
static void checkFreqSourceObs(struct sharedProps *props, char *rsId)
/* After frequency allele observations have been sorted into per-project lists,
* make sure the alleles look like [ACGT]+ and the reported total_counts are consistent.
* The sum of obsCounts may be less than total_count (no-calls), but not greater. */
{
int ix;
for (ix = 0; ix < props->freqSourceCount; ix++)
{
int totalCount = 0;
struct alObs *obsList = props->freqSourceObs[ix], *obs;
for (obs = obsList; obs != NULL; obs = obs->next)
{
if (! seqIsNucleotide(obs->allele))
errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId);
if (obs->obsCount < 0)
errAbort("checkFreqSourceObs: invalid obsCount %d for %s", obs->obsCount, rsId);
if (obs->obsCount > obs->totalCount)
errAbort("checkFreqSourceObs: obsCount %d > totalCount %d for %s",
obs->obsCount, obs->totalCount, rsId);
totalCount += obs->obsCount;
}
for (obs = obsList; obs != NULL; obs = obs->next)
{
if (obs->totalCount < totalCount)
errAbort("checkFreqSourceObs: obsCounts sum to %d, but total_count is %d for %s",
totalCount, obs->totalCount, rsId);
else if (obs->totalCount != obsList->totalCount)
errAbort("checkFreqSourceObs: inconsistent totalCount (%d for '%s' != %d for '%s' "
"for %s", obs->totalCount, obs->allele, obsList->totalCount, obsList->allele,
rsId);
}
}
}
static void parseFrequencies(struct slRef *annotationsRef, struct sharedProps *props,
char *rsId, struct lm *lm)
/* Extract per-project allele frequency count data from JSON. Note: the alleles are from the
* reference assembly that the project used; current reference might be different. */
{
struct slRef *obsList = jsonQueryElementList(annotationsRef, "annotations", "frequency[*]", lm);
if (obsList != NULL)
{
if (freqSourceOrder)
{
props->freqSourceCount = freqSourceCount;
lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount);
lmAllocArray(lm, props->freqSourceObs, freqSourceCount);
lmAllocArray(lm, props->freqIsRc, freqSourceCount);
}
else
{
props->freqSourceCount = 0;
int allocSize = slCount(obsList);
lmAllocArray(lm, props->freqSourceSpdis, allocSize);
lmAllocArray(lm, props->freqSourceObs, allocSize);
lmAllocArray(lm, props->freqIsRc, allocSize);
}
struct slName *unorderedSourceList = NULL;
int maxSampleCount = 0;
int biggestSourceIx = 0;
struct slRef *obsRef;
for (obsRef = obsList; obsRef != NULL; obsRef = obsRef->next)
{
struct jsonElement *obsEl = obsRef->val;
char *source = NULL;
struct spdiBed *spdiB = NULL;
struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm);
int ix = freqSourceToIx(source, &unorderedSourceList, lm);
slAddHead(&props->freqSourceSpdis[ix], spdiB);
slAddHead(&props->freqSourceObs[ix], obs);
props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1);
if (props->freqIsRc[ix] < 0)
errAbort("parseFrequencies: Unknown orientation for %s seq_id %s",
rsId, spdiB->chrom);
if (obs->totalCount > maxSampleCount)
{
maxSampleCount = obs->totalCount;
biggestSourceIx = ix;
}
}
if (!freqSourceOrder)
props->freqSourceCount = slCount(unorderedSourceList);
int ix;
for (ix = 0; ix < props->freqSourceCount; ix++)
{
slReverse(&props->freqSourceSpdis[ix]);
slReverse(&props->freqSourceObs[ix]);
}
addMissingRefAllele(props, rsId, lm);
checkFreqSourceObs(props, rsId);
props->biggestSourceIx = biggestSourceIx;
}
}
static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm)
/* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional
* impact, discard duplicates and return as slInt list with highest impact first. */
{
struct slInt *intList = NULL;
if (soTermNames)
{
int termCount = slCount(soTermNames);
enum soTerm termArray[termCount];
struct slName *term;
int i;
for (i = 0, term = soTermNames; term != NULL; i++, term = term->next)
termArray[i] = soTermStringIdToId(term->name);
assert(i == termCount);
qsort(termArray, termCount, sizeof termArray[0], soTermCmp);
for (i = 0; i < termCount; i++)
{
if (i == 0 || termArray[i] != termArray[i-1])
{
struct slInt *intEl;
lmAllocVar(lm, intEl);
intEl->val = termArray[i];
slAddHead(&intList, intEl);
}
}
slReverse(&intList);
}
return intList;
}
struct seqWindow *getChromSeq(struct hash *ncToSeqWin, char *nc, struct hash *ncToTwoBitChrom)
/* Look up nc in ncToSeqWin. If not found, look it up in ncToTwoBitChrom, make a seqWin,
* store it in ncToSeqWin and return it. Return NULL if neither hash has nc. */
{
struct seqWindow *seqWin = hashFindVal(ncToSeqWin, nc);
if (seqWin == NULL)
{
char *twoBitChrom = hashFindVal(ncToTwoBitChrom, nc);
if (twoBitChrom != NULL)
{
struct dnaSeq *dnaSeq = twoBitLoadAll(twoBitChrom);
if (dnaSeq != NULL)
{
// Use memSeqWindow instead of twoBitSeqWindow to avoid keeping a bunch of open
// twoBitFile handles.
// Hmm, a dnaSeqWindow would be more efficient, no cloning and strlen...
// or at the least, memSeqWindowNew could take a size arg.
seqWin = memSeqWindowNew(twoBitChrom, dnaSeq->dna);
dnaSeqFree(&dnaSeq);
hashAdd(ncToSeqWin, nc, seqWin);
}
}
}
return seqWin;
}
static boolean maybeExpandRange(struct spdiBed *spdiBTrim, struct seqWindow *seqWin,
struct spdiBed *spdiB, struct alObs *obs, struct lm *lm)
/* If spdiBTrim is a pure insertion or deletion within a repetitive region,
* then expand spdiB (and obs if non-NULL) to the full repetitive range and return TRUE.
* Trim spdiBTrim to minimal representation (see maybeTrimSpdi) before calling this. */
{
boolean changed = FALSE;
uint refTrimStart = spdiBTrim->chromStart;
uint refTrimEnd = spdiBTrim->chromEnd;
char *altTrim = spdiBTrim->ins;
int refTrimLen = refTrimEnd - refTrimStart;
int altTrimLen = strlen(altTrim);
if (indelShiftIsApplicable(refTrimLen, altTrimLen))
{
// Now shift the minimal representation left and right and compare with spdiB coords.
uint leftStart = refTrimStart, leftEnd = refTrimEnd;
char leftAlt[altTrimLen+1];
safecpy(leftAlt, sizeof leftAlt, altTrim);
int leftShifted = indelShift(seqWin, &leftStart, &leftEnd, leftAlt,
INDEL_SHIFT_NO_MAX, isdLeft);
uint rightStart = refTrimStart, rightEnd = refTrimEnd;
char rightAlt[altTrimLen+1];
safecpy(rightAlt, sizeof rightAlt, altTrim);
int rightShifted = indelShift(seqWin, &rightStart, &rightEnd, rightAlt,
INDEL_SHIFT_NO_MAX, isdRight);
if (leftShifted || rightShifted)
{
changed = TRUE;
// Expand spdiB->del to cover the whole range.,
int newRefLen = rightEnd - leftStart;
spdiB->del = lmAlloc(lm, newRefLen+1);
seqWindowCopy(seqWin, leftStart, newRefLen, spdiB->del, newRefLen+1);
// Concatenate ref bases to the left, altTrim, ref bases to the right -> new alt
char refLeft[leftShifted+1];
seqWindowCopy(seqWin, leftStart, leftShifted, refLeft, sizeof refLeft);
char refRight[rightShifted+1];
seqWindowCopy(seqWin, refTrimEnd, rightShifted, refRight, sizeof refRight);
int newAltLen = leftShifted + altTrimLen + rightShifted;
spdiB->ins = lmAlloc(lm, newAltLen+1);
safef(spdiB->ins, newAltLen+1, "%s%s%s", refLeft, altTrim, refRight);
if (obs)
obs->allele = spdiB->ins;
spdiB->chromStart = leftStart;
spdiB->chromEnd = rightEnd;
}
}
return changed;
}
static void checkSpdisAfterExpansion(struct spdiBed *spdiBList, struct alObs *obsList)
/* Check spdiBList and obsList for inconsistencies; if the reference allele spdiB+obs missed out
* on the expansion then expand them. */
{
boolean alreadyExpandedRefAllele = FALSE;
struct spdiBed *spdiB;
struct alObs *obs;
for (spdiB = spdiBList, obs = obsList;
spdiB != NULL;
spdiB = spdiB->next, obs = obs->next)
{
if (strlen(spdiB->del) != spdiB->chromEnd - spdiB->chromStart)
errAbort("Inconsistent freq spdi coords & del: %s|%d|%d %s/%s",
spdiB->chrom, spdiB->chromStart, spdiB->chromEnd, spdiB->del, spdiB->ins);
if (spdiB != spdiBList)
{
if (differentString(spdiB->chrom, spdiBList->chrom))
errAbort("Differing freq spdiB chrom: '%s' vs. '%s'",
spdiB->chrom, spdiBList->chrom);
if (differentString(spdiB->del, spdiBList->del))
{
// Ref allele can't be shifted, so expand it to alt's shifted range... but only once,
// so we'll catch differences between alt alleles' expanded ranges.
struct spdiBed *spdiBRef = NULL, *spdiBAlt = NULL;
struct alObs *obsRef = NULL;
if (sameString(spdiBList->del, spdiBList->ins))
{
spdiBRef = spdiBList;
spdiBAlt = spdiB;
obsRef = obsList;
}
else if (sameString(spdiB->del, spdiB->ins))
{
spdiBRef = spdiB;
spdiBAlt = spdiBList;
obsRef = obs;
}
if (spdiBRef != NULL && !alreadyExpandedRefAllele)
{
// Copy spdiBAlt's expanded coords and del into spdiBRef
spdiBRef->chromStart = spdiBAlt->chromStart;
spdiBRef->chromEnd = spdiBAlt->chromEnd;
spdiBRef->del = spdiBAlt->del;
// The reference allele's ins is the same as its del, and the corresponding
// obs->allele must match it.
spdiBRef->ins = spdiBRef->del;
obsRef->allele = spdiBRef->ins;
alreadyExpandedRefAllele = TRUE;
}
else
errAbort("Differing freq spdiB: %s|%d|%d %s/%s vs %s|%d|%d %s/%s",
spdiBList->chrom, spdiBList->chromStart, spdiBList->chromEnd,
spdiBList->del, spdiBList->ins,
spdiB->chrom, spdiB->chromStart, spdiB->chromEnd,
spdiB->del, spdiB->ins);
}
}
}
}
static struct spdiBed *maybeTrimSpdi(struct spdiBed *spdiB, struct lm *lm)
/* Return a new spdiBed that is the minimal representation of spdiB. */
{
struct spdiBed *spdiBTrim = spdiBedNewLm(spdiB->chrom, spdiB->chromStart,
lmCloneString(lm, spdiB->del),
lmCloneString(lm, spdiB->ins), lm);
int refTrimLen = 0, altTrimLen = 0;
trimRefAlt(spdiBTrim->del, spdiBTrim->ins, &spdiBTrim->chromStart, &spdiBTrim->chromEnd,
&refTrimLen, &altTrimLen);
return spdiBTrim;
}
static void spdiNormalizeFreq(struct sharedProps *props, struct hash *ncToTwoBitChrom,
struct hash *ncToSeqWin, struct lm *lm)
/* Allele frequency counts come directly from VCF, so although they are encoded as SPDI, they do
* not cover the entire repetitive range. Expand variant alleles on the sequence on which they
* were originally reported; if there is an indel difference between the reporting assembly and
* the mapping assembly then this will resolve the actual alleles correctly (sometimes ref and alt
* may be different for different assemblies). */
{
int sIx;
for (sIx = 0; sIx < props->freqSourceCount; sIx++)
{
struct spdiBed *spdiB, *spdiBList = props->freqSourceSpdis[sIx];
struct alObs *obs, *obsList = props->freqSourceObs[sIx];
assert(slCount(spdiBList) == slCount(obsList));
boolean changed = FALSE;
// For each allele, see if it can be slid around.
for (spdiB = spdiBList, obs = obsList;
spdiB != NULL;
spdiB = spdiB->next, obs = obs->next)
{
assert(sameString(obs->allele, spdiB->ins));
// If spdiB->del can be expanded, then expand both ref and alt in spdiB and obs accordingly.
if (isEmpty(spdiB->del) || isEmpty(spdiB->ins) ||
stringIn(spdiB->del, spdiB->ins) || stringIn(spdiB->ins, spdiB->del))
{
// First trim ref and alt to their minimal version for indelShift to work with.
// There may be nothing to trim because they're usually minimal already and
// stringIn catches false positive like A/TAAC, and that's fine.
struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
// If possible, expand the minimal representation to full SPDI range.
struct seqWindow *seqWin = getChromSeq(ncToSeqWin, spdiB->chrom, ncToTwoBitChrom);
if (seqWin == NULL)
errAbort("No twoBit file and chrom found for freq seq '%s'", spdiB->chrom);
changed |= maybeExpandRange(spdiBTrim, seqWin, spdiB, obs, lm);
}
}
if (changed)
checkSpdisAfterExpansion(spdiBList, obsList);
}
}
static double mafFromSource(struct sharedProps *props, int sourceIx,
char **retMajorAllele, char **retMinorAllele, struct lm *lm)
/* Compare allele counts from source and return minor allele frequency (2nd-highest).
* Set *retMajorAllele to allele with highest count (for comparison with mapped ref alleles).
* If there are no observations from source, return NAN (divide by zero). */
{
char *majorAllele = NULL, *minorAllele = NULL;
int majorAlleleCount = -1, minorAlleleCount = -1;
double totalCount = 0;
struct alObs *obs;
for (obs = props->freqSourceObs[sourceIx]; obs != NULL; obs = obs->next)
{
if (obs->obsCount > majorAlleleCount)
{
minorAllele = majorAllele;
minorAlleleCount = majorAlleleCount;
majorAlleleCount = obs->obsCount;
majorAllele = obs->allele;
}
else if (obs->obsCount > minorAlleleCount)
{
minorAllele = obs->allele;
minorAlleleCount = obs->obsCount;
}
totalCount = obs->totalCount;
}
if (totalCount > 0)
{
if (majorAlleleCount < 0)
errAbort("no major allele observed despite nonzero totalCount...?");
if (minorAlleleCount < 0)
errAbort("no minor allele observed... what should the minorAllele value be??");
}
*retMajorAllele = lmCloneString(lm, majorAllele);
*retMinorAllele = lmCloneString(lm, minorAllele);
// If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test)
return (double)minorAlleleCount / totalCount;
}
static void sortFrequencies(struct sharedProps *props, struct lm *lm)
/* Determine major/minor allele and per-project MAF (if given).
* If -freqOrder option was given, then fill in props->freqSource* arrays in that order.
* Otherwise, choose the MAF from the project with the largest sample count as a fallback. */
{
if (props->freqSourceCount > 0)
{
if (freqSourceOrder != NULL)
{
lmAllocArray(lm, props->freqSourceMajorAl, freqSourceCount);
lmAllocArray(lm, props->freqSourceMinorAl, freqSourceCount);
lmAllocArray(lm, props->freqSourceMaf, freqSourceCount);
int sIx;
for (sIx = 0; sIx < freqSourceCount; sIx++)
{
props->freqSourceMaf[sIx] = mafFromSource(props, sIx,
&props->freqSourceMajorAl[sIx],
&props->freqSourceMinorAl[sIx], lm);
if (isfinite(props->freqSourceMaf[sIx]))
{
if (props->freqSourceMaf[sIx] >= 0.01)
props->commonCount++;
else
props->rareCount++;
}
}
}
else
{
// No freqSourceOrder -- just use source with biggest totalCount for this variant.
props->freqSourceCount = 1;
lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount);
lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount);
lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount);
props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx,
&props->freqSourceMajorAl[0],
&props->freqSourceMinorAl[0], lm);
if (isfinite(props->freqSourceMaf[0]))
{
if (props->freqSourceMaf[0] >= 0.01)
props->commonCount++;
else
props->rareCount++;
}
}
}
}
static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm)
/* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */
{
struct sharedProps *props = NULL;
lmAllocVar(lm, props);
char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm);
char rsName[strlen(rsNumber)+3];
safef(rsName, sizeof rsName, "rs%s", rsNumber);
props->name = lmCloneString(lm, rsName);
char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm);
props->class = varTypeToClass(varType);
struct slRef *annotationsRef = jsonQueryElement(top, "top",
"primary_snapshot_data.allele_annotations[*]", lm);
parseFrequencies(annotationsRef, props, rsName, lm);
spdiNormalizeFreq(props, ncToTwoBitChrom, ncToSeqWin, lm);
sortFrequencies(props, lm);
struct slName *soTermNames = jsonQueryStringList(annotationsRef, "annotations",
"assembly_annotation[*].genes[*].rnas[*]"
".sequence_ontology[*].accession", lm);
soTermNames = slCat(soTermNames,
jsonQueryStringList(annotationsRef, "annotations",
"assembly_annotation[*].genes[*].rnas[*]"
".protein.sequence_ontology[*].accession", lm));
props->soTerms = soTermStringIdToIdList(soTermNames, lm);
props->maxFuncImpact = props->soTerms ? props->soTerms->val : soUnknown;
props->submitters = jsonQueryStrings(top, "top",
"primary_snapshot_data.support[id.type=subsnp].submitter_handle",
lm);
slUniqify(&props->submitters, slNameCmp, NULL);
props->pubMedIds = jsonQueryInts(top, "top", "citations[*]", lm);
props->clinVarAccs = jsonQueryStringList(annotationsRef, "annotations",
"clinical[*].accession_version", lm);
props->clinVarSigs = jsonQueryStringList(annotationsRef, "annotations",
"clinical[*].clinical_significances[*]", lm);
return props;
}
static void setBdsFreqData(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds,
struct dyString *dyUcscNotes, struct lm *lm)
/* Use isRc, props to fill in bds->majorAllele, ->minorAllele, determine whether ref is maj/min.
* Add notes to dyUcscNotes if applicable. */
{
if (props->freqSourceCount > 0)
{
bds->freqSourceCount = props->freqSourceCount;
lmAllocArray(lm, bds->majorAllele, props->freqSourceCount);
lmAllocArray(lm, bds->minorAllele, props->freqSourceCount);
lmAllocArray(lm, bds->minorAlleleFreq, props->freqSourceCount);
char *firstMajorAllele = NULL;
int sIx;
for (sIx = 0; sIx < props->freqSourceCount; sIx++)
{
boolean freqIsRc = props->freqIsRc[sIx];
bds->minorAlleleFreq[sIx] = props->freqSourceMaf[sIx];
bds->majorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMajorAl[sIx]));
bds->minorAllele[sIx] = lmCloneString(lm, emptyForNull(props->freqSourceMinorAl[sIx]));
if (isfinite(bds->minorAlleleFreq[sIx]))
{
if (isRc != freqIsRc)
{
reverseComplement(bds->majorAllele[sIx], strlen(bds->majorAllele[sIx]));
reverseComplement(bds->minorAllele[sIx], strlen(bds->minorAllele[sIx]));
}
char *majorAllele = bds->majorAllele[sIx];
if (differentString(bds->ref, majorAllele))
dyStringAppend(dyUcscNotes, bdsRefIsMinor ",");
if (firstMajorAllele == NULL)
firstMajorAllele = majorAllele;
else if (majorAllele != NULL &&
differentString(firstMajorAllele, majorAllele))
dyStringAppend(dyUcscNotes, bdsDiffMajor ",");
}
}
}
}
static void checkRareRef(struct sharedProps *props, char *ref, boolean isRc,
struct dyString *dyUcscNotes)
/* If frequency counts are reported, test whether the reference has ever been observed,
* and if so whether it is rare, i.e. < 1%. If so, add to dyUcscNotes. */
{
double rareThreshold = 0.01;
boolean haveFrequencies = FALSE, refWasObserved = FALSE;
double maxRefAF = 0.0;
int ix;
for (ix = 0; ix < props->freqSourceCount; ix++)
{
if (props->freqSourceObs[ix] != NULL)
haveFrequencies = TRUE;
boolean freqIsRc = props->freqIsRc[ix];
int refLen = strlen(ref);
char refCpy[refLen+1];
safecpy(refCpy, sizeof refCpy, ref);
if (isRc != freqIsRc)
reverseComplement(refCpy, refLen);
struct spdiBed *spdiB;
struct alObs *obs;
for (spdiB = props->freqSourceSpdis[ix], obs = props->freqSourceObs[ix];
spdiB != NULL && obs != NULL;
spdiB = spdiB->next, obs = obs->next)
{
if (sameString(refCpy, spdiB->ins))
{
if (obs->obsCount > 0)
{
refWasObserved = TRUE;
double refAF = (double)obs->obsCount / (double)obs->totalCount;
if (refAF > maxRefAF)
maxRefAF = refAF;
}
break;
}
}
if (maxRefAF >= rareThreshold)
break;
}
if (haveFrequencies && maxRefAF < rareThreshold)
{
dyStringAppend(dyUcscNotes, bdsRefIsRare ",");
if (!refWasObserved)
dyStringAppend(dyUcscNotes, bdsRefIsSingleton ",");
}
}
static enum bigDbSnpClass bigDbSnpClassFromAlleles(struct bigDbSnp *bds)
/* Return what we expect bigDbSnpClass to be based on allele lengths. */
{
enum bigDbSnpClass alleleClass = 0;
if (bds->altCount == 0)
alleleClass = bigDbSnpIdentity;
else
{
int refLen = strlen(bds->ref);
boolean altLenZero = TRUE;
boolean altLenLtRef = FALSE;
boolean altLenEqRef = FALSE;
boolean altLenGtRef = FALSE;
int i;
for (i = 0; i < bds->altCount; i++)
{
int altLen = strlen(bds->alts[i]);
if (altLen < refLen)
altLenLtRef = TRUE;
else if (altLen == refLen)
altLenEqRef = TRUE;
else if (altLen > refLen)
altLenGtRef = TRUE;
if (altLen > 0)
altLenZero = FALSE;
}
if (!altLenLtRef && altLenEqRef && !altLenGtRef)
{
if (refLen == 1)
alleleClass = bigDbSnpSnv;
else
alleleClass = bigDbSnpMnv;
}
else if (bds->altCount == 0)
alleleClass = bigDbSnpIdentity;
else if (refLen == 0)
alleleClass = bigDbSnpIns;
else if (altLenZero)
alleleClass = bigDbSnpDel;
else
alleleClass = bigDbSnpDelins;
}
return alleleClass;
}
static void checkIndelPlacement(struct bigDbSnp *bds, struct seqWindow *seqWin,
struct lm *lm)
/* If bds is/has a pure insertion or deletion in a repetitive genomic region, make sure that
* bds's SPDI-based coords cover the whole repetitive region as expected. */
{
int i;
for (i = 0; i < bds->altCount; i++)
{
char *alt = bds->alts[i];
if (isEmpty(bds->ref) || isEmpty(alt) ||
stringIn(bds->ref, alt) || stringIn(alt, bds->ref))
{
// First trim ref and alt to their minimal version for indelShift to work with.
// There may be nothing to trim because they're usually minimal already and
// stringIn catches false positive like A/TAAC, and that's fine.
struct spdiBed *spdiB = spdiBedNewLm(bds->chrom, bds->chromStart, bds->ref, alt, lm);
struct spdiBed *spdiBTrim = maybeTrimSpdi(spdiB, lm);
maybeExpandRange(spdiBTrim, seqWin, spdiB, NULL, lm);
if (spdiB->chromStart != bds->chromStart || spdiB->chromEnd != bds->chromEnd)
errAbort("Range of %s (%s|%d|%d ref='%s', alt='%s') "
"could be expanded to %s|%d|%d\n",
bds->name, bds->chrom, bds->chromStart, bds->chromEnd, bds->ref, alt,
bds->chrom, spdiB->chromStart, spdiB->chromEnd);
}
}
}
+static void addClinVarSigs(struct dyString *dyUcscNotes, struct sharedProps *props)
+/* If clinVarSigs indicate benign, pathogenic, or both (conflicting), add ucscNote. */
+{
+boolean isBenign = FALSE, isPathogenic = FALSE;
+struct slName *sig;
+for (sig = props->clinVarSigs; sig != NULL; sig = sig->next)
+ {
+ if (sameString(sig->name, "likely-benign") ||
+ sameString(sig->name, "benign") ||
+ sameString(sig->name, "benign-likely-benign"))
+ {
+ isBenign = TRUE;
+ }
+ else if (sameString(sig->name, "pathogenic") ||
+ sameString(sig->name, "likely-pathogenic") ||
+ sameString(sig->name, "pathogenic-likely-pathogenic"))
+ {
+ isPathogenic = TRUE;
+ }
+ else if (sameString(sig->name, "conflicting-interpretations-of-pathogenicity"))
+ {
+ isBenign = TRUE;
+ isPathogenic = TRUE;
+ break;
+ }
+ }
+if (isBenign && isPathogenic)
+ dyStringAppend(dyUcscNotes, bdsClinvarConflicting ",");
+else if (isBenign)
+ dyStringAppend(dyUcscNotes, bdsClinvarBenign ",");
+else if (isPathogenic)
+ dyStringAppend(dyUcscNotes, bdsClinvarPathogenic ",");
+}
+
static boolean delMismatchesGenome(struct bigDbSnp *bds, struct seqWindow *seqWin)
/* Return TRUE if bds->ref (spdi del) does not match assembly sequence at bds coords.
* Sometimes the genome has an N and dbSNP has a more specific IUPAC character.
* errAbort if there's some other kind of inconsistency. */
{
int refLen = strlen(bds->ref);
uint refStart = bds->chromStart;
uint refEnd = bds->chromEnd;
if (refLen != refEnd - refStart)
errAbort("Inconsistent ref and coords for %s: ref is '%s' (%d bases) "
"but position is %s|%d|%d (%d bases)",
bds->name, bds->ref, refLen, bds->chrom, refStart, refEnd, refEnd - refStart);
char genomeRef[refLen+1];
seqWindowCopy(seqWin, refStart, refLen, genomeRef, sizeof genomeRef);
if (differentString(genomeRef, bds->ref))
{
if (strchr(genomeRef, 'N'))
return TRUE;
else
errAbort("bds->ref '%s' mismatches genome sequence '%s'", bds->ref, genomeRef);
}
return FALSE;
}
static boolean hasAmbiguousFreqAllele(struct sharedProps *props)
/* Return TRUE if any allele from any frequency source contains an IUPAC ambiguous character. */
{
int sIx;
for (sIx = 0; sIx < props->freqSourceCount; sIx++)
{
struct alObs *obs;
for (obs = props->freqSourceObs[sIx]; obs != NULL; obs = obs->next)
if (anyIupac(obs->allele))
return TRUE;
}
return FALSE;
}
static boolean freqHasNonRefAltAllele(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds)
/* Return TRUE if any allele from any frequency source does not match any ref/alt (SPDI) allele. */
{
int sIx;
for (sIx = 0; sIx < props->freqSourceCount; sIx++)
{
boolean freqIsRc = props->freqIsRc[sIx];
struct alObs *obs;
for (obs = props->freqSourceObs[sIx]; obs != NULL; obs = obs->next)
{
int alLen = strlen(obs->allele);
char alCpy[alLen+1];
safecpy(alCpy, sizeof alCpy, obs->allele);
if (freqIsRc != isRc)
reverseComplement(alCpy, alLen);
boolean foundIt = FALSE;
if (sameString(alCpy, bds->ref))
foundIt = TRUE;
else
{
int i;
for (i = 0; i < bds->altCount; i++)
if (sameString(alCpy, bds->alts[i]))
{
foundIt = TRUE;
break;
}
}
if (!foundIt)
return TRUE;
}
}
return FALSE;
}
static void addUcscNotes(struct sharedProps *props, boolean isRc, boolean isMultiMapper,
struct bigDbSnp *bds, struct seqWindow *seqWin,
struct dyString *dyUcscNotes, struct lm *lm)
/* Record interesting conditions if applicable. */
{
if (props->class != bigDbSnpClassFromAlleles(bds))
dyStringAppend(dyUcscNotes, bdsClassMismatch ",");
if (props->clinVarAccs != NULL)
dyStringAppend(dyUcscNotes, bdsClinvar ",");
+addClinVarSigs(dyUcscNotes, props);
if (props->commonCount > 0)
{
dyStringAppend(dyUcscNotes, bdsCommonSome ",");
if (props->rareCount == 0)
dyStringAppend(dyUcscNotes, bdsCommonAll ",");
+ else
+ dyStringAppend(dyUcscNotes, bdsRareSome ",");
+ }
+else if (props->rareCount > 0 || props->freqSourceCount == 0)
+ {
+ dyStringAppend(dyUcscNotes, bdsRareSome ",");
+ dyStringAppend(dyUcscNotes, bdsRareAll ",");
}
if (isRc)
dyStringAppend(dyUcscNotes, bdsRevStrand ",");
if (isMultiMapper)
dyStringAppend(dyUcscNotes, bdsMultiMap ",");
checkRareRef(props, bds->ref, isRc, dyUcscNotes);
if (delMismatchesGenome(bds, seqWin))
dyStringAppend(dyUcscNotes, bdsRefMismatch ",");
if (anyIupac(bds->ref))
dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ",");
int i;
for (i = 0; i < bds->altCount; i++)
if (anyIupac(bds->alts[i]))
dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ",");
if (hasAmbiguousFreqAllele(props))
dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ",");
if (freqHasNonRefAltAllele(props, isRc, bds))
dyStringAppend(dyUcscNotes, bdsFreqNotRefAlt ",");
bds->ucscNotes = lmCloneString(lm, dyUcscNotes->string);
}
static void calcShiftBases(struct bigDbSnp *bds)
/* Return the max length of uncertain placement from all alt alleles. */
{
int maxBasesTrimmed = 0;
int refLen = bds->chromEnd - bds->chromStart;
char refTrim[refLen+1];
int i;
for (i = 0; i < bds->altCount; i++)
{
// Make copies of ref and alt, trim the copies, measure the change.
int refTrimLen = refLen;
safecpy(refTrim, sizeof refTrim, bds->ref);
int altTrimLen = strlen(bds->alts[i]);
char altTrim[altTrimLen+1];
safecpy(altTrim, sizeof altTrim, bds->alts[i]);
uint start = bds->chromStart, end = bds->chromEnd;
trimRefAlt(refTrim, altTrim, &start, &end, &refTrimLen, &altTrimLen);
int basesTrimmed = refLen - refTrimLen;
if (basesTrimmed > maxBasesTrimmed)
maxBasesTrimmed = basesTrimmed;
if (maxBasesTrimmed >= refLen)
break;
}
bds->shiftBases = min(refLen, maxBasesTrimmed);
}
static struct bigDbSnp *parsePlacement(struct jsonElement *pl, struct sharedProps *props,
boolean isMultiMapper, struct dyString *dy, struct lm *lm)
/* Return a bigDbSnp with information about one placement (mapping). */
{
struct bigDbSnp *bds;
lmAllocVar(lm, bds);
struct slRef *allSpdiRef = jsonQueryElement(pl, "placement", "alleles[*].allele.spdi", lm);
bds->chrom = jsonQueryString(pl, "placement", "seq_id", lm);
if (bds->chrom == NULL)
errAbort("Missing or NULL seq_id for %s", props->name);
struct spdiBed *spdiBList = parseSpdis(allSpdiRef, props->name, lm);
if (!sameString(bds->chrom, spdiBList->chrom))
errAbort("seq_id mismatch: placement has '%s' but spdi has '%s for %s'",
bds->chrom, spdiBList->chrom, props->name);
bds->chromStart = spdiBList->chromStart;
bds->chromEnd = spdiBList->chromEnd;
bds->name = props->name;
bds->ref = spdiBList->del;
// spdiBList may or may not include no-change allele (del == ins)
lmAllocArray(lm, bds->alts, slCount(spdiBList));
struct spdiBed *spdiB;
for (spdiB = spdiBList; spdiB != NULL; spdiB = spdiB->next)
{
if (!sameString(bds->ref, spdiB->ins))
bds->alts[bds->altCount++] = spdiB->ins;
}
calcShiftBases(bds);
boolean isRc = jsonQueryBoolean(pl, "placement",
"placement_annot.is_aln_opposite_orientation", FALSE, lm);
dyStringClear(dy);
setBdsFreqData(props, isRc, bds, dy, lm);
bds->maxFuncImpact = props->maxFuncImpact;
bds->class = props->class;
struct seqWindow *seqWin = getChromSeq(ncToSeqWin, bds->chrom, ncToTwoBitChrom);
checkIndelPlacement(bds, seqWin, lm);
addUcscNotes(props, isRc, isMultiMapper, bds, seqWin, dy, lm);
return bds;
}
static void equivRegionAdd(struct hash *equivRegions, struct bed3 *region, struct bed3 *regionList)
/* Associate regionList with region by looking up the binKeeper for region->chrom in equivRegions
* and adding regionList to the binKeeper for region start & end. */
{
struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
if (bk == NULL)
{
bk = binKeeperNew(0, BINRANGE_MAXEND_512M);
hashAdd(equivRegions, region->chrom, bk);
}
binKeeperAdd(bk, region->chromStart, region->chromEnd, regionList);
}
static struct hash *getEquivRegions()
/* If not done already, read -equivRegions file if specified and parse it into data structure
* of equivalent regions for detecting legitimate cases of multiple mappings. */
{
static struct hash *equivRegions = NULL;
static boolean alreadyInited = FALSE;
if (!alreadyInited)
{
char *equivRegionsFile = optionVal("equivRegions", NULL);
if (isNotEmpty(equivRegionsFile))
{
equivRegions = hashNew(0);
struct lineFile *lf = lineFileOpen(equivRegionsFile, TRUE);
char *line;
while (lineFileNextReal(lf, &line))
{
// Build a list of equivalent regions; hash each region->chrom to list.
char *words[128];
int wordCount = chopLine(line, words);
if (wordCount >= ArraySize(words))
lineFileAbort(lf, "getEquivRegions: too many words per line (%d)", wordCount);
struct bed3 *regionList = NULL;
int i;
for (i = 0; i < wordCount; i++)
{
char *chrom = cloneString(words[i]);
char *colon = strchr(chrom, ':');
if (colon)
{
char *startStr = colon+1;
char *nextColon = strchr(startStr, ':');
if (! nextColon)
lineFileAbort(lf, "getEquivRegions: expected each word to have "
"0 or 2 colons, but got '%s'", words[i]);
*colon = '\0';
*nextColon = '\0';
char *endStr = nextColon+1;
if (!isAllDigits(startStr) || !isAllDigits(endStr))
lineFileAbort(lf, "getEquivRegions: expected a number following each colon, "
"but got '%s'", words[i]);
int start = atoi(startStr);
int end = atoi(endStr);
slAddHead(®ionList, bed3New(chrom, start, end));
}
else
slAddHead(®ionList, bed3New(chrom, 0, 0));
}
// Sort so that a main chromosome goes before alts/fixes.
slSort(®ionList, bedCmp);
// Associate all regions with the same list. we can use pointer equality later.
struct bed3 *region;
for (region = regionList; region != NULL; region = region->next)
equivRegionAdd(equivRegions, region, regionList);
}
lineFileClose(&lf);
}
else
warn("-equivRegions=FILE not specified; will not be able to detect variants "
"that are mapped to multiple non-equivalent regions.");
alreadyInited = TRUE;
}
return equivRegions;
}
static struct slRef *equivRegionFind(struct hash *equivRegions, struct bed3 *region,
struct lm *lm)
/* Return ref list of equivalent region lists that overlap region. */
{
struct binElement *beList = NULL;
struct binKeeper *bk = hashFindVal(equivRegions, region->chrom);
if (bk)
{
int start = region->chromStart;
int end = region->chromEnd;
// binKeeperFind can't find overlap with 0-length insertions (start == end) so expand those:
if (start == end)
{
start--;
end++;
}
beList = binKeeperFind(bk, start, end);
}
struct slRef *refList = NULL;
struct binElement *binEl;
for (binEl = beList; binEl != NULL; binEl = binEl->next)
lmRefAdd(lm, &refList, binEl->val);
slReverse(&refList);
slFreeList(&beList);
return refList;
}
static struct bed3 *bed3FromPlRef(struct slRef *plRef, struct lm *lm)
/* Extract a bed3 from the first SPDI found in the first placement in plRef. */
{
struct jsonElement *pl = plRef->val;
struct slRef *spdiRef = jsonQueryElement(pl, "placement", "alleles[0].allele.spdi", lm);
struct jsonElement *spdiEl = spdiRef->val;
struct bed3 *region;
lmAllocVar(lm, region);
region->chrom = jsonQueryString(spdiEl, "first spdi", "seq_id", lm);
region->chromStart = jsonQueryInt(spdiEl, "first spdi", "position", -1, lm);
char *ref = jsonQueryString(spdiEl, "first spdi", "deleted_sequence", lm);
region->chromEnd = region->chromStart + strlen(ref);
return region;
}
static struct bed3 *bed3ListFromPlRef(struct slRef *plRefList, struct lm *lm)
/* Return a list of bed3 derived from placement SPDIs in plRef. */
{
struct bed3 *list = NULL;
struct slRef *plRef;
for (plRef = plRefList; plRef != NULL; plRef = plRef->next)
slAddHead(&list, bed3FromPlRef(plRef, lm));
slReverse(&list);
return list;
}
static boolean uniqChroms(struct bed3 *bedList)
/* Return TRUE if each item of *sorted* bedList has a different chrom. */
{
boolean uniq = TRUE;
if (bedList != NULL)
{
struct bed3 *bed;
for (bed = bedList; bed->next != NULL; bed = bed->next)
if (sameString(bed->chrom, bed->next->chrom))
{
uniq = FALSE;
break;
}
}
return uniq;
}
static boolean allMappingsHaveMatch(struct slRef *equivRegionsA[], struct slRef *equivRegionsB[],
int plCount)
// Return true if each placement's list of equivRegion sets in A has at least one set that
// matches at least one set from some other placement's list in B. A and B may/may not be the same.
{
int i;
for (i = 0; i < plCount; i++)
{
struct slRef *plEquivList = equivRegionsA[i];
boolean foundMatch = FALSE;
int j;
for (j = 0; j < plCount; j++)
{
if (j == i)
continue;
struct slRef *otherEquivList = equivRegionsB[j];
struct slRef *plEquiv;
for (plEquiv = plEquivList; plEquiv != NULL; plEquiv = plEquiv->next)
{
struct slRef *otherEquiv;
for (otherEquiv = otherEquivList; otherEquiv; otherEquiv = otherEquiv->next)
{
// We can use pointer equality because getEquivRegions assigns same list
// pointer to each region in list.
if (plEquiv->val == otherEquiv->val)
{
foundMatch = TRUE;
break;
}
}
}
}
if (! foundMatch)
return FALSE;
}
return TRUE;
}
static boolean detectMultiMappers(struct slRef *placements, struct lm *lm)
/* Return TRUE if placements map to more than one genomic locus, aside from legitimate cases
* like the PAR regions on X and Y, or alt/fix sequences and their corresponding chromosomal
* locations. */
{
struct hash *equivRegions = getEquivRegions();
int plCount = slCount(placements);
if (equivRegions && plCount > 1)
{
// Get a bed3 list of placements and make sure there aren't multiple mappings on the same chrom.
struct bed3 *bedList = bed3ListFromPlRef(placements, lm);
slSort(&bedList, bedCmp);
if (!uniqChroms(bedList))
return TRUE;
// Gather up equivRegions lists for all placements. If any placement has no equivRegions,
// then we're done.
struct slRef *plEquivRegions[plCount];
struct bed3 *bed;
int i;
for (i = 0, bed = bedList; bed != NULL; bed = bed->next, i++)
{
struct slRef *listOfLists = equivRegionFind(equivRegions, bed, lm);
if (listOfLists == NULL)
return TRUE;
plEquivRegions[i] = listOfLists;
}
if (!allMappingsHaveMatch(plEquivRegions, plEquivRegions, plCount))
{
// Sometimes the test for equivalent regions fails because all regions point to a
// common location -- but that location does not have a mapping, so its suite of
// regions are not in the mix. For example, a variant is mapped to 4 alts of the
// same chrom -- but not to the chrom itself, so all regions point to the main chrom,
// but none of the 4 mappings' regions are in common with each other.
// So try harder before giving up: add equivalent regions that overlap original equivs.
struct slRef *plExpandedEquivs[plCount];
for (i = 0, bed = bedList; bed != NULL; bed = bed->next, i++)
{
struct slRef *expandedEquivs = NULL;
struct slRef *plEquiv;
for (plEquiv = plEquivRegions[i]; plEquiv != NULL; plEquiv = plEquiv->next)
{
struct slRef *setRef;
for (setRef = plEquiv; setRef != NULL; setRef = setRef->next)
{
struct bed3 *set = setRef->val;
struct bed3 *region;
for (region = set; region != NULL; region = region->next)
{
if (differentString(bed->chrom, region->chrom))
expandedEquivs = slCat(equivRegionFind(equivRegions, region, lm),
expandedEquivs);
}
}
}
plExpandedEquivs[i] = expandedEquivs;
}
if (!allMappingsHaveMatch(plEquivRegions, plExpandedEquivs, plCount))
return TRUE;
}
}
return FALSE;
}
static void dyStringPrintSlNameList(struct dyString *dy, struct slName *list, char *sep)
/* Append list of names with separator to dy; separator appears after every item. Skip empty names. */
{
struct slName *item;
for (item = list; item != NULL; item = item->next)
{
if (isNotEmpty(item->name))
{
dyStringAppend(dy, item->name);
dyStringAppend(dy, sep);
}
}
}
static void dyStringPrintSlIntList(struct dyString *dy, struct slInt *list, char *sep)
/* Append list of names with separator to dy; separator appears after every item. */
{
struct slInt *item;
for (item = list; item != NULL; item = item->next)
{
dyStringPrintf(dy, "%d", item->val);
dyStringAppend(dy, sep);
}
}
static void appendFrequencies(struct sharedProps *props, struct dyString *dy)
/* Condense the frequency data in props into an autoSql array of compact strings and append to dy.
* Also add autoSql int array of total_count because those may vary between variants. */
{
dyStringPrintf(dy, "%d\t", props->freqSourceCount);
int ix;
for (ix = 0; ix < props->freqSourceCount; ix++)
{
boolean freqIsRc = props->freqIsRc[ix];
struct alObs *obs;
for (obs = props->freqSourceObs[ix]; obs != NULL; obs = obs->next)
{
if (obs != props->freqSourceObs[ix])
dyStringAppendC(dy, '|');
if (freqIsRc)
// Store all frequencies on + strand of "preferred top level placement"
reverseComplement(obs->allele, strlen(obs->allele));
dyStringPrintf(dy, "%s:%d", obs->allele, obs->obsCount);
if (freqIsRc)
// rc again to restore original values
reverseComplement(obs->allele, strlen(obs->allele));
}
dyStringAppendC(dy, ',');
}
dyStringAppendC(dy, '\t');
for (ix = 0; ix < props->freqSourceCount; ix++)
{
if (props->freqSourceObs[ix] != NULL)
dyStringPrintf(dy, "%d,", props->freqSourceObs[ix]->totalCount);
else
dyStringAppend(dy, "0,");
}
}
static void writeDetails(struct sharedProps *props, struct dyString *dy, FILE *outPropsF)
/* Write props out as tab-sep line to outPropsF. */
{
dyStringClear(dy);
dyStringPrintf(dy, "%s\t", props->name);
appendFrequencies(props, dy);
dyStringPrintf(dy, "\t%d\t", slCount(props->soTerms));
dyStringPrintSlIntList(dy, props->soTerms, ",");
dyStringPrintf(dy, "\t%d\t", slCount(props->clinVarAccs));
dyStringPrintSlNameList(dy, props->clinVarAccs, ",");
dyStringAppendC(dy, '\t');
dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
dyStringPrintf(dy, "\t%d\t", slCount(props->submitters));
dyStringPrintSlNameList(dy, props->submitters, ",");
dyStringPrintf(dy, "\t%d\t", slCount(props->pubMedIds));
dyStringPrintSlIntList(dy, props->pubMedIds, ",");
dyStringAppendC(dy, '\n');
fputs(dy->string, outPropsF);
}
static void dumpLineToFile(char *line, FILE *jsonF)
/* Write line to jsonF, making sure it ends in \n. */
{
fputs(line, jsonF);
if (line[strlen(line)-1] != '\n')
fputc('\n', jsonF);
}
static void updateSeqIsRc(struct slRef *placements, struct lm *lm)
/* Note is_aln_opposite_orientation flag for each mapped sequence, regardless of whether it's
* in the assembly that we're working on; later we can look up sequences from freq reports. */
{
hashIntReset(seqIsRc);
struct slRef *plRef;
for (plRef = placements; plRef != NULL; plRef = plRef->next)
{
char *seqId = jsonQueryString(plRef->val, "placement", "seq_id", lm);
boolean isRc = jsonQueryBoolean(plRef->val, "placement",
"placement_annot.is_aln_opposite_orientation", FALSE, lm);
struct hashEl *hel = hashLookup(seqIsRc, seqId);
if (hel)
{
if (isRc != ptToInt(hel->val))
errAbort("updateSeqIsRc: sequence '%s' has both true and false for "
"placement_annot.is_aln_opposite_orientation", seqId);
}
else
hashAddInt(seqIsRc, seqId, isRc);
}
}
struct perAssembly
// For each assembly whose mappings we're extracting from JSON, we need the name,
// a jsonQuery path to extract just the mappings for that assembly, and an output file.
{
struct perAssembly *next;
char *name; // assembly_name e.g. GRCh38.p12
char *placementPath; // jsonQuery path to placements on assembly
char *plIsTopLevelPath; // jsonQuery path to find whether placement is top level in assembly
FILE *outF; // bigDbSnp output file
};
static void perAssemblyFree(struct perAssembly **pPa)
/* Close *pPa's outF and free mem. */
{
if (pPa)
{
struct perAssembly *pa = *pPa;
freeMem(pa->name);
carefulClose(&pa->outF);
freeMem(pa->placementPath);
freeMem(pa->plIsTopLevelPath);
freez(pPa);
}
}
static void perAssemblyFreeList(struct perAssembly **pPaList)
/* Close & free a list of perAssembly. */
{
if (pPaList)
{
struct perAssembly *el, *next;
for (el = *pPaList; el != NULL; el = next)
{
next = el->next;
perAssemblyFree(&el);
}
}
}
static void writeMerged(struct jsonElement *top, FILE *outMergedF, struct lm *lm)
/* If any merged IDs are given, write the mappings out to outMergedF. */
{
struct slName *mergedIds = jsonQueryStrings(top, "top", "dbsnp1_merges[*].merged_rsid", lm);
if (mergedIds)
{
char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
struct slName *mergedId;
for (mergedId = mergedIds; mergedId != NULL; mergedId = mergedId->next)
fprintf(outMergedF, "rs%s\trs%s\n", mergedId->name, rsId);
}
}
static char *getChrom(char *refSeqAcc, char *assembly)
/* Look up "refSeqAcc assembly" in ncGrcToChrom; if found, return chrom, otherwise return
* refSeqAcc for translation later (e.g. NC_012920.1 liftOver to hg19 chrM). Don't free ret. */
{
char ncGrc[strlen(refSeqAcc) + 1 + strlen(assembly) + 1];
safef(ncGrc, sizeof ncGrc, "%s %s", refSeqAcc, assembly);
char *chrom = hashFindVal(ncGrcToChrom, ncGrc);
return chrom ? chrom : refSeqAcc;
}
static void parseDbSnpJson(char *line, struct perAssembly *assemblyProps,
struct outStreams *outStreams)
/* Each line of dbSNP's file contains one JSON blob describing an rs# variant. Parse JSON,
* extract the bits that we want to keep, print out BED+ and accompanying files. */
{
struct lm *lm = lmInit(1<<16);
struct jsonElement *top = jsonParseLm(line, lm);
struct sharedProps *props = NULL;
struct bigDbSnp *bds = NULL;
struct slRef *allPlacements = jsonQueryElement(top, "top",
"primary_snapshot_data.placements_with_allele[*]",
lm);
updateSeqIsRc(allPlacements, lm);
struct dyString *dyScratch = dyStringNew(0);
struct perAssembly *ap;
for (ap = assemblyProps; ap != NULL; ap = ap->next)
{
struct slRef *placements = jsonQueryElement(top, "top", ap->placementPath, lm);
// Filter out scaffolds that are independent in some other assembly, but part of a regular
// chrom in the current assembly
struct slRef *plRef, *plRefNext, *filtered = NULL;
for (plRef = placements; plRef != NULL; plRef = plRefNext)
{
plRefNext = plRef->next;
struct jsonElement *pl = plRef->val;
boolean isTopLevel = jsonQueryBoolean(pl, "placement", ap->plIsTopLevelPath, FALSE, lm);
if (isTopLevel)
slAddHead(&filtered, plRef);
}
slReverse(&filtered);
placements = filtered;
boolean multiMapper = detectMultiMappers(placements, lm);
struct errCatch *errCatch = errCatchNew();
//#*** NOTE: currently in some cases the alleles are incorrect on alt sequences where the
//#*** alt has a different allele of the same variant (or just different sequence).
//#*** We detect at least some of those by inconsistent pos/del in alt placement spdis,
//#*** but who knows, there could be more. Ultimately we will need to compare assembly
//#*** sequences... unless/until dbSNP fixes their projection onto alts (JIRA VR-176).
if (errCatchStart(errCatch))
{
for (plRef = placements; plRef != NULL; plRef = plRef->next)
{
struct jsonElement *pl = plRef->val;
if (props == NULL)
{
// Write props only if we have at least one placement.
props = extractProps(top, lm);
writeDetails(props, dyScratch, outStreams->details);
writeMerged(top, outStreams->merged, lm);
}
bds = parsePlacement(pl, props, multiMapper, dyScratch, lm);
bds->chrom = getChrom(bds->chrom, ap->name);
bigDbSnpTabOut(bds, ap->outF);
// Prevent stale bds in errCatch below:
bds = NULL;
}
}
errCatchEnd(errCatch);
if (errCatch->gotError)
{
dumpLineToFile(line, outStreams->failJson);
char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
char *seqId = bds ? bds->chrom : "NOSEQ";
fprintf(outStreams->err, "%s\t%s\t%s", rsId, seqId, errCatch->message->string);
}
else if (isNotEmpty(errCatch->message->string))
warn("%s", errCatch->message->string);
errCatchFree(&errCatch);
}
dyStringFree(&dyScratch);
lmCleanup(&lm);
}
static struct slName *initFreqSourceOrder()
/* If -freqSourceOrder option was given, extract source names from the comma-sep list
* into globals freqSourceCount and freqSourceOrder[]. */
{
char *freqSourceOrderStr = optionVal("freqSourceOrder", NULL);
struct slName *sources = NULL;
if (freqSourceOrderStr)
{
sources = slNameListFromComma(freqSourceOrderStr);
freqSourceCount = slCount(sources);
AllocArray(freqSourceOrder, freqSourceCount);
struct slName *source;
int i;
for (i = 0, source = sources; i < freqSourceCount; i++, source = source->next)
freqSourceOrder[i] = source->name;
}
return sources;
}
static struct perAssembly *getAssemblyProps(char *assemblyList, char *outRoot)
/* Split comma-sep assemblyList and make placement path and bigDbSnp output file handle for each. */
{
struct perAssembly *assemblyProps = NULL;
struct slName *assembly, *assemblies = slNameListFromComma(assemblyList);
for (assembly = assemblies; assembly != NULL; assembly = assembly->next)
{
struct perAssembly *ap;
AllocVar(ap);
ap->name = cloneString(assembly->name);
// Fancy path to get all placements that are on the desired assembly.
struct dyString *dy = dyStringCreate("primary_snapshot_data.placements_with_allele"
"[placement_annot.seq_id_traits_by_assembly[*].assembly_name=%s]",
assembly->name);
ap->placementPath = dyStringCannibalize(&dy);
dy = dyStringCreate("placement_annot.seq_id_traits_by_assembly[assembly_name=%s].is_top_level",
assembly->name);
ap->plIsTopLevelPath = dyStringCannibalize(&dy);
char prefix[2048];
safef(prefix, sizeof prefix, "%s.%s", outRoot, assembly->name);
ap->outF = openOutFile("%s.bigDbSnp", prefix);
slAddHead(&assemblyProps, ap);
}
return assemblyProps;
}
static void parseRefSeqToUcsc(char *refSeqToUcsc, struct hash *ncGrcToChrom,
struct hash *ncToTwoBitChrom)
/* Extract mappings of {NC_*, GRC*} to chrom and NC_* to {twoBitFile, chrom}. */
{
struct lineFile *lf = lineFileOpen(refSeqToUcsc, TRUE);
char *line;
while (lineFileNextReal(lf, &line))
{
char *row[5];
int wordCount = chopTabs(line, row);
lineFileExpectWords(lf, 4, wordCount);
char *acc = row[0];
char *assembly = row[1];
char *twoBitFile = row[2];
char *chrom = row[3];
if (!regexMatch(acc, "^N[A-Z]_[0-9]+\\.[0-9]+$"))
lineFileAbort(lf, "'%s' does not look like a RefSeq sequence accession "
"(expecting N*_number.number in first column)", acc);
if (!startsWith("GRCh", assembly))
lineFileAbort(lf, "'%s' does not look like a GRCh* assembly name "
"(expecting GRCh* in second column", assembly);
if (!endsWith(twoBitFile, ".2bit"))
lineFileAbort(lf, "'%s' does not look like a twoBit file "
"(expecting *.2bit in third column)", twoBitFile);
if (!regexMatch(chrom, "^chr[0-9A-Za-z_]+$"))
lineFileAbort(lf, "'%s' does not look like an hg* chrom name "
"(expecting 'chr' + alphanumeric/_ in fourth column)", chrom);
char accAssembly[strlen(acc) + 1 + strlen(assembly) + 1];
safef(accAssembly, sizeof accAssembly, "%s %s", acc, assembly);
hashAdd(ncGrcToChrom, accAssembly, cloneString(chrom));
char twoBitChrom[strlen(twoBitFile) + 1 + strlen(chrom) + 1];
safef(twoBitChrom, sizeof twoBitChrom, "%s:%s", twoBitFile, chrom);
hashAdd(ncToTwoBitChrom, acc, cloneString(twoBitChrom));
}
lineFileClose(&lf);
}
void dbSnpJsonToTab(char *assemblyList, char *refSeqToUcsc, char *jsonFile, char *outRoot)
/* dbSnpJsonToTab - Extract fields from dbSNP JSON files, write out as tab-separated BED+. */
{
struct slName *sources = initFreqSourceOrder();
struct lineFile *lf = lineFileOpen(jsonFile, TRUE);
struct outStreams *outStreams = outStreamsOpen(outRoot);
struct perAssembly *assemblyProps = getAssemblyProps(assemblyList, outRoot);
seqIsRc = hashNew(0);
ncGrcToChrom = hashNew(0);
ncToTwoBitChrom = hashNew(0);
ncToSeqWin = hashNew(0);
parseRefSeqToUcsc(refSeqToUcsc, ncGrcToChrom, ncToTwoBitChrom);
char *line = NULL;
while (lineFileNext(lf, &line, NULL))
parseDbSnpJson(line, assemblyProps, outStreams);
lineFileClose(&lf);
perAssemblyFreeList(&assemblyProps);
outStreamsClose(&outStreams);
// Free these up for squeaky-clean valgrind --leak-check
// [well, except for parseOptions and errCatch getStack/getThreadVars, but whatever]
hashFree(&seqIsRc);
hashFree(&ncGrcToChrom);
hashFree(&ncToTwoBitChrom);
hashFree(&ncToSeqWin);
slNameFreeList(&sources);
}
int main(int argc, char *argv[])
/* Process command line. */
{
optionInit(&argc, argv, options);
if (argc != 5)
usage();
dbSnpJsonToTab(argv[1], argv[2], argv[3], argv[4]);
return 0;
}