726a4c9c82592c662ae6b8c715e67b783c95beb3 angie Mon Nov 18 13:45:42 2019 -0800 Instead of dropping rs IDs that have incomplete frequency data, add two new ucscNotes: freqIncomplete and freqNotMapped. refs #23283 diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c index e42d34c..cb46c1d 100644 --- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c +++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c @@ -144,30 +144,32 @@ struct alObs **freqSourceObs; // Array of per-project observation lists char **freqSourceMajorAl; // Array of per-project major allele char **freqSourceMinorAl; // Array of per-project minor allele double *freqSourceMaf; // Array of per-project minor allele frequencies int biggestSourceIx; // index of freqSourceObs with highest totalCount boolean *freqIsRc; // Per-project flag for sequence isRc relative to ptlp struct slInt *soTerms; // Sequence Ontology functional effect terms struct slName *submitters; // dbSNP submitter handles struct slInt *pubMedIds; // PubMed article citations struct slName *clinVarAccs; // Accessions of ClinVar annotations, i.e. ClinVar RCV* struct slName *clinVarSigs; // Reported ClinVar significance enum bigDbSnpClass class; // {snv, mnv, ins, del, delins, identity} enum soTerm maxFuncImpact; // {codingNonSyn, splice, codingSyn, intron, upstream, downstream, intergenic} int commonCount; // Number of projects reporting minor allele freq >= 1% int rareCount; // Number of projects reporting minor allele freq < 1% + boolean freqIncomplete; // At least one project provided no alt allele (only del==ins) + boolean freqNotMapped; // At least one project's report was not mapped to own asmbly }; struct spdiBed *spdiBedNewLm(char *seq, int pos, char *del, char *ins, struct lm *lm) /* Assume seq, del, and ins belong to lm and don't need to be cloned; alloc spdi in lm. * Checking validity of alleles is up to caller, but this does check whether del is a number * instead of sequence. */ { struct spdiBed *spdi; lmAllocVar(lm, spdi); spdi->chrom = seq; spdi->chromStart = pos; spdi->del = del; spdi->ins = ins; if (isAllDigits(del)) spdi->chromEnd = pos + atoi(del); @@ -412,62 +414,78 @@ lmAllocArray(lm, props->freqSourceSpdis, freqSourceCount); lmAllocArray(lm, props->freqSourceObs, freqSourceCount); lmAllocArray(lm, props->freqIsRc, freqSourceCount); } else { props->freqSourceCount = 0; int allocSize = slCount(obsList); lmAllocArray(lm, props->freqSourceSpdis, allocSize); lmAllocArray(lm, props->freqSourceObs, allocSize); lmAllocArray(lm, props->freqIsRc, allocSize); } struct slName *unorderedSourceList = NULL; int maxSampleCount = 0; int biggestSourceIx = 0; + int validCount = 0; struct slRef *obsRef; for (obsRef = obsList; obsRef != NULL; obsRef = obsRef->next) { struct jsonElement *obsEl = obsRef->val; char *source = NULL; struct spdiBed *spdiB = NULL; struct alObs *obs = parseAlObs(obsEl, &source, &spdiB, lm); int ix = freqSourceToIx(source, &unorderedSourceList, lm); + props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1); + if (props->freqIsRc[ix] >= 0) + { slAddHead(&props->freqSourceSpdis[ix], spdiB); slAddHead(&props->freqSourceObs[ix], obs); - props->freqIsRc[ix] = hashIntValDefault(seqIsRc, spdiB->chrom, -1); - if (props->freqIsRc[ix] < 0) - errAbort("parseFrequencies: Unknown orientation for %s seq_id %s", - rsId, spdiB->chrom); if (obs->totalCount > maxSampleCount) { maxSampleCount = obs->totalCount; biggestSourceIx = ix; } + validCount++; } + else + props->freqNotMapped = TRUE; + } + if (props->freqNotMapped) + warn("Frequency report not mapped to own assembly for %s", props->name); + if (validCount > 0) + { if (!freqSourceOrder) props->freqSourceCount = slCount(unorderedSourceList); int ix; for (ix = 0; ix < props->freqSourceCount; ix++) { slReverse(&props->freqSourceSpdis[ix]); slReverse(&props->freqSourceObs[ix]); } addMissingRefAllele(props, rsId, lm); checkFreqSourceObs(props, rsId); props->biggestSourceIx = biggestSourceIx; } + else + { + props->freqSourceCount = 0; + props->freqSourceSpdis = NULL; + props->freqSourceObs = NULL; + props->freqIsRc = NULL; + } + } } static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm) /* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional * impact, discard duplicates and return as slInt list with highest impact first. */ { struct slInt *intList = NULL; if (soTermNames) { int termCount = slCount(soTermNames); enum soTerm termArray[termCount]; struct slName *term; int i; for (i = 0, term = soTermNames; term != NULL; i++, term = term->next) termArray[i] = soTermStringIdToId(term->name); @@ -807,33 +825,44 @@ minorAllele = majorAllele; minorAlleleCount = majorAlleleCount; majorAlleleCount = obs->obsCount; majorAllele = obs->allele; } else if (obs->obsCount > minorAlleleCount) { minorAllele = obs->allele; minorAlleleCount = obs->obsCount; } totalCount = obs->totalCount; } if (totalCount > 0) { if (majorAlleleCount < 0) - errAbort("no major allele observed despite nonzero totalCount...?"); + errAbort("no allele observed despite nonzero totalCount...?"); if (minorAlleleCount < 0) - errAbort("no minor allele observed... what should the minorAllele value be??"); + { + // Allele and count are provided for only one allele -- inferring the other allele + // would be dicey, so just add a note and erase the freq data from this source and + // return NaN. + props->freqIncomplete = TRUE; + props->freqSourceSpdis[sourceIx] = NULL; + props->freqSourceObs[sourceIx] = NULL; + *retMajorAllele = NULL; + *retMinorAllele = NULL; + warn("Incomplete freq data from source %d for %s", sourceIx, props->name); + return NAN; + } } *retMajorAllele = lmCloneString(lm, majorAllele); *retMinorAllele = lmCloneString(lm, minorAllele); // If no data from source, divide by zero will return -nan or -inf. (Use isfinite() to test) return (double)minorAlleleCount / totalCount; } static void sortFrequencies(struct sharedProps *props, struct lm *lm) /* Determine major/minor allele and per-project MAF (if given). * If -freqOrder option was given, then fill in props->freqSource* arrays in that order. * Otherwise, choose the MAF from the project with the largest sample count as a fallback. */ { if (props->freqSourceCount > 0) { if (freqSourceOrder != NULL) @@ -862,30 +891,48 @@ props->freqSourceCount = 1; lmAllocArray(lm, props->freqSourceMajorAl, props->freqSourceCount); lmAllocArray(lm, props->freqSourceMinorAl, props->freqSourceCount); lmAllocArray(lm, props->freqSourceMaf, props->freqSourceCount); props->freqSourceMaf[0] = mafFromSource(props, props->biggestSourceIx, &props->freqSourceMajorAl[0], &props->freqSourceMinorAl[0], lm); if (isfinite(props->freqSourceMaf[0])) { if (props->freqSourceMaf[0] >= 0.01) props->commonCount++; else props->rareCount++; } } + if (props->freqIncomplete) + { + // At least one freq project was erased due to incomplete data. If it was the only + // project with any data, set props->freqSourceCount to 0 (no freq data). + int sIx; + boolean gotFreq = FALSE; + for (sIx = 0; sIx < props->freqSourceCount; sIx++) + { + if (isfinite(props->freqSourceMaf[sIx])) + gotFreq = TRUE; + } + if (!gotFreq) + { + props->freqSourceCount = 0; + props->freqSourceMajorAl = NULL; + props->freqSourceMinorAl = NULL; + } + } } } static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm) /* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */ { struct sharedProps *props = NULL; lmAllocVar(lm, props); char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm); char rsName[strlen(rsNumber)+3]; safef(rsName, sizeof rsName, "rs%s", rsNumber); props->name = lmCloneString(lm, rsName); char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm); props->class = varTypeToClass(varType); struct slRef *annotationsRef = jsonQueryElement(top, "top", @@ -1198,30 +1245,34 @@ dyStringAppend(dyUcscNotes, bdsClinvar ","); addClinVarSigs(dyUcscNotes, props); if (props->commonCount > 0) { dyStringAppend(dyUcscNotes, bdsCommonSome ","); if (props->rareCount == 0) dyStringAppend(dyUcscNotes, bdsCommonAll ","); else dyStringAppend(dyUcscNotes, bdsRareSome ","); } else if (props->rareCount > 0 || props->freqSourceCount == 0) { dyStringAppend(dyUcscNotes, bdsRareSome ","); dyStringAppend(dyUcscNotes, bdsRareAll ","); } +if (props->freqIncomplete) + dyStringAppend(dyUcscNotes, bdsFreqIncomplete ","); +if (props->freqNotMapped) + dyStringAppend(dyUcscNotes, bdsFreqNotMapped ","); if (isRc) dyStringAppend(dyUcscNotes, bdsRevStrand ","); if (isMultiMapper) dyStringAppend(dyUcscNotes, bdsMultiMap ","); checkRareRef(props, bds->ref, isRc, dyUcscNotes); if (delMismatchesGenome(bds, seqWin)) dyStringAppend(dyUcscNotes, bdsRefMismatch ","); if (anyIupac(bds->ref)) dyStringAppend(dyUcscNotes, bdsRefIsAmbiguous ","); int i; for (i = 0; i < bds->altCount; i++) if (anyIupac(bds->alts[i])) dyStringAppend(dyUcscNotes, bdsAltIsAmbiguous ","); if (hasAmbiguousFreqAllele(props)) dyStringAppend(dyUcscNotes, bdsFreqIsAmbiguous ",");