69087d3a65af31c39337085920e99e5b2db13082 galt Fri Jun 17 15:05:28 2022 -0700 Ran the dbsnp pipeline designed by Angie for dbsnp v155. It produces huge bigBed output and I found and fixed a problem encountered on the bedToBigBed. I also tweaked dbSnpJsonToTab to deal with some dbsnp data having multiple study subversions, by ignoring the old datasets and just using the latest one. Added a track description page that has lots of content and counts to update. dbsnp155 is ready for QA on hgwdev. refs #rm27751 diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c index 06d244d..4d66933 100644 --- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c +++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c @@ -120,30 +120,31 @@ struct spdiBed *next; char *chrom; // Sequence ID uint chromStart; // Position: 0-based start uint chromEnd; // Position + number of deleted bases char *del; // Deleted sequence (starting at pos) -- or number of deleted bases char *ins; // Inserted sequence }; struct alObs // Unit of variant allele frequency observation { struct alObs *next; char *allele; // Variant allele sequence int obsCount; // Number of chromosomes on which allele was observed by a project int totalCount; // Number of chromosomes on which project observed some allele of this variant + int studyVersion; // Version of this study, there can be multiple for each allele. }; struct sharedProps /* Properties of a refsnp that are shared by all mappings to all assemblies. */ { // Extracted from JSON: char *name; // rs# ID int freqSourceCount; // Number of projects reporting allele frequency observations struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs struct alObs **freqSourceObs; // Array of per-project observation lists char **freqSourceMajorAl; // Array of per-project major allele char **freqSourceMinorAl; // Array of per-project minor allele double *freqSourceMaf; // Array of per-project minor allele frequencies int biggestSourceIx; // index of freqSourceObs with highest totalCount boolean *freqIsRc; // Per-project flag for sequence isRc relative to ptlp @@ -313,30 +314,31 @@ static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource, struct spdiBed **retSpdiB, struct lm *lm) /* Parse one allele observation. */ { struct alObs *obs = NULL; lmAllocVar(lm, obs); char *source = jsonQueryString(obsEl, "alObs", "study_name", lm); if (source == NULL) errAbort("parseAlObs: Missing or null study_name"); struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm); struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm); obs->allele = spdiB->ins; obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm); obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm); +obs->studyVersion = jsonQueryInt(obsEl, "alObs", "study_version", -1, lm); if (obs->obsCount < 0) errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)", source, obs->allele); if (obs->totalCount < 0) errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)", source, obs->allele); if (obs->totalCount == 0) errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)", source, obs->allele); *retSource = source; *retSpdiB = spdiB; return obs; } static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm) @@ -371,30 +373,81 @@ errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref " "counts %d", rsId, ix, firstObs->totalCount, sumCounts); struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart, firstSpdiB->del, firstSpdiB->del, lm); struct alObs *obs; lmAllocVar(lm, obs); obs->allele = spdiB->del; obs->obsCount = firstObs->totalCount - sumCounts; obs->totalCount = firstObs->totalCount; slAddHead(&props->freqSourceSpdis[ix], spdiB); slAddHead(&props->freqSourceObs[ix], obs); } } } +static void stripOldStudyVersions(struct sharedProps *props, char *rsId) +/* After frequency allele observations have been sorted into per-project lists, + * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent. + * The sum of obsCounts may be less than total_count (no-calls), but not greater. */ +{ +int ix; +for (ix = 0; ix < props->freqSourceCount; ix++) + { + int maxStudyVersion = 0; + boolean multipleVersions = FALSE; + struct alObs *obsList = props->freqSourceObs[ix], *obs; + for (obs = obsList; obs != NULL; obs = obs->next) + { + if (obs == obsList) + { + maxStudyVersion = obs->studyVersion; + } + else + { + if (obs->studyVersion > maxStudyVersion) + { + maxStudyVersion = obs->studyVersion; + multipleVersions = TRUE; + } + else if (obs->studyVersion != maxStudyVersion) + { + multipleVersions = TRUE; + } + } + } + if (multipleVersions) + { + struct alObs *newObsList = NULL; + struct alObs *nextObs = NULL; + struct spdiBed *spdiBList = props->freqSourceSpdis[ix], *spdiB, *nextSpdiB, *newSpdiBList = NULL; + for (obs = obsList, spdiB = spdiBList; obs != NULL; obs = nextObs, spdiB = nextSpdiB) + { + nextObs = obs->next; + nextSpdiB = spdiB->next; + if (obs->studyVersion == maxStudyVersion) + { + slAddHead(&newObsList, obs); + slAddHead(&newSpdiBList, spdiB); + } + } + props->freqSourceObs[ix] = newObsList; + props->freqSourceSpdis[ix] = newSpdiBList; + } + } +} + static void checkFreqSourceObs(struct sharedProps *props, char *rsId) /* After frequency allele observations have been sorted into per-project lists, * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent. * The sum of obsCounts may be less than total_count (no-calls), but not greater. */ { int ix; for (ix = 0; ix < props->freqSourceCount; ix++) { int totalCount = 0; struct alObs *obsList = props->freqSourceObs[ix], *obs; for (obs = obsList; obs != NULL; obs = obs->next) { if (! seqIsNucleotide(obs->allele)) errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId); if (obs->obsCount < 0) @@ -468,30 +521,31 @@ props->freqNotMapped = TRUE; } if (props->freqNotMapped) warn("Frequency report not mapped to own assembly for %s", props->name); if (validCount > 0) { if (!freqSourceOrder) props->freqSourceCount = slCount(unorderedSourceList); int ix; for (ix = 0; ix < props->freqSourceCount; ix++) { slReverse(&props->freqSourceSpdis[ix]); slReverse(&props->freqSourceObs[ix]); } addMissingRefAllele(props, rsId, lm); + stripOldStudyVersions(props, rsId); checkFreqSourceObs(props, rsId); props->biggestSourceIx = biggestSourceIx; } else { props->freqSourceCount = 0; props->freqSourceSpdis = NULL; props->freqSourceObs = NULL; props->freqIsRc = NULL; } } } static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm) /* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional