69087d3a65af31c39337085920e99e5b2db13082
galt
  Fri Jun 17 15:05:28 2022 -0700
Ran the dbsnp pipeline designed by Angie for dbsnp v155. It produces huge bigBed output and I found and fixed a problem encountered on the bedToBigBed. I also tweaked dbSnpJsonToTab to deal with some dbsnp data having multiple study subversions, by ignoring the old datasets and just using the latest one. Added a track description page that has lots of content and counts to update. dbsnp155 is ready for QA on hgwdev. refs #rm27751

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index 06d244d..4d66933 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -120,30 +120,31 @@
     struct spdiBed *next;
     char *chrom;           // Sequence ID
     uint chromStart;       // Position: 0-based start
     uint chromEnd;         // Position + number of deleted bases
     char *del;             // Deleted sequence (starting at pos) -- or number of deleted bases
     char *ins;             // Inserted sequence
 };
 
 struct alObs
 // Unit of variant allele frequency observation
     {
     struct alObs *next;
     char *allele;   // Variant allele sequence
     int obsCount;   // Number of chromosomes on which allele was observed by a project
     int totalCount; // Number of chromosomes on which project observed some allele of this variant
+    int studyVersion; // Version of this study, there can be multiple for each allele.
     };
 
 struct sharedProps
 /* Properties of a refsnp that are shared by all mappings to all assemblies. */
     {
     // Extracted from JSON:
     char *name;                       // rs# ID
     int freqSourceCount;              // Number of projects reporting allele frequency observations
     struct spdiBed **freqSourceSpdis; // Array of per-project frequency allele SPDIs
     struct alObs **freqSourceObs;     // Array of per-project observation lists
     char **freqSourceMajorAl;         // Array of per-project major allele
     char **freqSourceMinorAl;         // Array of per-project minor allele
     double *freqSourceMaf;            // Array of per-project minor allele frequencies
     int biggestSourceIx;              // index of freqSourceObs with highest totalCount
     boolean *freqIsRc;                // Per-project flag for sequence isRc relative to ptlp
@@ -313,30 +314,31 @@
 
 static struct alObs *parseAlObs(struct jsonElement *obsEl, char **retSource,
                                 struct spdiBed **retSpdiB, struct lm *lm)
 /* Parse one allele observation. */
 {
 struct alObs *obs = NULL;
 lmAllocVar(lm, obs);
 char *source = jsonQueryString(obsEl, "alObs", "study_name", lm);
 if (source == NULL)
     errAbort("parseAlObs: Missing or null study_name");
 struct slRef *spdiRef = jsonQueryElement(obsEl, "alObs", "observation", lm);
 struct spdiBed *spdiB = parseSpdis(spdiRef, source, lm);
 obs->allele = spdiB->ins;
 obs->obsCount = jsonQueryInt(obsEl, "alObs", "allele_count", -1, lm);
 obs->totalCount = jsonQueryInt(obsEl, "alObs", "total_count", -1, lm);
+obs->studyVersion = jsonQueryInt(obsEl, "alObs", "study_version", -1, lm);
 if (obs->obsCount < 0)
     errAbort("parseAlObs: allele_count not reported for %s (ins_seq %s)",
              source, obs->allele);
 if (obs->totalCount < 0)
     errAbort("parseAlObs: total_count not reported for %s (ins_seq %s)",
              source, obs->allele);
 if (obs->totalCount == 0)
     errAbort("parseAlObs: got a total_count of 0 for %s (ins_seq %s)",
              source, obs->allele);
 *retSource = source;
 *retSpdiB = spdiB;
 return obs;
 }
 
 static void addMissingRefAllele(struct sharedProps *props, char *rsId, struct lm *lm)
@@ -371,30 +373,81 @@
             errAbort("addMissingRefAllele: %s, source %d, total_count is %d < sum of non-ref "
                      "counts %d", rsId, ix, firstObs->totalCount, sumCounts);
         struct spdiBed *spdiB = spdiBedNewLm(firstSpdiB->chrom, firstSpdiB->chromStart,
                                              firstSpdiB->del, firstSpdiB->del, lm);
         struct alObs *obs;
         lmAllocVar(lm, obs);
         obs->allele = spdiB->del;
         obs->obsCount = firstObs->totalCount - sumCounts;
         obs->totalCount = firstObs->totalCount;
         slAddHead(&props->freqSourceSpdis[ix], spdiB);
         slAddHead(&props->freqSourceObs[ix], obs);
         }
     }
 }
 
+static void stripOldStudyVersions(struct sharedProps *props, char *rsId)
+/* After frequency allele observations have been sorted into per-project lists,
+ * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent.
+ * The sum of obsCounts may be less than total_count (no-calls), but not greater. */
+{
+int ix;
+for (ix = 0;  ix < props->freqSourceCount;  ix++)
+    {
+    int maxStudyVersion = 0;
+    boolean multipleVersions = FALSE;
+    struct alObs *obsList = props->freqSourceObs[ix], *obs;
+    for (obs = obsList;  obs != NULL;  obs = obs->next)
+        {
+	if (obs == obsList)
+	    {
+	    maxStudyVersion = obs->studyVersion;
+	    }
+        else
+	    {
+	    if (obs->studyVersion > maxStudyVersion)
+		{
+		maxStudyVersion = obs->studyVersion;
+		multipleVersions = TRUE;
+		}
+	    else if (obs->studyVersion != maxStudyVersion)
+		{
+		multipleVersions = TRUE;
+		}
+	    }
+        }
+    if (multipleVersions)
+	{
+	struct alObs *newObsList = NULL;
+	struct alObs *nextObs = NULL;
+	struct spdiBed *spdiBList = props->freqSourceSpdis[ix], *spdiB, *nextSpdiB, *newSpdiBList = NULL;
+        for (obs = obsList, spdiB = spdiBList;  obs != NULL; obs = nextObs, spdiB = nextSpdiB)
+	    {
+	    nextObs = obs->next;
+	    nextSpdiB = spdiB->next;
+	    if (obs->studyVersion == maxStudyVersion)
+		{
+		slAddHead(&newObsList, obs);
+		slAddHead(&newSpdiBList, spdiB);
+		}
+	    }
+	props->freqSourceObs[ix]   = newObsList;
+	props->freqSourceSpdis[ix] = newSpdiBList;
+	}
+    }
+}
+
 static void checkFreqSourceObs(struct sharedProps *props, char *rsId)
 /* After frequency allele observations have been sorted into per-project lists,
  * make sure the alleles look like [ACGT]+ and the reported total_counts are consistent.
  * The sum of obsCounts may be less than total_count (no-calls), but not greater. */
 {
 int ix;
 for (ix = 0;  ix < props->freqSourceCount;  ix++)
     {
     int totalCount = 0;
     struct alObs *obsList = props->freqSourceObs[ix], *obs;
     for (obs = obsList;  obs != NULL;  obs = obs->next)
         {
         if (! seqIsNucleotide(obs->allele))
             errAbort("checkFreqSourceObs: invalid observed allele '%s' for %s", obs->allele, rsId);
         if (obs->obsCount < 0)
@@ -468,30 +521,31 @@
             props->freqNotMapped = TRUE;
         }
     if (props->freqNotMapped)
         warn("Frequency report not mapped to own assembly for %s", props->name);
     if (validCount > 0)
         {
         if (!freqSourceOrder)
             props->freqSourceCount = slCount(unorderedSourceList);
         int ix;
         for (ix = 0;  ix < props->freqSourceCount; ix++)
             {
             slReverse(&props->freqSourceSpdis[ix]);
             slReverse(&props->freqSourceObs[ix]);
             }
         addMissingRefAllele(props, rsId, lm);
+        stripOldStudyVersions(props, rsId);
         checkFreqSourceObs(props, rsId);
         props->biggestSourceIx = biggestSourceIx;
         }
     else
         {
         props->freqSourceCount = 0;
         props->freqSourceSpdis = NULL;
         props->freqSourceObs = NULL;
         props->freqIsRc = NULL;
         }
     }
 }
 
 static struct slInt *soTermStringIdToIdList(struct slName *soTermNames, struct lm *lm)
 /* Given a list of strings like "SO:0001627", convert them to enum soTerm, sort by functional