e9401d3358499d9ffa04eb076dfe18a12769d959
galt
  Tue Mar 28 16:43:15 2023 -0700
Fixing the dbSnpJsonToTab part of the dbSnp pipeline: our code was not handling ClinVar records that had multiple significances, even though the JSON clearly shows it can be a list. Also, the jsonQuery code is great at fetching a list of elements that match your query, but it just sticks them all together and has no idea which came from which, so maintaining the grouping and order was tricky. Used Angie's suggestion to html-encode the commas so the rest of the code just thinks it is a single string without any commas in it, so it can work properly inside the comma-separated list. It also reports in Warnings.tab the comma-separated ones re-encoded so you can double-check that those RS ids worked correctly in the browser; it also lists the multiple-sigs list for easy viewing by the pipeline-runner. fixes #30617

diff --git src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
index 4d66933..2d051b0 100644
--- src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
+++ src/hg/snp/dbSnpJsonToTab/dbSnpJsonToTab.c
@@ -935,30 +935,32 @@
         for (sIx = 0;  sIx < props->freqSourceCount;  sIx++)
             {
             if (isfinite(props->freqSourceMaf[sIx]))
                 gotFreq = TRUE;
             }
         if (!gotFreq)
             {
             props->freqSourceCount = 0;
             props->freqSourceMajorAl = NULL;
             props->freqSourceMinorAl = NULL;
             }
         }
     }
 }
 
+static void dyStringPrintSlNameList(struct dyString *dy, struct slName *list, char *sep); // forward declaration
+
 static struct sharedProps *extractProps(struct jsonElement *top, struct lm *lm)
 /* Extract the properties shared by all mappings of a refsnp from JSON, alloc & return. */
 {
 struct sharedProps *props = NULL;
 lmAllocVar(lm, props);
 char *rsNumber = jsonQueryString(top, "top", "refsnp_id", lm);
 char rsName[strlen(rsNumber)+3];
 safef(rsName, sizeof rsName, "rs%s", rsNumber);
 props->name = lmCloneString(lm, rsName);
 char *varType = jsonQueryString(top, "top", "primary_snapshot_data.variant_type", lm);
 props->class = varTypeToClass(varType);
 struct slRef *annotationsRef = jsonQueryElement(top, "top",
                                                  "primary_snapshot_data.allele_annotations[*]", lm);
 parseFrequencies(annotationsRef, props, rsName, lm);
 spdiNormalizeFreq(props, ncToTwoBitChrom, ncToSeqWin, lm);
@@ -969,30 +971,89 @@
 soTermNames = slCat(soTermNames,
                     jsonQueryStringList(annotationsRef, "annotations",
                                         "assembly_annotation[*].genes[*].rnas[*]"
                                         ".protein.sequence_ontology[*].accession", lm));
 props->soTerms = soTermStringIdToIdList(soTermNames, lm);
 props->maxFuncImpact = props->soTerms ? props->soTerms->val : soUnknown;
 props->submitters = jsonQueryStrings(top, "top",
                                     "primary_snapshot_data.support[id.type=subsnp].submitter_handle",
                                         lm);
 slUniqify(&props->submitters, slNameCmp, NULL);
 props->pubMedIds = jsonQueryInts(top, "top", "citations[*]", lm);
 props->clinVarAccs = jsonQueryStringList(annotationsRef, "annotations",
 				    "clinical[*].accession_version", lm);
 props->clinVarSigs = jsonQueryStringList(annotationsRef, "annotations",
                                           "clinical[*].clinical_significances[*]", lm);
+if (slCount(props->clinVarAccs) != slCount(props->clinVarSigs))  // extra steps to deal with rare cases of elements with multiple significances listed.
+    {
+    struct dyString *dy = dyStringNew(256);
+    dyStringClear(dy);
+    dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
+    char *origClinVarSigs = cloneString(dy->string);
+
+    props->clinVarSigs = NULL;
+
+    for (struct slRef *anno=annotationsRef; anno; anno=anno->next)
+	{
+        // to achieve the needed effect, step through the top-level annotationsRef one element at a time, instead of as a list.
+	struct slRef *annoAnnoNext = anno->next;
+	anno->next = NULL;  // suppress the siblings temporarily so we process each element one at a time.
+	for (int c = 0; ; ++c)
+	    {
+	    char queryString[256];
+	    safef(queryString, sizeof queryString, "clinical[%d].clinical_significances[*]", c);
+	    struct slName *theseSigNames = jsonQueryStringList(anno, "annotations", queryString, lm);
+	    if (!theseSigNames)
+		{
+		break;
+		}
+            // re-encode the separation of multiple values with a html encoding instead of a comma
+	    // so they will not mess up the comma-separated list that acts like an array.
+            if (slCount(theseSigNames) >=2)
+		{
+                dyStringClear(dy);
+		for(struct slName *slTemp = theseSigNames; slTemp; slTemp=slTemp->next)
+		    {	
+		    dyStringPrintf(dy, "%s", slTemp->name);
+		    if (slTemp->next)
+			{
+			dyStringPrintf(dy,  "&#44;");
+			}
+		    }
+		slAddHead(&props->clinVarSigs, slNameNew(dy->string));
+                dyStringClear(dy);
+		dyStringPrintSlNameList(dy, theseSigNames, ",");
+		warn("comma separator html-encoded in %s which has multiple clinical_significances [%s]", rsName, dy->string);
+		}
+	    else  // just one on the list
+		{
+		slAddHead(&props->clinVarSigs, theseSigNames);
+		}
+	    }
+	anno->next = annoAnnoNext;  // restore the pointer
+	}
+    slReverse(&props->clinVarSigs);
+    // Sanity check: decoding the html-encoded commas back to plain commas must reproduce the original flattened list, with all fields in the same order.
+    dyStringClear(dy);
+    dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
+    char *badReconstruction = replaceChars(dy->string, "&#44;", ",");
+    assert(sameString(origClinVarSigs, badReconstruction));
+    freeMem(badReconstruction);
+    freeMem(origClinVarSigs);
+    dyStringFree(&dy);
+    }
+
 return props;
 }
 
 static void setBdsFreqData(struct sharedProps *props, boolean isRc, struct bigDbSnp *bds,
                            struct dyString *dyUcscNotes, struct lm *lm)
 /* Use isRc, props to fill in bds->majorAllele, ->minorAllele, determine whether ref is maj/min.
  * Add notes to dyUcscNotes if applicable. */
 {
 if (props->freqSourceCount > 0)
     {
     bds->freqSourceCount = props->freqSourceCount;
     lmAllocArray(lm, bds->majorAllele, props->freqSourceCount);
     lmAllocArray(lm, bds->minorAllele, props->freqSourceCount);
     lmAllocArray(lm, bds->minorAlleleFreq, props->freqSourceCount);
     char *firstMajorAllele = NULL;
@@ -1681,30 +1742,33 @@
     }
 }
 
 static void writeDetails(struct sharedProps *props, struct dyString *dy, FILE *outPropsF)
 /* Write props out as tab-sep line to outPropsF. */
 {
 dyStringClear(dy);
 dyStringPrintf(dy, "%s\t", props->name);
 appendFrequencies(props, dy);
 dyStringPrintf(dy, "\t%d\t", slCount(props->soTerms));
 dyStringPrintSlIntList(dy, props->soTerms, ",");
 dyStringPrintf(dy, "\t%d\t", slCount(props->clinVarAccs));
 dyStringPrintSlNameList(dy, props->clinVarAccs, ",");
 dyStringAppendC(dy, '\t');
 dyStringPrintSlNameList(dy, props->clinVarSigs, ",");
+
+assert(slCount(props->clinVarAccs) == slCount(props->clinVarSigs));
+
 dyStringPrintf(dy, "\t%d\t", slCount(props->submitters));
 dyStringPrintSlNameList(dy, props->submitters, ",");
 dyStringPrintf(dy, "\t%d\t", slCount(props->pubMedIds));
 dyStringPrintSlIntList(dy, props->pubMedIds, ",");
 dyStringAppendC(dy, '\n');
 fputs(dy->string, outPropsF);
 }
 
 static void updateSeqIsRc(struct slRef *placements, struct lm *lm)
 /* Note is_aln_opposite_orientation flag for each mapped sequence, regardless of whether it's
  * in the assembly that we're working on; later we can look up sequences from freq reports. */
 {
 hashIntReset(seqIsRc);
 struct slRef *plRef;
 for (plRef = placements;  plRef != NULL;  plRef = plRef->next)
@@ -1860,38 +1924,40 @@
             bds = parsePlacement(pl, props, multiMapper, dyScratch, lm);
             if (bds)
                 {
                 bds->chrom = getChrom(bds->chrom, ap->name);
                 bigDbSnpTabOut(bds, ap->outF);
                 // Prevent stale bds in errCatch below:
                 bds = NULL;
                 }
             else
                 {
                 writeBadCoords(pl, props->name, ap->name, ap->outBad, lm);
                 }
             }
         }
     errCatchEnd(errCatch);
-    if (errCatch->gotError)
-        {
     char *rsId = jsonQueryString(top, "top", "refsnp_id", lm);
     char *seqId = bds ? bds->chrom : "NOSEQ";
+    if (errCatch->gotError)
+        {
         fprintf(outStreams->err, "%s\t%s\t%s", rsId, seqId, errCatch->message->string);
         }
     else if (isNotEmpty(errCatch->message->string))
-        fprintf(outStreams->warn, "%s", errCatch->message->string);
+	{
+        fprintf(outStreams->warn, "%s\t%s\t%s", rsId, seqId, errCatch->message->string);
+        }
     errCatchFree(&errCatch);
     }
 dyStringFree(&dyScratch);
 lmCleanup(&lm);
 }
 
 static struct slName *initFreqSourceOrder()
 /* If -freqSourceOrder option was given, extract source names from the comma-sep list
  * into globals freqSourceCount and freqSourceOrder[]. */
 {
 char *freqSourceOrderStr = optionVal("freqSourceOrder", NULL);
 struct slName *sources = NULL;
 if (freqSourceOrderStr)
     {
     sources = slNameListFromComma(freqSourceOrderStr);