fec4e5d98ee6a52aac4506f02dc28980f52c738f
hiram
  Sat Feb 3 16:29:45 2024 -0800
beginning to combine the outputs from asmSummary and genark tables refs #23589

diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c
index 452c37f..92c9b5c 100644
--- src/hg/hubApi/findGenome.c
+++ src/hg/hubApi/findGenome.c
@@ -1,282 +1,356 @@
 /* findGenome search functions */
 
 #include "dataApi.h"
 #include "hgFind.h"
 #include "cartTrackDb.h"
 #include "cartJson.h"
 #include "genark.h"
 #include "asmAlias.h"
 #include "asmSummary.h"
 
+struct combinedSummary
+/* may have information from any of:  asmSummary, genark or dbDb */
+    {
+    struct combinedSummary *next;	/* Next in singly linked list */
+    struct asmSummary *summary;		/* from asmSummary table */
+    struct genark *genArk;		/* from genark table */
+    struct dbDb *dbDb;			/* from dbDb table */
+    };
+
+/* will be initialized as this function begins */
+static char *genarkTable = NULL;
+
 static void startGenomes(struct jsonWrite *jw)
 /* begin the list output */
 {
 if (jsonOutputArrays)
     jsonWriteListStart(jw, "genomes");
 else
     jsonWriteObjectStart(jw, "genomes");
 }
 
 static void endGenomes(struct jsonWrite *jw)
 /* end the list output */
 {
 if (jsonOutputArrays)
     jsonWriteListEnd(jw);
 else
     jsonWriteObjectEnd(jw);
 }
 
-static int outputFullText(struct jsonWrite *jw, struct asmSummary *summaryList)
+static void finiGenArk(struct jsonWrite *jw, struct genark *el)
+/* output of a genark item has started, finish it off */
+{
+jsonWriteBoolean(jw, "isGenArkBrowser", TRUE);
+jsonWriteString(jw, "hubUrl", el->hubUrl);
+jsonWriteString(jw, "asmName", el->asmName);
+jsonWriteString(jw, "scientificName", el->scientificName);
+jsonWriteString(jw, "commonName", el->commonName);
+}
+
+static void finiSummary(struct jsonWrite *jw, struct asmSummary *el)
+/* output of a summary item has started, finish it off */
+{
+jsonWriteString(jw, "bioProject", el->bioproject);
+jsonWriteString(jw, "bioSample", el->biosample);
+jsonWriteString(jw, "wgsMaster", el->wgsMaster);
+jsonWriteString(jw, "refseqCategory", el->refseqCategory);
+jsonWriteNumber(jw, "taxId", (long long)el->taxId);
+jsonWriteNumber(jw, "speciesTaxid", (long long)el->speciesTaxid);
+jsonWriteString(jw, "organismName", el->organismName);
+jsonWriteString(jw, "infraspecificName", el->infraspecificName);
+jsonWriteString(jw, "isolate", el->isolate);
+jsonWriteString(jw, "versionStatus", el->versionStatus);
+jsonWriteString(jw, "assemblyLevel", el->assemblyLevel);
+jsonWriteString(jw, "releaseType", el->releaseType);
+jsonWriteString(jw, "genomeRep", el->genomeRep);
+jsonWriteString(jw, "seqRelDate", el->seqRelDate);
+jsonWriteString(jw, "asmName", el->asmName);
+jsonWriteString(jw, "asmSubmitter", el->asmSubmitter);
+jsonWriteString(jw, "gbrsPairedAsm", el->gbrsPairedAsm);
+jsonWriteString(jw, "pairedAsmComp", el->pairedAsmComp);
+jsonWriteString(jw, "ftpPath", el->ftpPath);
+jsonWriteString(jw, "excludedFromRefseq", el->excludedFromRefseq);
+jsonWriteString(jw, "relationToTypeMaterial", el->relationToTypeMaterial);
+jsonWriteString(jw, "assemblyType", el->assemblyType);
+jsonWriteString(jw, "phyloGroup", el->phyloGroup);
+jsonWriteNumber(jw, "genomeSize", (long long)el->genomeSize);
+jsonWriteNumber(jw, "genomeSizeUngapped", (long long)el->genomeSizeUngapped);
+jsonWriteDouble(jw, "gcPercent", (double)el->gcPercent);
+jsonWriteNumber(jw, "repliconCount", (long long)el->repliconCount);
+jsonWriteNumber(jw, "scaffoldCount", (long long)el->scaffoldCount);
+jsonWriteNumber(jw, "contigCount", (long long)el->contigCount);
+jsonWriteString(jw, "annotationProvider", el->annotationProvider);
+jsonWriteString(jw, "annotationName", el->annotationName);
+jsonWriteString(jw, "annotationDate", el->annotationDate);
+jsonWriteNumber(jw, "totalGeneCount", (long long)el->totalGeneCount);
+jsonWriteNumber(jw, "proteinCodingGeneCount", (long long)el->proteinCodingGeneCount);
+jsonWriteNumber(jw, "nonCodingGeneCount", (long long)el->nonCodingGeneCount);
+jsonWriteString(jw, "pubmedId", el->pubmedId);
+}
+
+static int outputCombo(struct jsonWrite *jw, struct combinedSummary *summaryList)
+/* may be information from any of the three tables */
 {
 int itemCount = 0;
-/* XXX TBD need to get common names for these items */
-struct asmSummary *el = NULL;
+struct combinedSummary *el = NULL;
 startGenomes(jw);
 for (el=summaryList; el != NULL; el=el->next)
     {
+    if (el->summary)
+	{
 	if (jsonOutputArrays)
 	    {
 	    jsonWriteObjectStart(jw, NULL);
-        jsonWriteString(jw, "accession", el->assemblyAccession);
-        jsonWriteString(jw, "asmName", el->asmName);
-        jsonWriteString(jw, "asmSubmitter", el->asmSubmitter);
-        jsonWriteString(jw, "annotationProvider", el->annotationProvider);
-        jsonWriteString(jw, "assemblyType", el->assemblyType);
-        jsonWriteString(jw, "bioProject", el->bioproject);
-        jsonWriteString(jw, "bioSample", el->biosample);
-        jsonWriteString(jw, "organismName", el->organismName);
-        jsonWriteString(jw, "isolate", el->isolate);
-        jsonWriteNumber(jw, "taxId", (long long)el->taxId);
-        jsonWriteObjectEnd(jw);
+	    jsonWriteString(jw, "accession", el->summary->assemblyAccession);
 	    }
 	else
+	    jsonWriteObjectStart(jw, el->summary->assemblyAccession);
+
+        finiSummary(jw, el->summary);
+	boolean sciNameDone = FALSE;
+	boolean comNameDone = FALSE;
+        if (el->genArk)
 	    {
-        jsonWriteObjectStart(jw, el->assemblyAccession);
-        jsonWriteString(jw, "asmName", el->asmName);
-        jsonWriteString(jw, "asmSubmitter", el->asmSubmitter);
-        jsonWriteString(jw, "annotationProvider", el->annotationProvider);
-        jsonWriteString(jw, "assemblyType", el->assemblyType);
-        jsonWriteString(jw, "bioProject", el->bioproject);
-        jsonWriteString(jw, "bioSample", el->biosample);
-        jsonWriteString(jw, "organismName", el->organismName);
-        jsonWriteString(jw, "isolate", el->isolate);
-        jsonWriteNumber(jw, "taxId", (long long)el->taxId);
-        jsonWriteObjectEnd(jw);
+//            jsonWriteString(jw, "gcAccession", el->genArk->gcAccession);
+	    /* some of these are going to be dups of the asmSummary */
+	    finiGenArk(jw, el->genArk);
+	    sciNameDone = TRUE;
+	    comNameDone = TRUE;
+//            jsonWriteNumber(jw, "taxId", (long long)genome->taxId);
+	    }
+	if (el->dbDb)
+	    {
+            jsonWriteString(jw, "database", el->dbDb->name);
+            jsonWriteString(jw, "description", el->dbDb->description);
+            jsonWriteString(jw, "sourceName", el->dbDb->sourceName);
+	    if (!sciNameDone)
+		jsonWriteString(jw, "scientificName", el->dbDb->scientificName);
+	    if (!comNameDone)
+		jsonWriteString(jw, "commonName", el->dbDb->organism);
+//            jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId);
+	    }
+	}
+    else if (el->genArk)	/* ONLY genArk ?? - should have asmSummary TBD */
+	{
+	if (jsonOutputArrays)
+	    {
+	    jsonWriteObjectStart(jw, NULL);
+	    jsonWriteString(jw, "gcAccession", el->genArk->gcAccession);
+	    }
+	else
+	    jsonWriteObjectStart(jw, el->genArk->gcAccession);
+
+	finiGenArk(jw, el->genArk);
+	if (el->dbDb)
+	    {
+	    jsonWriteBoolean(jw, "isUcscDatabase", TRUE);
+            jsonWriteString(jw, "database", el->dbDb->name);
+            jsonWriteString(jw, "description", el->dbDb->description);
+            jsonWriteString(jw, "sourceName", el->dbDb->sourceName);
+//	    if (!sciNameDone)
+//		jsonWriteString(jw, "scientificName", el->dbDb->scientificName);
+//	    if (!comNameDone)
+//		jsonWriteString(jw, "commonName", el->dbDb->organism);
+//            jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId);
+	    }
+	}
+    else if (el->dbDb)	/* ONLY dbDb ?  - should have the other two TBD */
+	{
+	if (jsonOutputArrays)
+	    {
+	    jsonWriteObjectStart(jw, NULL);
+	    jsonWriteString(jw, "database", el->dbDb->name);
+	    }
+	else
+	    jsonWriteObjectStart(jw, el->dbDb->name);
+
+	jsonWriteBoolean(jw, "isUcscDatabase", TRUE);
+	jsonWriteString(jw, "description", el->dbDb->description);
+	jsonWriteString(jw, "sourceName", el->dbDb->sourceName);
+	jsonWriteString(jw, "scientificName", el->dbDb->scientificName);
+	jsonWriteString(jw, "commonName", el->dbDb->organism);
+	jsonWriteNumber(jw, "taxId", (long long)el->dbDb->taxId);
 	}
+
+    jsonWriteObjectEnd(jw);
     ++itemCount;
     }
 endGenomes(jw);
 return (itemCount);
-}	/*	static int outputFullText(struct jsonWrite *jw, struct asmSummary *summaryList, ) */
+}	/*	static int outputCombo(struct jsonWrite *jw, struct combinedSummary *summaryList) */
+
+static struct combinedSummary *checkForGenArk(struct sqlConnection *conn, struct asmSummary *list)
+/* given an asmSummary list, see if any match to genArk genomes */
+/* TBD - should also check here for UCSC database matching */
+{
+struct combinedSummary *comboSumary = NULL;
+if (list)
+    {
+    struct asmSummary *el = NULL;
+    for (el=list; el != NULL; el=el->next)
+        {
+        struct combinedSummary *cs = NULL;
+        AllocVar(cs);
+        cs->summary = el;
+        cs->genArk = NULL;
+        cs->dbDb = NULL;
+        char query[4096];
+        sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession = \"%s\"", genarkTable, el->assemblyAccession);
+        struct genark *gA = genarkLoadByQuery(conn, query);
+        if (gA)
+            cs->genArk = gA;
+        slAddHead(&comboSumary, cs);
+        }
+    }
+if (comboSumary)
+  slReverse(&comboSumary);
+return comboSumary;
+}
 
 static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw)
 /* conn is a connection to hgcentral, searching for one word,
- *   might be a database or a GenArk hub.
+ *   might be a UCSC database or a GenArk hub accession, or just some word.
  */
 {
 /* the input word might be a database alias
  * asmAliasFind returns the searchWord if no alias found
  */
 char *perhapsAlias = asmAliasFind(searchWord);
-char *genarkTable = genarkTableName();
+
 int itemCount = 0;
+
 if (startsWith("GC", perhapsAlias))
     {
     char query[4096];
     sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE \"%s%%\"", genarkTable, perhapsAlias);
     struct sqlResult *sr = sqlGetResult(conn, query);
     char **row;
-    /* XXXX - TBD need to decide on a common set of output items */
-//    startGenomes(jw);
     struct genark *genArkList = NULL;
     while ((row = sqlNextRow(sr)) != NULL)
         {
         struct genark *genome = genarkLoad(row);
         slAddHead(&genArkList, genome);
-        ++itemCount;
-if (1 == 0) {
-        if (jsonOutputArrays)
-            {
-            jsonWriteObjectStart(jw, NULL);
-            jsonWriteString(jw, "gcAccession", genome->gcAccession);
-            jsonWriteString(jw, "hubUrl", genome->hubUrl);
-            jsonWriteString(jw, "asmName", genome->asmName);
-            jsonWriteString(jw, "scientificName", genome->scientificName);
-            jsonWriteString(jw, "commonName", genome->commonName);
-            jsonWriteNumber(jw, "taxId", (long long)genome->taxId);
-            jsonWriteObjectEnd(jw);
         }
-        else
-            {
-            jsonWriteObjectStart(jw, genome->gcAccession);
-            jsonWriteString(jw, "hubUrl", genome->hubUrl);
-            jsonWriteString(jw, "asmName", genome->asmName);
-            jsonWriteString(jw, "scientificName", genome->scientificName);
-            jsonWriteString(jw, "commonName", genome->commonName);
-            jsonWriteNumber(jw, "taxId", (long long)genome->taxId);
-            jsonWriteObjectEnd(jw);
-            }
-}
-        }
-//    endGenomes(jw);
     sqlFreeResult(&sr);
     slReverse(&genArkList);
-    struct asmSummary *summaryList = NULL;
+    struct combinedSummary *comboOutput = NULL;
     struct genark *el = NULL;
     for (el = genArkList;  el != NULL; el = el->next)
 	{
+        struct combinedSummary *cs = NULL;
+        AllocVar(cs);
+        cs->summary = NULL;
+        cs->genArk = el;
+        cs->dbDb = NULL;
 	char asmSumQuery[4096];
 	sqlSafef(asmSumQuery, sizeof(asmSumQuery), "SELECT * FROM asmSummary WHERE assemblyAccession = \"%s\"", el->gcAccession);
-	struct asmSummary *el = asmSummaryLoadByQuery(conn, asmSumQuery);
-	slAddHead(&summaryList, el);
+	struct asmSummary *as = asmSummaryLoadByQuery(conn, asmSumQuery);
+        if (as)
+            cs->summary = as;
+	slAddHead(&comboOutput, cs);
 	}
-    if (summaryList)
+    if (comboOutput)
 	{
-	itemCount = outputFullText(jw, summaryList);
+	slReverse(&comboOutput);
+	itemCount = outputCombo(jw, comboOutput);
 	}
-    } else {	/* not a GC genArk name, perhaps a UCSC database */
+    }	/*	if (startsWith("GC", perhapsAlias))	*/
+else
+    {	/* not a GC genArk name, perhaps a UCSC database */
     if (hDbIsActive(perhapsAlias))
         {
+	struct combinedSummary *comboOutput = NULL;
+        AllocVar(comboOutput);
+        comboOutput->summary = NULL;
+        comboOutput->genArk = NULL;
+	/* TBD need to get an equivalent GC accession name for UCSC database */
         struct dbDb *dbDbEntry = hDbDb(perhapsAlias);
-            startGenomes(jw);
-	    if (jsonOutputArrays)
-		{
-		jsonWriteObjectStart(jw, NULL);
-		jsonWriteString(jw, "database", dbDbEntry->name);
-		jsonWriteString(jw, "description", dbDbEntry->description);
-		jsonWriteString(jw, "sourceName", dbDbEntry->sourceName);
-		jsonWriteString(jw, "scientificName", dbDbEntry->scientificName);
-		jsonWriteString(jw, "commonName", dbDbEntry->organism);
-		jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId);
-		jsonWriteObjectEnd(jw);
-		}
-	    else
-		{
-		jsonWriteObjectStart(jw, dbDbEntry->name);
-		jsonWriteString(jw, "description", dbDbEntry->description);
-		jsonWriteString(jw, "sourceName", dbDbEntry->sourceName);
-		jsonWriteString(jw, "scientificName", dbDbEntry->scientificName);
-		jsonWriteString(jw, "commonName", dbDbEntry->organism);
-		jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId);
-		jsonWriteObjectEnd(jw);
-		}
-	    endGenomes(jw);
-            ++itemCount;
+        comboOutput->dbDb = dbDbEntry;
+	itemCount = outputCombo(jw, comboOutput);
         }
-	else	/* not genArk and not UCSC, check the asmSummary data */
+    }	/*	checked genArk and UCSC database */
+if (0 == itemCount)	/* not found in genark or ucsc db, check asmSummary */
     {
     long long totalMatch = 0;
     /* no more than 100 items result please */
     struct asmSummary *summaryList = asmSummaryFullText(conn, perhapsAlias, (long long) 100, &totalMatch);
-            if (summaryList)
-		{
-                itemCount = outputFullText(jw, summaryList);
-		}
-	    }
+    /* now check the genark table to see if there are any of those */
+    struct combinedSummary *comboOutput = checkForGenArk(conn, summaryList);
+    if (comboOutput)
+	itemCount = outputCombo(jw, comboOutput);
     }
 return itemCount;
 }	/*	static void singleWordSearch(struct sqlConnection *conn, char *searchWord) */
 
 void apiFindGenome(char *pathString[MAX_PATH_INFO])
 /* 'findGenome' function */
 {
 char *searchString = cgiOptionalString(argGenomeSearchTerm);
 char *inputSearchString = cloneString(searchString);
 char *extraArgs = verifyLegalArgs(argFindGenome);
 
 if (extraArgs)
     apiErrAbort(err400, err400Msg, "extraneous arguments found for function /findGenome'%s'", extraArgs);
 
 struct sqlConnection *conn = hConnectCentral();
-char *genarkTable = genarkTableName();
+genarkTable = genarkTableName();
+
 if (!sqlTableExists(conn, genarkTable))
     apiErrAbort(err500, err500Msg, "missing central.genark table in function /findGenome'%s'", extraArgs);
 
 /* verify number of words in search string is legal */
 int wordCount = chopByWhite(searchString, NULL, 0);
 
 if (wordCount < 1)
     apiErrAbort(err400, err400Msg, "search term '%s' does not contain a word ? for function /findGenome", argGenomeSearchTerm);
 if (wordCount > 5)
     apiErrAbort(err400, err400Msg, "search term '%s=%s' should not have more than 5 words for function /findGenome", argGenomeSearchTerm, searchString);
 
+genarkTable = genarkTableName();
+
 struct jsonWrite *jw = apiStartOutput();
 jsonWriteString(jw, argGenomeSearchTerm, searchString);
 
 int itemCount = 0;
 /* save the search string before it is chopped */
 char *pristineSearchString = cloneString(searchString);
 
 char **words;
 AllocArray(words, wordCount);
 (void) chopByWhite(searchString, words, wordCount);
 if (1 == wordCount)
     {
     itemCount = singleWordSearch(conn, words[0], jw);
     if (itemCount)
 	jsonWriteNumber(jw, "itemCount", itemCount);
     else
 	verbose(0, "# DBG need to search this word %s somewhere else\n", words[0]);
     }
-else
+else	/* multiple word search */
     {
     long long totalMatch = 0;
     /* no more than 100 items result please */
     struct asmSummary *summaryList = asmSummaryFullText(conn, pristineSearchString, (long long) 100, &totalMatch);
     if (summaryList)
 	{
-	itemCount = outputFullText(jw, summaryList);
+	struct combinedSummary *comboOutput = checkForGenArk(conn, summaryList);
+	if (comboOutput)
+	    itemCount = outputCombo(jw, comboOutput);
 	if (itemCount)
 	    jsonWriteNumber(jw, "itemCount", itemCount);
 	else
-	    verbose(0, "# DBG need to search this word %s somewhere else\n", words[0]);
-	}
-#ifdef NOT
-    for (int w = 0; w < wordCount; w++)
-        {
-        if (startsWith("GC", words[w]))
-            {
-            char query[4096];
-            sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE '%s%%'", genarkTable, words[w]);
-            struct sqlResult *sr = sqlGetResult(conn, query);
-            char **row;
-            while ((row = sqlNextRow(sr)) != NULL)
-                {
-                struct genark *genome = genarkLoad(row);
-                ++itemCount;
-                if (jsonOutputArrays)
-                    {
-                    jsonWriteListStart(jw, NULL);
-                    jsonWriteString(jw, "gcAccession", genome->gcAccession);
-                    jsonWriteString(jw, "hubUrl", genome->hubUrl);
-                    jsonWriteString(jw, "asmName", genome->asmName);
-                    jsonWriteString(jw, "scientificName", genome->scientificName);
-                    jsonWriteString(jw, "commonName", genome->commonName);
-                    jsonWriteNumber(jw, "taxId", (long long)genome->taxId);
-                    jsonWriteListEnd(jw);
+	    verbose(0, "# DBG need to search this string '%s' somewhere else\n", pristineSearchString);
 	}
     else
-                    {
-                    jsonWriteObjectStart(jw, NULL);
-                    jsonWriteString(jw, "gcAccession", genome->gcAccession);
-                    jsonWriteString(jw, "hubUrl", genome->hubUrl);
-                    jsonWriteString(jw, "asmName", genome->asmName);
-                    jsonWriteString(jw, "scientificName", genome->scientificName);
-                    jsonWriteString(jw, "commonName", genome->commonName);
-                    jsonWriteNumber(jw, "taxId", (long long)genome->taxId);
-                    jsonWriteObjectEnd(jw);
-                    }
-                }
-            }
-        }
-#endif
+	verbose(0, "# DBG need to search this string '%s' somewhere else\n", pristineSearchString);
     }
 
 if (itemCount)
     apiFinishOutput(0, NULL, jw);
 else
     apiErrAbort(err400, err400Msg, "no genomes found matching search term %s='%s' for endpoint: /findGenome", argGenomeSearchTerm, inputSearchString);
 
 hDisconnectCentral(&conn);
 
 }