be00edb2a6ef8c715b715cf0261572feead28c21 hiram Sat Feb 3 13:11:41 2024 -0800 beginning to run FULLTEXT search on asmSummary table refs #23589 diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c index 47b66dd..452c37f 100644 --- src/hg/hubApi/findGenome.c +++ src/hg/hubApi/findGenome.c @@ -1,86 +1,282 @@ /* findGenome search functions */ #include "dataApi.h" #include "hgFind.h" #include "cartTrackDb.h" #include "cartJson.h" #include "genark.h" +#include "asmAlias.h" +#include "asmSummary.h" +static void startGenomes(struct jsonWrite *jw) +/* begin the genomes output: a JSON list when jsonOutputArrays is set, otherwise a JSON object */ +{ +if (jsonOutputArrays) + jsonWriteListStart(jw, "genomes"); +else + jsonWriteObjectStart(jw, "genomes"); +} + +static void endGenomes(struct jsonWrite *jw) +/* end the genomes output: closes the list or the object using the same jsonOutputArrays test as startGenomes */ +{ +if (jsonOutputArrays) + jsonWriteListEnd(jw); +else + jsonWriteObjectEnd(jw); +} + +static int outputFullText(struct jsonWrite *jw, struct asmSummary *summaryList) /* write every element of summaryList to jw between startGenomes and endGenomes; with jsonOutputArrays each element is an object started with a NULL name that carries an accession field, otherwise each object is started with its assemblyAccession as the name; returns the number of elements written */ +{ +int itemCount = 0; +/* XXX TBD need to get common names for these items */ +struct asmSummary *el = NULL; +startGenomes(jw); +for (el=summaryList; el != NULL; el=el->next) + { + if (jsonOutputArrays) + { + jsonWriteObjectStart(jw, NULL); + jsonWriteString(jw, "accession", el->assemblyAccession); + jsonWriteString(jw, "asmName", el->asmName); + jsonWriteString(jw, "asmSubmitter", el->asmSubmitter); + jsonWriteString(jw, "annotationProvider", el->annotationProvider); + jsonWriteString(jw, "assemblyType", el->assemblyType); + jsonWriteString(jw, "bioProject", el->bioproject); + jsonWriteString(jw, "bioSample", el->biosample); + jsonWriteString(jw, "organismName", el->organismName); + jsonWriteString(jw, "isolate", el->isolate); + jsonWriteNumber(jw, "taxId", (long long)el->taxId); + jsonWriteObjectEnd(jw); + } + else + { + jsonWriteObjectStart(jw, el->assemblyAccession); + jsonWriteString(jw, "asmName", el->asmName); + jsonWriteString(jw, "asmSubmitter", el->asmSubmitter); + jsonWriteString(jw, "annotationProvider", el->annotationProvider); + 
jsonWriteString(jw, "assemblyType", el->assemblyType); + jsonWriteString(jw, "bioProject", el->bioproject); + jsonWriteString(jw, "bioSample", el->biosample); + jsonWriteString(jw, "organismName", el->organismName); + jsonWriteString(jw, "isolate", el->isolate); + jsonWriteNumber(jw, "taxId", (long long)el->taxId); + jsonWriteObjectEnd(jw); + } + ++itemCount; + } +endGenomes(jw); +return (itemCount); +} /* static int outputFullText(struct jsonWrite *jw, struct asmSummary *summaryList) */ + +static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) +/* conn is a connection to hgcentral, searching for one word, + * might be a database or a GenArk hub. + * The word is first passed through asmAliasFind. A result starting with GC + * is matched against the genark table and each matching accession is then + * looked up in asmSummary; otherwise an active UCSC database is reported + * from its dbDb entry; otherwise asmSummaryFullText is tried. + * Returns the count of items found, zero when no branch found anything. + */ +{ +/* the input word might be a database alias + * asmAliasFind returns the searchWord if no alias found + */ +char *perhapsAlias = asmAliasFind(searchWord); +char *genarkTable = genarkTableName(); +int itemCount = 0; +if (startsWith("GC", perhapsAlias)) + { + char query[4096]; + sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE \"%s%%\"", genarkTable, perhapsAlias); + struct sqlResult *sr = sqlGetResult(conn, query); + char **row; + /* XXXX - TBD need to decide on a common set of output items */ +// startGenomes(jw); + struct genark *genArkList = NULL; + while ((row = sqlNextRow(sr)) != NULL) + { + struct genark *genome = genarkLoad(row); + slAddHead(&genArkList, genome); + ++itemCount; +if (1 == 0) { /* always false, so the JSON output in this block is never executed */ + if (jsonOutputArrays) + { + jsonWriteObjectStart(jw, NULL); + jsonWriteString(jw, "gcAccession", genome->gcAccession); + jsonWriteString(jw, "hubUrl", genome->hubUrl); + jsonWriteString(jw, "asmName", genome->asmName); + jsonWriteString(jw, "scientificName", genome->scientificName); + jsonWriteString(jw, "commonName", genome->commonName); + jsonWriteNumber(jw, "taxId", (long long)genome->taxId); + jsonWriteObjectEnd(jw); + } + else + { + jsonWriteObjectStart(jw, genome->gcAccession); + jsonWriteString(jw, "hubUrl", genome->hubUrl); + jsonWriteString(jw, 
"asmName", genome->asmName); + jsonWriteString(jw, "scientificName", genome->scientificName); + jsonWriteString(jw, "commonName", genome->commonName); + jsonWriteNumber(jw, "taxId", (long long)genome->taxId); + jsonWriteObjectEnd(jw); + } +} + } +// endGenomes(jw); + sqlFreeResult(&sr); + slReverse(&genArkList); + struct asmSummary *summaryList = NULL; + struct genark *el = NULL; + for (el = genArkList; el != NULL; el = el->next) + { + char asmSumQuery[4096]; + sqlSafef(asmSumQuery, sizeof(asmSumQuery), "SELECT * FROM asmSummary WHERE assemblyAccession = \"%s\"", el->gcAccession); /* NOTE(review): the declaration that follows introduces a second el that shadows the genark loop variable for the rest of this block, and its value goes to slAddHead without a NULL check - confirm asmSummaryLoadByQuery cannot return NULL for an accession missing from asmSummary */ + struct asmSummary *el = asmSummaryLoadByQuery(conn, asmSumQuery); + slAddHead(&summaryList, el); + } + if (summaryList) + { /* replaces the genark row count accumulated in the loop above; when summaryList is NULL that row count is what gets returned although nothing was written to jw in this branch */ + itemCount = outputFullText(jw, summaryList); + } + } else { /* not a GC genArk name, perhaps a UCSC database */ + if (hDbIsActive(perhapsAlias)) + { + struct dbDb *dbDbEntry = hDbDb(perhapsAlias); + startGenomes(jw); + if (jsonOutputArrays) + { + jsonWriteObjectStart(jw, NULL); + jsonWriteString(jw, "database", dbDbEntry->name); + jsonWriteString(jw, "description", dbDbEntry->description); + jsonWriteString(jw, "sourceName", dbDbEntry->sourceName); + jsonWriteString(jw, "scientificName", dbDbEntry->scientificName); + jsonWriteString(jw, "commonName", dbDbEntry->organism); + jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId); + jsonWriteObjectEnd(jw); + } + else + { + jsonWriteObjectStart(jw, dbDbEntry->name); + jsonWriteString(jw, "description", dbDbEntry->description); + jsonWriteString(jw, "sourceName", dbDbEntry->sourceName); + jsonWriteString(jw, "scientificName", dbDbEntry->scientificName); + jsonWriteString(jw, "commonName", dbDbEntry->organism); + jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId); + jsonWriteObjectEnd(jw); + } + endGenomes(jw); + ++itemCount; + } + else /* not genArk and not UCSC, check the asmSummary data */ + { + long long totalMatch = 0; + /* no more than 100 items result please */ + struct asmSummary *summaryList = 
asmSummaryFullText(conn, perhapsAlias, (long long) 100, &totalMatch); + if (summaryList) + { + itemCount = outputFullText(jw, summaryList); + } + } + } +return itemCount; +} /* static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) */ void apiFindGenome(char *pathString[MAX_PATH_INFO]) /* 'findGenome' function: reads the argGenomeSearchTerm CGI argument, requires one to five whitespace separated words, runs singleWordSearch for a single word or asmSummaryFullText on the whole string otherwise, then finishes the JSON output when items were found or calls apiErrAbort with err400 when none were */ { char *searchString = cgiOptionalString(argGenomeSearchTerm); /* NOTE(review): presumably NULL when the argument is absent - confirm cloneString and chopByWhite below tolerate that */ char *inputSearchString = cloneString(searchString); char *extraArgs = verifyLegalArgs(argFindGenome); if (extraArgs) apiErrAbort(err400, err400Msg, "extraneous arguments found for function /findGenome'%s'", extraArgs); struct sqlConnection *conn = hConnectCentral(); char *genarkTable = genarkTableName(); if (!sqlTableExists(conn, genarkTable)) apiErrAbort(err500, err500Msg, "missing central.genark table in function /findGenome'%s'", extraArgs); /* verify number of words in search string is legal */ int wordCount = chopByWhite(searchString, NULL, 0); if (wordCount < 1) /* NOTE(review): the %s in this message is filled with the argument name argGenomeSearchTerm, not the search string - confirm that is intended */ apiErrAbort(err400, err400Msg, "search term '%s' does not contain a word ? 
for function /findGenome", argGenomeSearchTerm); if (wordCount > 5) apiErrAbort(err400, err400Msg, "search term '%s=%s' should not have more than 5 words for function /findGenome", argGenomeSearchTerm, searchString); struct jsonWrite *jw = apiStartOutput(); jsonWriteString(jw, argGenomeSearchTerm, searchString); -int itemsFound = 0; +int itemCount = 0; +/* save the search string before it is chopped */ +char *pristineSearchString = cloneString(searchString); char **words; AllocArray(words, wordCount); (void) chopByWhite(searchString, words, wordCount); +if (1 == wordCount) + { + itemCount = singleWordSearch(conn, words[0], jw); + if (itemCount) + jsonWriteNumber(jw, "itemCount", itemCount); + else + verbose(0, "# DBG need to search this word %s somewhere else\n", words[0]); + } +else + { + long long totalMatch = 0; + /* no more than 100 items result please */ + struct asmSummary *summaryList = asmSummaryFullText(conn, pristineSearchString, (long long) 100, &totalMatch); + if (summaryList) + { + itemCount = outputFullText(jw, summaryList); + if (itemCount) + jsonWriteNumber(jw, "itemCount", itemCount); + else + verbose(0, "# DBG need to search this word %s somewhere else\n", words[0]); + } +#ifdef NOT for (int w = 0; w < wordCount; w++) { if (startsWith("GC", words[w])) { char query[4096]; sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE '%s%%'", genarkTable, words[w]); struct sqlResult *sr = sqlGetResult(conn, query); char **row; while ((row = sqlNextRow(sr)) != NULL) { struct genark *genome = genarkLoad(row); - ++itemsFound; + ++itemCount; if (jsonOutputArrays) { jsonWriteListStart(jw, NULL); jsonWriteString(jw, "gcAccession", genome->gcAccession); jsonWriteString(jw, "hubUrl", genome->hubUrl); jsonWriteString(jw, "asmName", genome->asmName); jsonWriteString(jw, "scientificName", genome->scientificName); jsonWriteString(jw, "commonName", genome->commonName); jsonWriteNumber(jw, "taxId", (long long)genome->taxId); jsonWriteListEnd(jw); } else 
{ jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "gcAccession", genome->gcAccession); jsonWriteString(jw, "hubUrl", genome->hubUrl); jsonWriteString(jw, "asmName", genome->asmName); jsonWriteString(jw, "scientificName", genome->scientificName); jsonWriteString(jw, "commonName", genome->commonName); jsonWriteNumber(jw, "taxId", (long long)genome->taxId); jsonWriteObjectEnd(jw); } } } } /* the per-word genark loop above sits between #ifdef NOT and this #endif, so it is only compiled when NOT is defined */ +#endif + } -if (itemsFound) +if (itemCount) apiFinishOutput(0, NULL, jw); else apiErrAbort(err400, err400Msg, "no genomes found matching search term %s='%s' for endpoint: /findGenome", argGenomeSearchTerm, inputSearchString); /* NOTE(review): this disconnect only runs if the call taken above returns; whether apiFinishOutput or apiErrAbort return is not visible here */ hDisconnectCentral(&conn); }