ba87d8471c9c78f15412ccad2aaba9b46d1af38e
hiram
  Tue Jul 2 16:14:47 2024 -0700
eliminate the dependency upon specific genArk genome names, use the genark table for questions about existence, refs #32596

diff --git src/hg/lib/genark.c src/hg/lib/genark.c
index 0a660b7..9d31bd1 100644
--- src/hg/lib/genark.c
+++ src/hg/lib/genark.c
@@ -1,367 +1,385 @@
 /* genark.c was originally generated by the autoSql program, which also
  * generated genark.h and genark.sql.  This module links the database and
  * the RAM representation of objects. */
 
 #include <limits.h>
 #include "common.h"
 #include "linefile.h"
 #include "dystring.h"
 #include "jksql.h"
 #include "genark.h"
 #include "hgConfig.h"
 #include "hdb.h"
 
 /* Comma separated names of the genark fields, listed in the same order
  * that genarkLoad() reads them from a row (row[0] .. row[7]). */
 char *genarkCommaSepFieldNames = "gcAccession,hubUrl,asmName,scientificName,commonName,taxId,priority,clade";
 
 void genarkStaticLoad(char **row, struct genark *ret)
 /* Load a row from genark table into ret.  The contents of ret will
  * be replaced at the next call to this function.
  * Nothing is allocated here: the string fields of ret point into row (or
  * at a constant), so ret must not be handed to genarkFree().
  * priority and clade fall back to 0 and "n/a" when genArkColumnCount()
  * reports that the table does not have those columns. */
 {
 int colCount = genArkColumnCount();
 
 ret->gcAccession = row[0];
 ret->hubUrl = row[1];
 ret->asmName = row[2];
 ret->scientificName = row[3];
 ret->commonName = row[4];
 ret->taxId = sqlSigned(row[5]);
 ret->priority = (colCount > 6) ? sqlSigned(row[6]) : 0;
 /* a constant rather than cloneString("n/a"): this loader owns no memory,
  * so a fresh clone made on every call could never be released */
 ret->clade = (colCount > 7) ? row[7] : "n/a";
 }
 
 struct genark *genarkLoadByQuery(struct sqlConnection *conn, char *query)
 /* Run query on conn and convert every returned row into a genark.
  * The query should select whole genark rows, in the form
  * 'select * from example where something=something'
  * or 'select example.* from example, anotherTable where example.something =
  * anotherTable.something'.
  * The list is returned in result order.
  * Dispose of this with genarkFreeList(). */
 {
 struct genark *loaded = NULL;
 struct sqlResult *result = sqlGetResult(conn, query);
 char **fields;
 
 for (fields = sqlNextRow(result); fields != NULL; fields = sqlNextRow(result))
     {
     struct genark *item = genarkLoad(fields);
     slAddHead(&loaded, item);
     }
 /* items were pushed on the head, flip the list back into row order */
 slReverse(&loaded);
 sqlFreeResult(&result);
 return loaded;
 }
 
 void genarkSaveToDb(struct sqlConnection *conn, struct genark *el, char *tableName, int updateSize)
 /* Insert el as one row into the table named tableName, using conn.
  * updateSize is the approximate size of a string that would hold the whole
  * insert statement (blob fields may be of arbitrary size).  Arrays of native
  * types are converted to comma separated strings and loaded as such, user
  * defined types are inserted as NULL.  This function automatically escapes
  * quoted strings for mysql. */
 {
 struct dyString *insert = dyStringNew(updateSize);
 
 sqlDyStringPrintf(insert,
     "insert into %s values ( '%s','%s','%s','%s','%s',%d,%d,'%s')",
     tableName,
     el->gcAccession, el->hubUrl, el->asmName,
     el->scientificName, el->commonName,
     el->taxId, el->priority, el->clade);
 sqlUpdate(conn, insert->string);
 dyStringFree(&insert);
 }
 
 struct genark *genarkLoad(char **row)
 /* Load a genark from row fetched with select * from genark
  * from database.  Dispose of this with genarkFree().
  * Every string field is a private copy, so the result stays usable after
  * row has been reused or released.  priority and clade fall back to 0 and
  * "n/a" when genArkColumnCount() reports that the table does not have
  * those columns. */
 {
 int colCount = genArkColumnCount();
 struct genark *ret;
 
 AllocVar(ret);
 ret->gcAccession = cloneString(row[0]);
 ret->hubUrl = cloneString(row[1]);
 ret->asmName = cloneString(row[2]);
 ret->scientificName = cloneString(row[3]);
 ret->commonName = cloneString(row[4]);
 ret->taxId = sqlSigned(row[5]);
 ret->priority = 0;
 if (colCount > 6)
     ret->priority = sqlSigned(row[6]);
 if (colCount > 7)
     /* must be cloned like the other strings: genarkFree() calls freeMem()
      * on clade, and row does not belong to the returned struct */
     ret->clade = cloneString(row[7]);
 else
     ret->clade = cloneString("n/a");
 return ret;
 }
 
 struct genark *genarkLoadAll(char *fileName)
 /* Read fileName, a whitespace-separated file with one genark per line,
  * and return the records as a list in file order.
  * Dispose of this with genarkFreeList(). */
 {
 struct genark *loaded = NULL;
 struct lineFile *input = lineFileOpen(fileName, TRUE);
 char *fields[8];
 
 for (;;)
     {
     struct genark *item;
     if (!lineFileRow(input, fields))
         break;
     item = genarkLoad(fields);
     slAddHead(&loaded, item);
     }
 lineFileClose(&input);
 /* items were pushed on the head, flip the list back into file order */
 slReverse(&loaded);
 return loaded;
 }
 
 struct genark *genarkLoadAllByChar(char *fileName, char chopper)
 /* Read fileName, a file with one genark per line whose fields are
  * separated by the chopper character, and return the records as a list
  * in file order.
  * Dispose of this with genarkFreeList(). */
 {
 struct genark *loaded = NULL;
 struct lineFile *input = lineFileOpen(fileName, TRUE);
 char *fields[8];
 
 for (;;)
     {
     struct genark *item;
     if (!lineFileNextCharRow(input, chopper, fields, ArraySize(fields)))
         break;
     item = genarkLoad(fields);
     slAddHead(&loaded, item);
     }
 lineFileClose(&input);
 /* items were pushed on the head, flip the list back into file order */
 slReverse(&loaded);
 return loaded;
 }
 
 struct genark *genarkCommaIn(char **pS, struct genark *ret)
 /* Parse one genark from the comma separated string at *pS and advance
  * *pS past what was consumed.  The fields are stored in ret when it is
  * non-null, otherwise in a newly allocated genark; either way the filled
  * struct is returned. */
 {
 struct genark *filled = ret;
 char *cursor = *pS;
 
 if (filled == NULL)
     AllocVar(filled);
 
 /* fields are consumed in table column order */
 filled->gcAccession = sqlStringComma(&cursor);
 filled->hubUrl = sqlStringComma(&cursor);
 filled->asmName = sqlStringComma(&cursor);
 filled->scientificName = sqlStringComma(&cursor);
 filled->commonName = sqlStringComma(&cursor);
 filled->taxId = sqlSignedComma(&cursor);
 filled->priority = sqlSignedComma(&cursor);
 filled->clade = sqlStringComma(&cursor);
 
 *pS = cursor;
 return filled;
 }
 
 void genarkFree(struct genark **pEl)
 /* Release one dynamically allocated genark, such as genarkLoad() creates,
  * together with its string fields.  Does nothing when *pEl is NULL. */
 {
 struct genark *doomed = *pEl;
 
 if (doomed != NULL)
     {
     freeMem(doomed->gcAccession);
     freeMem(doomed->hubUrl);
     freeMem(doomed->asmName);
     freeMem(doomed->scientificName);
     freeMem(doomed->commonName);
     freeMem(doomed->clade);
     freez(pEl);
     }
 }
 
 void genarkFreeList(struct genark **pList)
 /* Release every genark on the list at *pList, then set *pList to NULL. */
 {
 struct genark *cur = *pList;
 
 while (cur != NULL)
     {
     /* remember the successor first, genarkFree() releases cur */
     struct genark *following = cur->next;
     genarkFree(&cur);
     cur = following;
     }
 *pList = NULL;
 }
 
 void genarkOutput(struct genark *el, FILE *f, char sep, char lastSep)
 /* Write the fields of el to f in table column order.  sep follows every
  * field except the last, which is followed by lastSep.  When sep is a
  * comma the string fields are wrapped in double quotes. */
 {
 const int quoteStrings = (sep == ',');
 char *leading[5] = {el->gcAccession, el->hubUrl, el->asmName,
                     el->scientificName, el->commonName};
 int i;
 
 /* the five string columns that come before the numeric ones */
 for (i = 0; i < 5; ++i)
     {
     if (quoteStrings)
         fputc('"', f);
     fprintf(f, "%s", leading[i]);
     if (quoteStrings)
         fputc('"', f);
     fputc(sep, f);
     }
 
 fprintf(f, "%d", el->taxId);
 fputc(sep, f);
 fprintf(f, "%d", el->priority);
 fputc(sep, f);
 
 if (quoteStrings)
     fputc('"', f);
 fprintf(f, "%s", el->clade);
 if (quoteStrings)
     fputc('"', f);
 fputc(lastSep, f);
 }
 
 void genarkJsonOutput(struct genark *el, FILE *f)
 /* Write el to f as a single JSON object.  Keys are the field names in
  * table column order; string fields are written between double quotes
  * exactly as stored (no escaping is applied), taxId and priority are
  * written as bare integers. */
 {
 fputs("{\"gcAccession\":\"", f);
 fprintf(f, "%s", el->gcAccession);
 fputs("\",\"hubUrl\":\"", f);
 fprintf(f, "%s", el->hubUrl);
 fputs("\",\"asmName\":\"", f);
 fprintf(f, "%s", el->asmName);
 fputs("\",\"scientificName\":\"", f);
 fprintf(f, "%s", el->scientificName);
 fputs("\",\"commonName\":\"", f);
 fprintf(f, "%s", el->commonName);
 /* the two numeric fields carry no quotes around their values */
 fputs("\",\"taxId\":", f);
 fprintf(f, "%d", el->taxId);
 fputs(",\"priority\":", f);
 fprintf(f, "%d", el->priority);
 fputs(",\"clade\":\"", f);
 fprintf(f, "%s", el->clade);
 fputs("\"}", f);
 }
 
 /* -------------------------------- End autoSql Generated Code -------------------------------- */
 
 char *genarkUrl(char *accession)
 /* Return the URL to the genark assembly with this accession if present,
  * otherwise return NULL.
  * NULL is returned when genarkHubPrefix is not configured, when the genark
  * table does not exist, or when no row has this gcAccession.
  * A non-NULL result is "<genarkHubPrefix>/<hubUrl>" in memory obtained
  * from cloneString(); the caller may freeMem() it. */
 {
 char *genarkPrefix = cfgOption("genarkHubPrefix");
 
 if (genarkPrefix == NULL)
     return NULL;
 
 char *url = NULL;
 struct sqlConnection *conn = hConnectCentral();
 
 /* no early return past this point: a missing table must also reach
  * hDisconnectCentral() so the connection is handed back */
 if (sqlTableExists(conn, genarkTableName()))
     {
     char query[4096];
     char buffer[4096];
     sqlSafef(query, sizeof query, "select hubUrl from %s where gcAccession='%s'", genarkTableName(), accession);
     if (sqlQuickQuery(conn, query, buffer, sizeof buffer))
         {
         char buffer2[4096];
         safef(buffer2, sizeof buffer2, "%s/%s", genarkPrefix, buffer);
 
         url = cloneString(buffer2);
         }
     }
 
 hDisconnectCentral(&conn);
 
 return url;
 }
 
-char *genArkHubTxt(char *gcX)
-/* given a GC[AF]_012345678.9 name, return hub.txt URL */
+char *genArkPath(char *genome)
+/* given a GenArk hub genome name, e.g. GCA_021951015.1 return the path:
+ *               GCA/021/951/015
+ * prefix that with desired server URL: https://hgdownload.soe.ucsc.edu/hubs/
+ *   if desired.
+ *   The path returned does not depend upon this GCx_ naming scheme,
+ *   it simply uses the hub URL as returned from genarkUrl(genome) and
+ *   removes the genarkHubPrefix, the "/hub.txt" suffix, the genome name
+ *   and the leading and trailing '/' from it.
+ * Returns NULL when genome is empty or genarkUrl() has no URL for it.
+ * A non-NULL result comes from cloneString(); the caller may freeMem() it.
+ */
 {
-char hubTxt[PATH_MAX + 1024];
-/* temporary construction of the path */
-char tPath[PATH_MAX + 1024];
-safencpy(tPath, 4, gcX, 3);
-safencpy(tPath+3, 2, "/", 1);
-safencpy(tPath+4, 4, gcX+4, 3);
-safencpy(tPath+7, 2, "/", 1);
-safencpy(tPath+8, 4, gcX+7, 3);
-safencpy(tPath+11, 2, "/", 1);
-safencpy(tPath+12, 4, gcX+10, 3);
-safencpy(tPath+15, 2, "/", 1);
-safecpy(tPath+16, PATH_MAX-16, gcX);
-/* start the result with the genArkHubPrefix, add in tPath and /hub.txt */
-safef(hubTxt, sizeof(hubTxt), "%s/%s/hub.txt", cfgOption("genarkHubPrefix"),
-   tPath);
-return cloneString(hubTxt);  // no need to free this
+if (isEmpty(genome))
+    return NULL;
+
+char *url = genarkUrl(genome);
+if (isEmpty(url))
+    {
+    freeMem(url);
+    return NULL;
+    }
+char *genarkPrefix = cfgOption("genarkHubPrefix");
+stripString(url, genarkPrefix);
+stripString(url, "/hub.txt");
+stripString(url, genome);
+/* remove the trailing / */
+trimLastChar(url);
+/* skip the leading / when there is one; url itself keeps pointing at the
+ * start of the allocation so that it can be released below */
+char *path = url;
+if (*path == '/')
+    ++path;
+char *result = cloneString(path);
+freeMem(url);
+return result;
 }
 
 /* filled in by the first call to genarkTableName() */
 static char *_genarkTableName = NULL;
 
 char *genarkTableName()
 /* Name of the genark table: taken from the HGDB_GENARK_STATUS_TABLE
  * environment variable, or hg.conf, or the default.  The answer is looked
  * up once and cached for later calls. */
 {
 if (_genarkTableName != NULL)
     return _genarkTableName;
 
 _genarkTableName = cfgOptionEnvDefault("HGDB_GENARK_STATUS_TABLE",
     genarkTableConfVariable, defaultGenarkTableName);
 return _genarkTableName;
 }
 
 /* temporary function while the genark table is in transition with
  * new columns being added, July 2024.  Allows compatibility with existing
  * genark table.
  */
 int genArkColumnCount()
 /* return number of columns in genark table.
  * The count is read from information_schema.columns, using the "db"
  * setting of the "central" profile as the schema name, and is cached once
  * a positive value has been obtained.  Returns 0, without caching, when
  * the genark table does not exist. */
 {
 static int colCount = 0;
 if (colCount > 0)
    return colCount;
 char *centralProfile = "central";
 char *centralDb = cfgOption2(centralProfile, "db");
 struct sqlConnection *conn = hConnectCentral();
 /* query only when the table exists, but fall through either way so the
  * connection always reaches hDisconnectCentral() */
 if (sqlTableExists(conn, genarkTableName()))
     {
     char query[4096];
     sqlSafef(query, sizeof query, "SELECT count(*) FROM information_schema.columns WHERE table_schema = '%s' AND table_name = '%s'", centralDb, genarkTableName());
     colCount = sqlQuickNum(conn, query);
     }
 hDisconnectCentral(&conn);
 return colCount;
 }
+
+boolean isGenArk(char *genome)
+/* given a genome name, see if it is in the genark table to determine
+ *  yes/no this is a genark genome assembly.
+ *  FALSE is also the answer for a NULL or empty name, and whenever
+ *  genarkUrl() returns no URL for the name.
+ */
+{
+if (isEmpty(genome))
+    return FALSE;
+char *url = genarkUrl(genome);
+boolean found = !isEmpty(url);
+/* genarkUrl() returns a fresh cloneString(); only its existence matters
+ * here, so release it before returning */
+freeMem(url);
+return found;
+}