d947dcc75b04fd0dc921c0eb02ed988d7b4819f0 braney Mon Jun 30 16:00:56 2025 -0700 let genark hubs appear in hgConvert if we can figure out their UCSC org diff --git src/hg/lib/genark.c src/hg/lib/genark.c index 9d31bd1d39b..df599a2ae91 100644 --- src/hg/lib/genark.c +++ src/hg/lib/genark.c @@ -1,385 +1,467 @@ /* genark.c was originally generated by the autoSql program, which also * generated genark.h and genark.sql. This module links the database and * the RAM representation of objects. */ #include <limits.h> #include "common.h" #include "linefile.h" #include "dystring.h" #include "jksql.h" #include "genark.h" #include "hgConfig.h" #include "hdb.h" +#include "dbDb.h" char *genarkCommaSepFieldNames = "gcAccession,hubUrl,asmName,scientificName,commonName,taxId,priority,clade"; void genarkStaticLoad(char **row, struct genark *ret) /* Load a row from genark table into ret. The contents of ret will * be replaced at the next call to this function. */ { int colCount = genArkColumnCount(); ret->gcAccession = row[0]; ret->hubUrl = row[1]; ret->asmName = row[2]; ret->scientificName = row[3]; ret->commonName = row[4]; ret->taxId = sqlSigned(row[5]); ret->priority = 0; if (colCount > 6) { ret->priority = sqlSigned(row[6]); } if (colCount > 7) ret->clade = row[7]; else ret->clade = cloneString("n/a"); } struct genark *genarkLoadByQuery(struct sqlConnection *conn, char *query) /* Load all genark from table that satisfy the query given. * Where query is of the form 'select * from example where something=something' * or 'select example.* from example, anotherTable where example.something = * anotherTable.something'. * Dispose of this with genarkFreeList(). */ { struct genark *list = NULL, *el; struct sqlResult *sr; char **row; sr = sqlGetResult(conn, query); while ((row = sqlNextRow(sr)) != NULL) { el = genarkLoad(row); slAddHead(&list, el); } slReverse(&list); sqlFreeResult(&sr); return list; } void genarkSaveToDb(struct sqlConnection *conn, struct genark *el, char *tableName, int updateSize) /* Save genark as a row to the table specified by tableName. * As blob fields may be arbitrary size updateSize specifies the approx size * of a string that would contain the entire query. Arrays of native types are * converted to comma separated strings and loaded as such, User defined types are * inserted as NULL. This function automatically escapes quoted strings for mysql. */ { struct dyString *update = dyStringNew(updateSize); sqlDyStringPrintf(update, "insert into %s values ( '%s','%s','%s','%s','%s',%d,%d,'%s')", tableName, el->gcAccession, el->hubUrl, el->asmName, el->scientificName, el->commonName, el->taxId, el->priority, el->clade); sqlUpdate(conn, update->string); dyStringFree(&update); } struct genark *genarkLoad(char **row) /* Load a genark from row fetched with select * from genark * from database. Dispose of this with genarkFree(). */ { int colCount = genArkColumnCount(); struct genark *ret; AllocVar(ret); ret->gcAccession = cloneString(row[0]); ret->hubUrl = cloneString(row[1]); ret->asmName = cloneString(row[2]); ret->scientificName = cloneString(row[3]); ret->commonName = cloneString(row[4]); ret->taxId = sqlSigned(row[5]); ret->priority = 0; if (colCount > 6) { ret->priority = sqlSigned(row[6]); } if (colCount > 7) ret->clade = row[7]; else ret->clade = cloneString("n/a"); return ret; } struct genark *genarkLoadAll(char *fileName) /* Load all genark from a whitespace-separated file. * Dispose of this with genarkFreeList(). */ { struct genark *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[8]; while (lineFileRow(lf, row)) { el = genarkLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genark *genarkLoadAllByChar(char *fileName, char chopper) /* Load all genark from a chopper separated file. * Dispose of this with genarkFreeList(). */ { struct genark *list = NULL, *el; struct lineFile *lf = lineFileOpen(fileName, TRUE); char *row[8]; while (lineFileNextCharRow(lf, chopper, row, ArraySize(row))) { el = genarkLoad(row); slAddHead(&list, el); } lineFileClose(&lf); slReverse(&list); return list; } struct genark *genarkCommaIn(char **pS, struct genark *ret) /* Create a genark out of a comma separated string. * This will fill in ret if non-null, otherwise will * return a new genark */ { char *s = *pS; if (ret == NULL) AllocVar(ret); ret->gcAccession = sqlStringComma(&s); ret->hubUrl = sqlStringComma(&s); ret->asmName = sqlStringComma(&s); ret->scientificName = sqlStringComma(&s); ret->commonName = sqlStringComma(&s); ret->taxId = sqlSignedComma(&s); ret->priority = sqlSignedComma(&s); ret->clade = sqlStringComma(&s); *pS = s; return ret; } void genarkFree(struct genark **pEl) /* Free a single dynamically allocated genark such as created * with genarkLoad(). */ { struct genark *el; if ((el = *pEl) == NULL) return; freeMem(el->gcAccession); freeMem(el->hubUrl); freeMem(el->asmName); freeMem(el->scientificName); freeMem(el->commonName); freeMem(el->clade); freez(pEl); } void genarkFreeList(struct genark **pList) /* Free a list of dynamically allocated genark's */ { struct genark *el, *next; for (el = *pList; el != NULL; el = next) { next = el->next; genarkFree(&el); } *pList = NULL; } void genarkOutput(struct genark *el, FILE *f, char sep, char lastSep) /* Print out genark. Separate fields with sep. Follow last field with lastSep. */ { if (sep == ',') fputc('"',f); fprintf(f, "%s", el->gcAccession); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->hubUrl); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->asmName); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->scientificName); if (sep == ',') fputc('"',f); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->commonName); if (sep == ',') fputc('"',f); fputc(sep,f); fprintf(f, "%d", el->taxId); fputc(sep,f); fprintf(f, "%d", el->priority); fputc(sep,f); if (sep == ',') fputc('"',f); fprintf(f, "%s", el->clade); if (sep == ',') fputc('"',f); fputc(lastSep,f); } void genarkJsonOutput(struct genark *el, FILE *f) /* Print out genark in JSON format. */ { fputc('{',f); fputc('"',f); fprintf(f,"gcAccession"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->gcAccession); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"hubUrl"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->hubUrl); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"asmName"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->asmName); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"scientificName"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->scientificName); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"commonName"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->commonName); fputc('"',f); fputc(',',f); fputc('"',f); fprintf(f,"taxId"); fputc('"',f); fputc(':',f); fprintf(f, "%d", el->taxId); fputc(',',f); fputc('"',f); fprintf(f,"priority"); fputc('"',f); fputc(':',f); fprintf(f, "%d", el->priority); fputc(',',f); fputc('"',f); fprintf(f,"clade"); fputc('"',f); fputc(':',f); fputc('"',f); fprintf(f, "%s", el->clade); fputc('"',f); fputc('}',f); } /* -------------------------------- End autoSql Generated Code -------------------------------- */ char *genarkUrl(char *accession) /* Return the URL to the genark assembly with this accession if present, * otherwise return NULL * */ { char *genarkPrefix = cfgOption("genarkHubPrefix"); if (genarkPrefix == NULL) return NULL; struct sqlConnection *conn = hConnectCentral(); if (!sqlTableExists(conn, genarkTableName())) return NULL; char *url = NULL; char query[4096]; char buffer[4096]; sqlSafef(query, sizeof query, "select hubUrl from %s where gcAccession='%s'", genarkTableName(), accession); if (sqlQuickQuery(conn, query, buffer, sizeof buffer)) { char buffer2[4096]; safef(buffer2, sizeof buffer2, "%s/%s", genarkPrefix, buffer); url = cloneString(buffer2); } hDisconnectCentral(&conn); return url; } char *genArkPath(char *genome) /* given a GenArk hub genome name, e.g. GCA_021951015.1 return the path: * GCA/021/951/015 * prefix that with desired server URL: https://hgdownload.soe.ucsc.edu/hubs/ * if desired. Or suffix add /hub.txt to get the hub.txt URL * The path returned does not depend upon this GCx_ naming scheme, * it simply uses the hub URL as returned from genarkUrl(genome) and * returns the middle part without the https://... prefix */ { if (isEmpty(genome)) return NULL; char *url = genarkUrl(genome); if (isEmpty(url)) return NULL; char *genarkPrefix = cfgOption("genarkHubPrefix"); stripString(url, genarkPrefix); stripString(url, "/hub.txt"); stripString(url, genome); /* remove the trailing / */ trimLastChar(url); /* the ++url skips the leading / character*/ return cloneString(++url); } static char *_genarkTableName = NULL; char *genarkTableName() /* return the genark table name from the environment, * or hg.conf, or use the default. Cache the result */ { if (_genarkTableName == NULL) _genarkTableName = cfgOptionEnvDefault("HGDB_GENARK_STATUS_TABLE", genarkTableConfVariable, defaultGenarkTableName); return _genarkTableName; } /* temporary function while the genark table is in transistion with * new coluns being added, July 2024. Allows compatibility with existing * genark table. */ int genArkColumnCount() /* return number of columns in genark table */ { static int colCount = 0; if (colCount > 0) return colCount; char *centralProfile = "central"; char *centralDb = cfgOption2(centralProfile, "db"); struct sqlConnection *conn = hConnectCentral(); if (!sqlTableExists(conn, genarkTableName())) return colCount; char query[4096]; sqlSafef(query, sizeof query, "SELECT count(*) FROM information_schema.columns WHERE table_schema = '%s' AND table_name = '%s'", centralDb, genarkTableName()); colCount = sqlQuickNum(conn, query); hDisconnectCentral(&conn); return colCount; } boolean isGenArk(char *genome) /* given a genome name, see if it is in the genark table to determine * yes/no this is a genark genome assembly */ { if (isEmpty(genome)) return FALSE; char *url = genarkUrl(genome); if (isEmpty(url)) return FALSE; return TRUE; } + +struct dbDb * genarkMakeDbDb(char **row) +/* Fake a dbDb structure for a Genark hub. */ +{ +struct dbDb *dbDb; +struct hash *orgHash = genarkGetOrgHash(); + +AllocVar(dbDb); + +dbDb->name = cloneString(row[0]); +dbDb->nibPath = cloneString("genark"); +dbDb->description = cloneString(row[4]); // commonName +dbDb->scientificName = cloneString(row[3]); +dbDb->taxId = atoi(row[5]); +dbDb->genome = hashFindVal(orgHash, row[0]); +dbDb->orderKey = 99999; +dbDb->defaultPos = "default"; +if (dbDb->genome == NULL) + dbDb->genome = "Other"; + +return dbDb; +} + +struct dbDb *genarkLiftOverDbs(char *listOfAccs) +/* return list of dbDb structures for the genark genomes that match listOfAccs */ +{ +if (!cfgOption("genarkLiftOver")) + return NULL; +struct dbDb *list = NULL; +char query[64 * 1024]; + +safef(query, sizeof query, "NOSQLINJ select * from %s where gcAccession in (%s)", genarkTableName(), listOfAccs); +struct sqlConnection *conn = hConnectCentral(); +struct sqlResult *sr; +char **row; + +sr = sqlGetResult(conn, query); +while ((row = sqlNextRow(sr)) != NULL) + { + struct dbDb *dbDb = genarkMakeDbDb(row); + slAddHead(&list, dbDb); + } +slReverse(&list); +sqlFreeResult(&sr); +hDisconnectCentral(&conn); +return list; +} + +struct dbDb *genarkLiftOverDb(char *acc) +/* return dbDb structure for GC* acc */ +{ +char query[4096]; +safef(query, sizeof query, "'%s'", acc); + +return genarkLiftOverDbs(query); +} + +struct hash *genarkGetOrgHash() +/* read table that maps gcAccession to UCSC org. */ +{ +static struct hash *orgHash = NULL; + +if (orgHash != NULL) + return orgHash; +char query[64 * 1024]; + +sqlSafef(query, sizeof query, "select * from %s", "genarkOrg"); +struct sqlConnection *conn = hConnectCentral(); +struct sqlResult *sr; +char **row; +orgHash = newHash(0); + +sr = sqlGetResult(conn, query); +while ((row = sqlNextRow(sr)) != NULL) + { + hashAdd(orgHash, cloneString(row[0]), cloneString(row[1])); + } +sqlFreeResult(&sr); +hDisconnectCentral(&conn); +return orgHash; +}