8d9253332f172e52c7ecb7fbc6ecde2bad30561f hiram Mon Feb 5 15:36:52 2024 -0800 with browserUrl and finding equivalent asmSummary for UCSC dbDb browser refs #32897 diff --git src/hg/hubApi/findGenome.c src/hg/hubApi/findGenome.c index 6152923..5bb88a8 100644 --- src/hg/hubApi/findGenome.c +++ src/hg/hubApi/findGenome.c @@ -79,106 +79,130 @@ jsonWriteNumber(jw, "genomeSize", (long long)el->genomeSize); jsonWriteNumber(jw, "genomeSizeUngapped", (long long)el->genomeSizeUngapped); jsonWriteDouble(jw, "gcPercent", (double)el->gcPercent); jsonWriteNumber(jw, "repliconCount", (long long)el->repliconCount); jsonWriteNumber(jw, "scaffoldCount", (long long)el->scaffoldCount); jsonWriteNumber(jw, "contigCount", (long long)el->contigCount); jsonWriteString(jw, "annotationProvider", el->annotationProvider); jsonWriteString(jw, "annotationName", el->annotationName); jsonWriteString(jw, "annotationDate", el->annotationDate); jsonWriteNumber(jw, "totalGeneCount", (long long)el->totalGeneCount); jsonWriteNumber(jw, "proteinCodingGeneCount", (long long)el->proteinCodingGeneCount); jsonWriteNumber(jw, "nonCodingGeneCount", (long long)el->nonCodingGeneCount); jsonWriteString(jw, "pubmedId", el->pubmedId); } +static void setBrowserUrl(struct jsonWrite *jw, char *browserId) +{ +char* host = getenv("HTTP_HOST"); +char browserUrl[1024]; +if (startsWith("GC", browserId)) + { + if (host) + safef(browserUrl, sizeof(browserUrl), "https://%s/h/%s", host, browserId); + else + safef(browserUrl, sizeof(browserUrl), "https://genome.ucsc.edu/h/%s", browserId); + } +else + { + if (host) + safef(browserUrl, sizeof(browserUrl), "https://%s/cgi-bin/hgTracks?db=%s", host, browserId); + else + safef(browserUrl, sizeof(browserUrl), "https://genome.ucsc.edu/cgi-bin/hgTracks?db=%s", browserId); + } +jsonWriteString(jw, "browserUrl", browserUrl); +} + static int outputCombo(struct jsonWrite *jw, struct combinedSummary *summaryList) /* may be information from any of the three tables */ { int itemCount = 0; 
struct combinedSummary *el = NULL; startGenomes(jw); for (el=summaryList; el != NULL; el=el->next) { if (el->summary) { if (jsonOutputArrays) { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "accession", el->summary->assemblyAccession); } else jsonWriteObjectStart(jw, el->summary->assemblyAccession); finiSummary(jw, el->summary); boolean sciNameDone = FALSE; boolean comNameDone = FALSE; if (el->genArk) { // jsonWriteString(jw, "gcAccession", el->genArk->gcAccession); /* some of these are going to be dups of the asmSummary */ + setBrowserUrl(jw, el->genArk->gcAccession); finiGenArk(jw, el->genArk); sciNameDone = TRUE; comNameDone = TRUE; // jsonWriteNumber(jw, "taxId", (long long)genome->taxId); } if (el->dbDb) { + setBrowserUrl(jw, el->dbDb->name); jsonWriteString(jw, "database", el->dbDb->name); jsonWriteString(jw, "description", el->dbDb->description); jsonWriteString(jw, "sourceName", el->dbDb->sourceName); if (!sciNameDone) jsonWriteString(jw, "scientificName", el->dbDb->scientificName); if (!comNameDone) jsonWriteString(jw, "commonName", el->dbDb->organism); // jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId); } } else if (el->genArk) /* ONLY genArk ?? - should have asmSummary TBD */ { if (jsonOutputArrays) { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "gcAccession", el->genArk->gcAccession); } else jsonWriteObjectStart(jw, el->genArk->gcAccession); - + setBrowserUrl(jw, el->genArk->gcAccession); finiGenArk(jw, el->genArk); if (el->dbDb) { jsonWriteBoolean(jw, "isUcscDatabase", TRUE); jsonWriteString(jw, "database", el->dbDb->name); jsonWriteString(jw, "description", el->dbDb->description); jsonWriteString(jw, "sourceName", el->dbDb->sourceName); // if (!sciNameDone) // jsonWriteString(jw, "scientificName", el->dbDb->scientificName); // if (!comNameDone) // jsonWriteString(jw, "commonName", el->dbDb->organism); // jsonWriteNumber(jw, "taxId", (long long)dbDbEntry->taxId); } } else if (el->dbDb) /* ONLY dbDb ? 
- should have the other two TBD */ { if (jsonOutputArrays) { jsonWriteObjectStart(jw, NULL); jsonWriteString(jw, "database", el->dbDb->name); } else jsonWriteObjectStart(jw, el->dbDb->name); + setBrowserUrl(jw, el->dbDb->name); jsonWriteBoolean(jw, "isUcscDatabase", TRUE); jsonWriteString(jw, "description", el->dbDb->description); jsonWriteString(jw, "sourceName", el->dbDb->sourceName); jsonWriteString(jw, "scientificName", el->dbDb->scientificName); jsonWriteString(jw, "commonName", el->dbDb->organism); jsonWriteNumber(jw, "taxId", (long long)el->dbDb->taxId); } jsonWriteObjectEnd(jw); ++itemCount; } endGenomes(jw); return (itemCount); } /* static int outputCombo(struct jsonWrite *jw, struct combinedSummary *summaryList) */ @@ -198,57 +222,110 @@ cs->genArk = NULL; cs->dbDb = NULL; char query[4096]; sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession = \"%s\"", genarkTable, el->assemblyAccession); struct genark *gA = genarkLoadByQuery(conn, query); if (gA) cs->genArk = gA; slAddHead(&comboSumary, cs); } } if (comboSumary) slReverse(&comboSumary); return comboSumary; } -static struct asmSummary *checkAsmSummary(struct sqlConnection *conn, char *oneAccession) -/* check the asmSummary table for the single accession LIKE */ +static struct asmSummary *checkAsmSummary(struct sqlConnection *conn, char *oneAccession, boolean exactMatch, long long limit) +/* check the asmSummary table for oneAccession, maybe exact, maybe a limit */ { char query[4096]; -sqlSafef(query, sizeof(query), "SELECT * FROM asmSummary WHERE assemblyAccession LIKE \"%s%%\"", oneAccession); +if (limit > 0) + { + if (exactMatch) + sqlSafef(query, sizeof(query), "SELECT * FROM asmSummary WHERE assemblyAccession = '%s' LIMIT %lld", oneAccession, limit); + else + sqlSafef(query, sizeof(query), "SELECT * FROM asmSummary WHERE assemblyAccession LIKE '%s%%' LIMIT %lld", oneAccession, limit); + } +else + { + if (exactMatch) + sqlSafef(query, sizeof(query), "SELECT * FROM asmSummary WHERE 
assemblyAccession = '%s'", oneAccession); + else + sqlSafef(query, sizeof(query), "SELECT * FROM asmSummary WHERE assemblyAccession LIKE '%s%%'", oneAccession); + } + struct asmSummary *list = asmSummaryLoadByQuery(conn, query); return list; } +// Field Type Null Key Default Extra +// source varchar(255) NO MUL NULL +// destination varchar(255) NO MUL NULL +// sourceAuthority enum('ensembl','ucsc','genbank','refseq') NO NULL +// destinationAuthority enum('ensembl','ucsc','genbank','refseq') NO NULL +// matchCount bigint(20) NO NULL +// sourceCount bigint(20) NO NULL +// destinationCount bigint(20) NO NULL + +struct asmSummary *dbDbAsmEquivalent(struct sqlConnection *centConn, char *dbDbName) +{ +struct asmSummary *itemReturn = NULL; +struct sqlConnection *conn = hAllocConn("hgFixed"); +char query[4096]; +sqlSafef(query, sizeof(query), "SELECT destination from asmEquivalent WHERE source='%s' AND sourceAuthority='ucsc' AND destinationAuthority='refseq' LIMIT 1", dbDbName); +char *asmIdFound = NULL; +char *genBank = NULL; +char *refSeq = sqlQuickString(conn, query); +if (refSeq) + asmIdFound = refSeq; +else + { + sqlSafef(query, sizeof(query), "SELECT destination from asmEquivalent WHERE source='%s' AND sourceAuthority='ucsc' AND destinationAuthority='genbank' LIMIT 1", dbDbName); + genBank = sqlQuickString(conn, query); + if (genBank) + asmIdFound = genBank; + } +if (asmIdFound) + { + char *words[3]; + int wordCount = chopString(asmIdFound, "_", words, ArraySize(words)); + if (wordCount > 2) + { + safef(query, sizeof(query), "%s_%s", words[0], words[1]); + itemReturn = checkAsmSummary(centConn, query, TRUE, 1); + } + } +return itemReturn; +} + static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) /* conn is a connection to hgcentral, searching for one word, * might be a UCSC database or a GenArk hub accession, or just some word. 
*/ { /* the input word might be a database alias * asmAliasFind returns the searchWord if no alias found */ char *perhapsAlias = asmAliasFind(searchWord); int itemCount = 0; if (startsWith("GC", perhapsAlias)) { struct asmSummary *asmSumFound = NULL; struct hash *asmSummaryHash = NULL; - asmSumFound = checkAsmSummary(conn, perhapsAlias); - verbose(0, "# found %d items\n", slCount(asmSumFound)); + asmSumFound = checkAsmSummary(conn, perhapsAlias, FALSE, 0); if (asmSumFound) { struct asmSummary *el; asmSummaryHash = newHash(0); for (el = asmSumFound; el != NULL; el = el->next) hashAdd(asmSummaryHash, el->assemblyAccession, (void *)el); } char query[4096]; sqlSafef(query, sizeof(query), "SELECT * FROM %s WHERE gcAccession LIKE \"%s%%\"", genarkTable, perhapsAlias); struct sqlResult *sr = sqlGetResult(conn, query); char **row; struct combinedSummary *comboOutput = NULL; while ((row = sqlNextRow(sr)) != NULL) { struct genark *genome = genarkLoad(row); @@ -282,33 +359,35 @@ } if (comboOutput) { slReverse(&comboOutput); itemCount = outputCombo(jw, comboOutput); } } /* if (startsWith("GC", perhapsAlias)) */ else { /* not a GC genArk name, perhaps a UCSC database */ if (hDbIsActive(perhapsAlias)) { struct combinedSummary *comboOutput = NULL; AllocVar(comboOutput); comboOutput->summary = NULL; comboOutput->genArk = NULL; - /* TBD need to get an equivalent GC accession name for UCSC database */ struct dbDb *dbDbEntry = hDbDb(perhapsAlias); comboOutput->dbDb = dbDbEntry; + struct asmSummary *sumList = dbDbAsmEquivalent(conn, perhapsAlias); + if (sumList) + comboOutput->summary = sumList; itemCount = outputCombo(jw, comboOutput); } } /* checked genArk and UCSC database */ if (0 == itemCount) /* not found in genark or ucsc db, check asmSummary */ { long long totalMatch = 0; /* no more than 100 items result please */ struct asmSummary *summaryList = asmSummaryFullText(conn, perhapsAlias, (long long) 100, &totalMatch); /* now check the genark table to see if there are any of those 
*/ struct combinedSummary *comboOutput = checkForGenArk(conn, summaryList); if (comboOutput) itemCount = outputCombo(jw, comboOutput); } return itemCount; } /* static int singleWordSearch(struct sqlConnection *conn, char *searchWord, struct jsonWrite *jw) */ @@ -323,52 +402,52 @@ char **row; while ((row = sqlNextRow(sr)) != NULL) { if (strlen(row[0]) < 1) jsonWriteNumber(jw, "na", sqlLongLong(row[1])); else jsonWriteNumber(jw, row[0], sqlLongLong(row[1])); } sqlFreeResult(&sr); jsonWriteObjectEnd(jw); } static void doStatsOnly(struct sqlConnection *conn, struct jsonWrite *jw) /* only count the items in each database */ { -jsonWriteString(jw, "description:", "counting items in tables: asmSummary, genark and dbDb for number of genomes available"); +jsonWriteString(jw, "description", "counting items in tables: asmSummary, genark and dbDb for number of genomes available"); char query[4096]; sqlSafef(query, sizeof(query), "SELECT count(*) FROM %s", genarkTable); long long genArkCount = sqlQuickLongLong(conn, query); sqlSafef(query, sizeof(query), "SELECT count(*) FROM asmSummary"); long long asmSummaryTotal = sqlQuickLongLong(conn, query); sqlSafef(query, sizeof(query), "SELECT count(*) FROM dbDb where active=1"); long long dbDbCount = sqlQuickLongLong(conn, query); long long totalCount = genArkCount + asmSummaryTotal + dbDbCount; jsonWriteNumber(jw, "totalCount", totalCount); jsonWriteObjectStart(jw, "genark"); -jsonWriteString(jw, "TBD:", "the genark table can count GCF vs. GCA and will have a clade category to couint\n"); +jsonWriteString(jw, "TBD", "the genark table can count GCF vs. 
GCA and will have a clade category to couint\n"); jsonWriteNumber(jw, "itemCount", genArkCount); jsonWriteObjectEnd(jw); jsonWriteObjectStart(jw, "dbDb"); -jsonWriteString(jw, "description:", "the dbDb table is a count of UCSC 'database' browser instances"); +jsonWriteString(jw, "description", "the dbDb table is a count of UCSC 'database' browser instances"); jsonWriteNumber(jw, "itemCount", dbDbCount); jsonWriteObjectEnd(jw); jsonWriteObjectStart(jw, "asmSummary"); -jsonWriteString(jw, "description:", "the asmSummary is the contents of the NCBI genbank/refseq assembly_summary.txt information"); -jsonWriteString(jw, "seeAlso:", "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/README_assembly_summary.txt"); +jsonWriteString(jw, "description", "the asmSummary is the contents of the NCBI genbank/refseq assembly_summary.txt information"); +jsonWriteString(jw, "seeAlso", "https://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/README_assembly_summary.txt"); jsonWriteNumber(jw, "asmSummaryTotal", asmSummaryTotal); asmSummaryGroup(conn, jw, "refseqCategory"); asmSummaryGroup(conn, jw, "versionStatus"); asmSummaryGroup(conn, jw, "assemblyLevel"); asmSummaryGroup(conn, jw, "releaseType"); asmSummaryGroup(conn, jw, "genomeRep"); asmSummaryGroup(conn, jw, "pairedAsmComp"); asmSummaryGroup(conn, jw, "assemblyType"); asmSummaryGroup(conn, jw, "phyloGroup"); jsonWriteObjectEnd(jw); } static void elapsedTime(struct jsonWrite *jw) { long nowTime = clock1000();