f8058232ef369389e7bca4eac46901e8eca76c9a galt Fri Mar 12 13:29:57 2021 -0800 Improvements to hgBlat to tolerate and explain limitations of using dynamic blat servers with the Search All feature. refs #26658 diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c index ea57b9d..e0f258b 100644 --- src/hg/hgBlat/hgBlat.c +++ src/hg/hgBlat/hgBlat.c @@ -268,36 +268,43 @@ for (pfd = pfdRunning; pfd; pfd = pfd->next) { // unfinished query char temp[1024]; safef(temp, sizeof temp, "Timeout %d milliseconds exceeded processing %s %s", maxTimeInMilliseconds, pfd->genome, pfd->db); pfd->networkErrMsg = cloneString(temp); pfd->error = TRUE; ++errCount; } for (pfd = pfdDone; pfd; pfd = pfd->next) { // some done queries may have errors if (pfd->error) ++errCount; } +slCat(pfdDone, pfdRunning); +pfdRunning = NULL; +slCat(pfdDone, pfdNeverStarted); +pfdNeverStarted = NULL; pthread_mutex_unlock( &pfdMutex ); return errCount; } // ================== +int nonHubDynamicBlatServerCount = 0; +int hubDynamicBlatServerCount = 0; + struct serverTable /* Information on a server. */ { char *db; /* Database name. */ char *genome; /* Genome name. */ boolean isTrans; /* Is tranlated to protein? */ char *host; /* Name of machine hosting server. */ char *port; /* Port that hosts server. */ char *nibDir; /* Directory of sequence files. */ int tileSize; /* gfServer -tileSize */ int stepSize; /* gfServer -stepSize */ int minMatch; /* gfServer -minMatch */ boolean isDynamic; /* is a dynamic server */ char* genomeDataDir; /* genome name for dynamic gfServer */ }; @@ -348,30 +355,31 @@ "here " "to reset to default database.", (isTrans ? 
"translated" : "DNA"), db, cartSidUrlString(cart), hDefaultDb()); } st->db = cloneString(row[0]); st->genome = cloneString(row[1]); st->isTrans = atoi(row[2]); st->host = cloneString(row[3]); st->port = cloneString(row[4]); st->nibDir = hReplaceGbdbSeqDir(row[5], st->db); if (atoi(row[6])) { st->isDynamic = TRUE; st->genomeDataDir = cloneString(st->db); // directories by database name for database genomes + ++nonHubDynamicBlatServerCount; } sqlFreeResult(&sr); hDisconnectCentral(&conn); return st; } static struct serverTable *trackHubServerTable(char *db, boolean isTrans) /* Load blat table for a hub */ { char *host, *port; char *genomeDataDir; if (!trackHubGetBlatParams(db, isTrans, &host, &port, &genomeDataDir)) errAbort("Cannot get blat server parameters for track hub with database %s", db); @@ -382,30 +390,31 @@ st->db = cloneString(db); st->genome = cloneString(hGenome(db)); st->isTrans = isTrans; st->host = host; st->port = port; struct trackHubGenome *genome = trackHubGetGenome(db); st->nibDir = cloneString(genome->twoBitPath); char *ptr = strrchr(st->nibDir, '/'); // we only want the directory name if (ptr != NULL) *ptr = 0; if (genomeDataDir != NULL) { st->isDynamic = TRUE; st->genomeDataDir = cloneString(genomeDataDir); + ++hubDynamicBlatServerCount; } return st; } struct serverTable *findServer(char *db, boolean isTrans) /* Return server for given database. Db can either be * database name or description. 
*/ { if (trackHubDatabase(db)) return trackHubServerTable(db, isTrans); else return databaseServerTable(db, isTrans); } void findClosestServer(char **pDb, char **pOrg) @@ -887,59 +896,67 @@ else { safef(shortName, sizeof shortName, "blat %s+%d", names->name, count - 1); safef(description, sizeof description, "blat on %d queries (%s, %s, ...)", count, names->name, names->next->name); } *pName = makeNameUnique(shortName, database, cart); *pDescription = cloneString(description); } void queryServer(char *host, char *port, char *db, struct dnaSeq *seq, char *type, char *xType, boolean complex, boolean isProt, boolean queryRC, int seqNumber, char *genomeDataDir) /* Send simple query to server and report results. (no, it doesn't do this) * queryRC is true when the query has been reverse-complemented */ { -/* - * xinetd throttles by refusing more connections, which causes queries to fail - * when the configured limit is reached. Rather than trying to throttle in the - * client, dynamic servers are excluded. See issue #26658. - */ -if (genomeDataDir != NULL) - return; struct genomeHits *gH; AllocVar(gH); gH->host=cloneString(host); gH->port=cloneString(port); gH->db = cloneString(db); gH->genome = cloneString(hGenome(db)); gH->seqNumber = seqNumber; gH->faName = cloneString(seq->name); gH->dna = cloneString(seq->dna); gH->dnaSize = seq->size; gH->type = cloneString(type); gH->xType = cloneString(xType); gH->queryRC = queryRC; gH->complex = complex; gH->isProt = isProt; gH->isDynamic = (genomeDataDir != NULL); gH->genomeDataDir = genomeDataDir; gH->dbg = dyStringNew(256); + +/* SKIP DYNAMIC SERVERS + * xinetd throttles by refusing more connections, which causes queries to fail + * when the configured limit is reached. Rather than trying to throttle in the + * client, dynamic servers are excluded. See issue #26658. 
+ */ +if (gH->isDynamic) + { + gH->error = TRUE; + gH->networkErrMsg = cloneString("Skipped Dynamic Server"); + slAddHead(&pfdDone, gH); + } +else + { slAddHead(&pfdList, gH); } +} void findBestGene(struct genomeHits *gH, int queryFrame) /* Find best gene-like object with multiple linked-features. * Remember chrom start end of best gene found and total hits in the gene. * Should sort the gfResults by tStrand, chrom, tStart. * Filters on queryFrame */ { char *bestChrom = NULL; int bestHits = 0; int bestTStart = 0; int bestTEnd = 0; int bestExons = 0; char bestTStrand = ' '; char bestQStrand = ' '; @@ -1736,31 +1753,31 @@ "document.mainForm.submit();"; char *userSeq = NULL; char *type = NULL; printf( "
\n" "

BLAT Search Genome

\n"); cartSaveSession(cart); puts("\n"); puts("\n"); puts("\n"); printf("\n"); printf(""); // clicking on the Search ALL text clicks the checkbox. jsOnEventById("click", "searchAllText", "document.mainForm.allGenomes.click();" "return false;" // cancel the default ); printf(""); printf(""); printf(""); printf(""); printf(""); printf("\n"); printf("\n"); @@ -1817,32 +1834,35 @@ puts("
Genome:"); printf(" "); -printf(" Search all"); +printf(" Search all genomes on dedicated BLAT servers."); printf("Assembly:Query type:Sort output:Output type: 
\n"); puts("
File Upload: "); puts("Rather than pasting a sequence, you can choose to upload a text file containing " "the sequence.
"); puts("Upload sequence: "); puts("

\n"); printf("%s", "

Only DNA sequences of 25,000 or fewer bases and protein or translated \n" "sequence of 10000 or fewer letters will be processed. Up to 25 sequences\n" "can be submitted at the same time. The total limit for multiple sequence\n" "submissions is 50,000 bases or 25,000 letters.
A valid example " "is GTCCTCGGAACCAGGACCTCGGCGTGGCCTAGCG (human SOD1).\n

\n"); printf("%s", -"

The Search all checkbox allows you to search all\n" -"genomes at the same time. It will query the default assembly of every organism and BLAT servers of attached hubs.\n"); +"

The Search all checkbox allows you to search all genomes at the same time. " +"Search all is only available for default assemblies and attached hubs with dedicated BLAT servers." +"The new dynamic BLAT servers enable searching on unlimited numbers of genomes " +"using a fixed amount of memory. However, because of the time required to swap, those assemblies are skipped, " +"and do not appear in the output.\n"); printf("

The All Results checkbox disables minimum matches filtering so all results are seen." " For example, with a human dna search, 20 is minimum matches required, based on the genome size, to filter out lower-quality results.\n" "This checkbox can be useful with short queries and with the tiny genomes of microorganisms. \n" ); if (hgPcrOk(db)) printf("

For locating PCR primers, use In-Silico PCR" " for best results instead of BLAT.

", db); puts("
\n"); printf("
\n"); @@ -1916,31 +1936,31 @@ struct genomeHits* gH2 = NULL; for (;gH1; gH1 = gH2->next) { gH2 = gH1->next; if (!gH2) errAbort("Hiding weaker of pairs found one without sibling."); if (!((gH1->seqNumber == gH2->seqNumber) && sameString(gH1->db, gH2->db) && (gH1->queryRC != gH2->queryRC))) errAbort("Error matching pairs, sibling does not match seqNumber and db."); // check if one or the other had an error if (gH1->error && gH2->error) gH2->hide = TRUE; // arbitrarily else if (gH1->error && !gH2->error) gH1->hide = TRUE; else if (!gH1->error && gH2->error) gH2->hide = TRUE; - else // keep the best scoring or the pair, hide the other + else // keep the best scoring of the pair, hide the other { if (gH2->maxGeneHits > gH1->maxGeneHits) gH1->hide = TRUE; else gH2->hide = TRUE; } } } void changeMaxGenePositionToPositiveStrandCoords(struct genomeHits *gH) /* convert negative strand coordinates to positive strand coordinates if TStrand=='-' */ { for (;gH; gH = gH->next) { if (gH->hide) @@ -2095,30 +2115,33 @@ cartWebEnd(); } else { if (allGenomes) { cartWebStart(cart, db, "All Genomes BLAT Results"); struct dbDb *dbList = hGetBlatIndexedDatabases(); struct dbDb *this = NULL; char *saveDb = db; char *saveOrg = organism; struct sqlConnection *conn = hConnectCentral(); int dbCount = 0; + nonHubDynamicBlatServerCount = 0; + hubDynamicBlatServerCount = 0; + for(this = dbList; this; this = this->next) { db = this->name; organism = hGenome(db); if (!trackHubDatabase(db)) // if not hub db, make sure it is the default assembly. { char query[256]; sqlSafef(query, sizeof query, "select name from defaultDb where genome='%s'", organism); char *defaultDb = sqlQuickString(conn, query); if (!sameOk(defaultDb, db)) continue; // skip non-default dbs } blatSeq(skipLeadingSpaces(userSeq), organism, db, dbCount); @@ -2264,35 +2287,48 @@ } printf("\n"); jsOnEventByIdF("click", id, "document.mainForm.org.value=\"%s\";" // some have single-quotes in their value. 
"document.mainForm.db.value='%s';" "document.mainForm.submit();" "return false;" // cancel the default link url , gH->genome, gH->db ); idCount++; } printf("\n"); } - printf("

\n"); + printf("
\n"); if (debuggingGfResults) printDebugging(); + if (hubDynamicBlatServerCount > 0 || nonHubDynamicBlatServerCount > 0) + { + printf("Dedicated static BLAT servers are fast but require lots of memory, processors and machines.
\n" + "Dynamic BLAT servers only require disk space, and can support an unlimited numbers of genomes,
\n" + "however they take time to swap indexes into memory and have limited parallelism.
\n" + "The BLAT All Genomes feature does not currently support dynamic BLAT servers.
\n"); + if (nonHubDynamicBlatServerCount > 0) + printf( "Number of dynamic BLAT genomes at this site: %d
\n", nonHubDynamicBlatServerCount); + if (hubDynamicBlatServerCount > 0) + printf( "Number of dynamic BLAT genomes on attached hubs: %d
\n", hubDynamicBlatServerCount); + } + printf( "
\n"); + fakeAskForSeqForm(organism, db); } else { printf("No input sequences provided.

\n"); } cartWebEnd(); } else blatSeq(skipLeadingSpaces(userSeq), organism, db, 0); } } /* Null terminated list of CGI Variables we don't want to save