7d0af5fc6dbc224c4131e6d6a43b09a8f661502b galt Thu Oct 8 23:09:56 2020 -0700 Improving hgBlat by moving the gfServer connect from being done up front to being done in the per-thread routine. This means it should use fewer connections in worst-case scenarios, and also means that gfServer will not have to wait as long. diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c index 1d14f95..2251b88 100644 --- src/hg/hgBlat/hgBlat.c +++ src/hg/hgBlat/hgBlat.c @@ -53,30 +53,32 @@ int tStart; /* Target Start Coordinate */ int tEnd; /* Target End Coordinate */ int numHits; /* number of tile hits, minimum 2 for dna */ char tStrand; /* + or - Target Strand used with prot, rnax, dnax */ int tFrame; /* Target Frame 0,1,2 (mostly ignorable?) used with prot, rnax, dnax */ int qFrame; /* Query Frame 0,1,2 (mostly ignorable?) used with rnax, dnax*/ char qStrand; /* + or - Query Strand used with prot, rnax, dnax, given by caller rather than returned by gfServer. */ }; struct genomeHits /* Information about hits on a genome assembly */ { struct genomeHits *next; + char *host; /* Host. */ + char *port; /* Port. */ char *db; /* Database name. */ char *genome; /* Genome name. 
*/ int seqNumber; /* Submission order */ char *faName; /* fasta name */ char *dna; /* query dna */ int dnaSize; /* query dna size */ int sd; /* Connection */ char *type; /* query type = query, protQuery, transQuery */ char *xType; /* query type = dna, prot, rnax, dnax */ boolean queryRC; /* is the query reverse-complemented */ boolean complex; /* is the query complex */ boolean isProt; /* is the protein query */ int maxGeneHits; /* Highest gene hit-count */ char *maxGeneChrom; /* Target Chrom for gene with max gene hits */ @@ -832,55 +834,52 @@ else if (count == 2) { safef(shortName, sizeof shortName, "blat %s+%d", names->name, count - 1); safef(description, sizeof description, "blat on %d queries (%s, %s)", count, names->name, names->next->name); } else { safef(shortName, sizeof shortName, "blat %s+%d", names->name, count - 1); safef(description, sizeof description, "blat on %d queries (%s, %s, ...)", count, names->name, names->next->name); } *pName = makeNameUnique(shortName, database, cart); *pDescription = cloneString(description); } -void queryServer(int conn, char *db, struct dnaSeq *seq, char *type, char *xType, +void queryServer(char *host, char *port, char *db, struct dnaSeq *seq, char *type, char *xType, boolean complex, boolean isProt, boolean queryRC, int seqNumber) /* Send simple query to server and report results. 
* queryRC is true when the query has been reverse-complemented */ { struct genomeHits *gH; AllocVar(gH); +gH->host=cloneString(host); +gH->port=cloneString(port); gH->db = cloneString(db); gH->genome = cloneString(hGenome(db)); gH->seqNumber = seqNumber; gH->faName = cloneString(seq->name); gH->dna = cloneString(seq->dna); gH->dnaSize = seq->size; gH->type = cloneString(type); gH->xType = cloneString(xType); gH->queryRC = queryRC; gH->complex = complex; gH->isProt = isProt; -gH->sd = conn; -if (gH->sd == -1) - { - gH->error = TRUE; - gH->networkErrMsg = "Connection to gfServer failed."; - } + gH->dbg = dyStringNew(256); slAddHead(&pfdList, gH); } void findBestGene(struct genomeHits *gH, int queryFrame) /* Find best gene-like object with multiple linked-features. * Remember chrom start end of best gene found and total hits in the gene. * Should sort the gfResults by tStrand, chrom, tStart. * Filters on queryFrame */ { char *bestChrom = NULL; int bestHits = 0; int bestTStart = 0; int bestTEnd = 0; int bestExons = 0; @@ -963,47 +962,67 @@ return; int qFactor = 3; int tFactor = 3; if (gH->isProt) qFactor = 1; struct gfResult *gfR = NULL; for(gfR=gH->gfList; gfR; gfR=gfR->next) { gfR->qStart = gfR->qStart * qFactor + gfR->qFrame; gfR->qEnd = gfR->qEnd * qFactor + gfR->qFrame; gfR->tStart = gfR->tStart * tFactor + gfR->tFrame; gfR->tEnd = gfR->tEnd * tFactor + gfR->tFrame; } } +int gfConnectEx(char *host, char *port) +/* Try to connect to gfServer */ +{ +int conn = -1; +if (allGenomes) + conn = gfMayConnect(host, port); // returns -1 on failure +else + conn = gfConnect(host, port); // errAborts on failure. +return conn; +} + + void queryServerFinish(struct genomeHits *gH) /* Report results from gfServer. */ { char buf[256]; int matchCount = 0; +gH->sd = gfConnectEx(gH->host, gH->port); +if (gH->sd == -1) + { + gH->error = TRUE; + gH->networkErrMsg = "Connection to gfServer failed."; + return; + } + dyStringPrintf(gH->dbg,"query strand %s qsize %d<br>\n", gH->queryRC ? 
"-" : "+", gH->dnaSize); /* Put together query command. */ safef(buf, sizeof buf, "%s%s %d", gfSignature(), gH->type, gH->dnaSize); mustWriteFd(gH->sd, buf, strlen(buf)); if (read(gH->sd, buf, 1) < 0) errAbort("queryServerFinish: read failed: %s", strerror(errno)); if (buf[0] != 'Y') errAbort("Expecting 'Y' from server, got %c", buf[0]); -mustWriteFd(gH->sd, gH->dna, gH->dnaSize); +mustWriteFd(gH->sd, gH->dna, gH->dnaSize); // Cannot shifted earlier for speed. must wait for Y confirmation. if (gH->complex) { char *s = netRecieveString(gH->sd, buf); if (!s) errAbort("expected response from gfServer with tileSize"); dyStringPrintf(gH->dbg,"%s<br>\n", s); // from server: tileSize 4 } for (;;) { if (netGetString(gH->sd, buf) == NULL) break; if (sameString(buf, "end")) { @@ -1210,41 +1229,30 @@ } } gH->maxGeneHits /= 3; // average over 3 frames. char qStrand = (gH->queryRC ? '-' : '+'); safef(gH->maxGeneStrand, sizeof gH->maxGeneStrand, "%c%c", qStrand, gH->maxGeneTStrand); } close(gH->sd); } -int gfConnectEx(char *host, char *port) -/* Try to connect to gfServer */ -{ -int conn = -1; -if (allGenomes) - conn = gfMayConnect(host, port); // returns -1 on failure -else - conn = gfConnect(host, port); // errAborts on failure. -return conn; -} - int findMinMatch(long genomeSize, boolean isProt) // Return default minMatch for genomeSize, // the expected number of occurrences of string length k // in random genome of size N = N/(4^k) { int alphaBetSize; if (isProt) { alphaBetSize = 20; genomeSize = genomeSize / 3; } else { alphaBetSize = 4; } @@ -1576,72 +1584,83 @@ seq->name, oneSize, minSuggested); // we could use "continue;" here to actually enforce skipping, // but let's give the short sequence a chance, it might work. // minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein. 
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0 continue; } totalSize += oneSize; if (totalSize > maxTotalSize) { warn("Sequence %s would take us over the %d letter limit, stopping here.", seq->name, maxTotalSize); break; } - conn = gfConnectEx(serve->host, serve->port); - if (isTx) { gvo->reportTargetStrand = TRUE; if (isTxTx) { if (allGenomes) - queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, FALSE, seqNumber); + queryServer(serve->host, serve->port, db, seq, "transQuery", xType, TRUE, FALSE, FALSE, seqNumber); else + { + conn = gfConnectEx(serve->host, serve->port); gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5, tFileCache, gvo, !txTxBoth); + } if (txTxBoth) { reverseComplement(seq->dna, seq->size); - conn = gfConnectEx(serve->host, serve->port); if (allGenomes) - queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, TRUE, seqNumber); + queryServer(serve->host, serve->port, db, seq, "transQuery", xType, TRUE, FALSE, TRUE, seqNumber); else + { + conn = gfConnectEx(serve->host, serve->port); gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5, tFileCache, gvo, FALSE); } } + } else { if (allGenomes) - queryServer(conn, db, seq, "protQuery", xType, TRUE, TRUE, FALSE, seqNumber); + queryServer(serve->host, serve->port, db, seq, "protQuery", xType, TRUE, TRUE, FALSE, seqNumber); else + { + conn = gfConnectEx(serve->host, serve->port); gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo); } } + } else { if (allGenomes) - queryServer(conn, db, seq, "query", xType, FALSE, FALSE, FALSE, seqNumber); + queryServer(serve->host, serve->port, db, seq, "query", xType, FALSE, FALSE, FALSE, seqNumber); else + { + conn = gfConnectEx(serve->host, serve->port); gfAlignStrand(&conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo); + } reverseComplement(seq->dna, seq->size); - conn = gfConnectEx(serve->host, serve->port); if (allGenomes) - queryServer(conn, db, seq, "query", xType, FALSE, FALSE, TRUE, seqNumber); + 
queryServer(serve->host, serve->port, db, seq, "query", xType, FALSE, FALSE, TRUE, seqNumber); else + { + conn = gfConnectEx(serve->host, serve->port); gfAlignStrand(&conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo); } + } gfOutputQuery(gvo, f); ++seqNumber; } carefulClose(&f); if (!allGenomes) { showAliPlaces(pslTn.forCgi, faTn.forCgi, NULL, serve->db, qType, tType, organism, feelingLucky); } if(!feelingLucky && !allGenomes) cartWebEnd(); gfFileCacheFree(&tFileCache);