03ad0cc3681e3a08fa2988990bdea9262faddc24
galt
Mon May 18 00:31:03 2020 -0700
Added All Results checkbox to hgBlat. Changed it to read tileSize etc from gfserver status so it can use whatever it finds. Correctly shows lengths and required lengths. Instead of limit 20 for dna it now detects min psl->match from genome size. Error messages and warnings should now be complete and correct about suggested and required minimum query lengths. In particular this is motivated by allowing extremely sensitive and short protein and dna queries on Sars-Cov2 (wuhCor1) as well as other microorganisms. refs #25477.
diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c
index 58331fb..d3ec464 100644
--- src/hg/hgBlat/hgBlat.c
+++ src/hg/hgBlat/hgBlat.c
@@ -22,36 +22,36 @@
#include "blatServers.h"
#include "web.h"
#include "hash.h"
#include "botDelay.h"
#include "trashDir.h"
#include "trackHub.h"
#include "hgConfig.h"
#include "errCatch.h"
#include "portable.h"
#include "portable.h"
#include "dystring.h"
#include "chromInfo.h"
#include "net.h"
#include "fuzzyFind.h"
-
struct cart *cart; /* The user's ui state. */
struct hash *oldVars = NULL;
boolean orgChange = FALSE;
boolean dbChange = FALSE;
boolean allGenomes = FALSE;
+boolean allResults = FALSE;
struct gfResult
/* Detailed gfServer results, this is a span of several nearby tiles, minimum 2 for dna. */
{
struct gfResult *next;
/* have to multiply translated coordinates by 3 */
int qStart; /* Query Start Coordinate */
int qEnd; /* Query End Coordinate */
char *chrom; /* Target Chrom Name */
int tStart; /* Target Start Coordinate */
int tEnd; /* Target End Coordinate */
int numHits; /* number of tile hits, minimum 2 for dna */
char tStrand; /* + or - Target Strand used with prot, rnax, dnax */
int tFrame; /* Target Frame 0,1,2 (mostly ignorable?) used with prot, rnax, dnax */
@@ -280,40 +280,39 @@
pthread_mutex_unlock( &pfdMutex );
return errCount;
}
// ==================
struct serverTable
/* Information on a server. */
{
char *db; /* Database name. */
char *genome; /* Genome name. */
boolean isTrans; /* Is tranlated to protein? */
char *host; /* Name of machine hosting server. */
char *port; /* Port that hosts server. */
char *nibDir; /* Directory of sequence files. */
+ int tileSize; /* gfServer -tileSize */
+ int stepSize; /* gfServer -stepSize */
+ int minMatch; /* gfServer -minMatch */
};
char *typeList[] = {"BLAT's guess", "DNA", "protein", "translated RNA", "translated DNA"};
char *outputList[] = {"hyperlink", "psl", "psl no header"};
-#ifdef LOWELAB
-int minMatchShown = 14;
-#else
-int minMatchShown = 20;
-#endif
+int minMatchShown = 0;
static struct serverTable *trackHubServerTable(char *db, boolean isTrans)
/* Find out if database is a track hub with a blat server */
{
char *host, *port;
if (!trackHubGetBlatParams(db, isTrans, &host, &port))
return NULL;
struct serverTable *st;
AllocVar(st);
st->db = cloneString(db);
st->genome = cloneString(hGenome(db));
@@ -345,33 +344,33 @@
struct sqlConnection *conn = hConnectCentral();
char query[256];
struct sqlResult *sr;
char **row;
char dbActualName[32];
/* If necessary convert database description to name. */
sqlSafef(query, sizeof(query), "select name from dbDb where name = '%s'", db);
if (!sqlExists(conn, query))
{
sqlSafef(query, sizeof(query), "select name from dbDb where description = '%s'", db);
if (sqlQuickQuery(conn, query, dbActualName, sizeof(dbActualName)) != NULL)
db = dbActualName;
}
-/* Do a little join to get data to fit into the serverTable. */
-sqlSafef(query, sizeof(query), "select dbDb.name,dbDb.description,blatServers.isTrans"
- ",blatServers.host,blatServers.port,dbDb.nibPath "
+/* Do a little join to get data to fit into the serverTable and grab dbDb.nibPath too. */
+sqlSafef(query, sizeof(query), "select dbDb.name,dbDb.description,blatServers.isTrans,"
+ "blatServers.host,blatServers.port,dbDb.nibPath "
"from dbDb,blatServers where blatServers.isTrans = %d and "
"dbDb.name = '%s' and dbDb.name = blatServers.db",
isTrans, db);
sr = sqlGetResult(conn, query);
if ((row = sqlNextRow(sr)) == NULL)
{
errAbort("Can't find a server for %s database %s. Click "
"here "
"to reset to default database.",
(isTrans ? "translated" : "DNA"), db,
cartSidUrlString(cart), hDefaultDb());
}
st.db = cloneString(row[0]);
st.genome = cloneString(row[1]);
st.isTrans = atoi(row[2]);
@@ -493,54 +492,52 @@
enum gfType qType, enum gfType tType,
char *organism, boolean feelingLucky)
/* Show all the places that align. */
{
boolean useBigPsl = cfgOptionBooleanDefault("useBlatBigPsl", TRUE);
struct lineFile *lf = pslFileOpen(pslName);
struct psl *pslList = NULL, *psl;
char *browserUrl = hgTracksName();
char *hgcUrl = hgcName();
char uiState[64];
char *vis;
char unhideTrack[64];
char *sort = cartUsualString(cart, "sort", pslSortList[0]);
char *output = cartUsualString(cart, "output", outputList[0]);
boolean pslOut = startsWith("psl", output);
-boolean isStraightNuc = (qType == gftRna || qType == gftDna);
-int minThreshold = (isStraightNuc ? minMatchShown : 0);
sprintf(uiState, "%s=%s", cartSessionVarName(), cartSessionId(cart));
/* If user has hidden BLAT track, add a setting that will unhide the
track if user clicks on a browser link. */
vis = cartOptionalString(cart, "hgUserPsl");
if (vis != NULL && sameString(vis, "hide"))
snprintf(unhideTrack, sizeof(unhideTrack), "&hgUserPsl=dense");
else
unhideTrack[0] = 0;
while ((psl = pslNext(lf)) != NULL)
{
- if (psl->match >= minThreshold)
+ if (psl->match >= minMatchShown)
slAddHead(&pslList, psl);
}
lineFileClose(&lf);
if (pslList == NULL)
{
printf("
Sorry, no matches found");
- if (isStraightNuc)
- printf(" (with score at least %d)", minThreshold);
+ if (!allResults)
+ printf(" (with score at least %d)", minMatchShown);
printf("
| |
\n");
return;
}
pslSortListByVar(&pslList, sort);
if(feelingLucky)
{
/* If we found something jump browser to there. */
if(slCount(pslList) > 0)
printLuckyRedirect(browserUrl, pslList, database, pslName, faName, uiState, unhideTrack);
/* Otherwise call ourselves again not feeling lucky to print empty
results. */
else
{
@@ -890,55 +887,46 @@
char bestTStrand = ' ';
char bestQStrand = ' ';
char *thisChrom = NULL;
int thisHits = 0;
int thisTStart = 0;
int thisTEnd = 0;
int thisExons = 0;
char thisTStrand = ' ';
char thisQStrand = ' ';
int geneGap = ffIntronMaxDefault; // about a million bases is a cutoff for genes.
int qSlop = 5; // forgive 5 bases, about dna stepSize = 5.
if (gH->complex)
- qSlop = 4; // reduce for translated with tileSize/stepSize = 4.
+ qSlop = 4; // reduce for translated with stepSize = 4.
struct gfResult *gfR = NULL, *gfLast = NULL;
for(gfR=gH->gfList; gfR; gfR=gfR->next)
{
// filter on given queryFrame
if (gfR->qFrame != queryFrame)
continue;
if (gfLast && (gfR->tStrand == gfLast->tStrand) && sameString(gfR->chrom,gfLast->chrom) &&
(gfR->qStart >= (gfLast->qEnd - qSlop)) && (gfR->tStart >= gfLast->tEnd) && ((gfR->tStart - gfLast->tEnd) < geneGap))
{
thisHits += gfR->numHits;
thisTEnd = gfR->tEnd;
thisExons += 1;
-
- //if (gH->complex)
- //{
- // gfR->tStrand, gfR->tFrame
- //if (gH->complex)
- //{
- //gfR->qFrame
- //}
- //}
}
else
{
thisHits = gfR->numHits;
thisChrom = gfR->chrom;
thisTStart = gfR->tStart;
thisTEnd = gfR->tEnd;
thisExons = 1;
thisTStrand = gfR->tStrand;
thisQStrand = gfR->qStrand;
}
if (thisHits > bestHits)
{
bestHits = thisHits;
bestExons = thisExons;
@@ -1072,31 +1060,31 @@
}
gfR->tStart = sqlUnsigned(word[3]); // e.g. 173515
gfR->tEnd = sqlUnsigned(word[4]); // e.g. 173586
numHits = sqlUnsigned(word[5]); // e.g. 14
// Frustrated with weird little results with vastly exaggerated hit-score,
// I have flattened that out so it maxes out at #tiles that could fit
// It seems to still work just fine. I am going to leave it for now.
// One minor note, by suppressing extra scores for short exons,
// it will not prefer alignments with the same number of hits, but more exons.
if (!gH->complex) // dna xType
{
// maximum tiles that could fit in qSpan
- int limit = ((gfR->qEnd - gfR->qStart) - 6)/5;
+ int limit = ((gfR->qEnd - gfR->qStart) - 6)/5; // stepSize=5
++limit; // for a little extra.
if (numHits > limit)
numHits = limit;
}
gfR->numHits = numHits;
if (gH->complex)
{
gfR->tStrand = word[6][0]; // e.g. + or -
gfR->tFrame = sqlUnsigned(word[7]); // e.g. 0,1,2
if (!gH->isProt)
{
gfR->qFrame = sqlUnsigned(word[8]); // e.g. 0,1,2
}
@@ -1233,38 +1221,144 @@
close(gH->sd);
}
int gfConnectEx(char *host, char *port)
/* Try to connect to gfServer */
{
int conn = -1;
if (allGenomes)
conn = gfMayConnect(host, port); // returns -1 on failure
else
conn = gfConnect(host, port); // errAborts on failure.
return conn;
}
+int findMinMatch(long genomeSize, boolean isProt)
+// Return default minMatch for genomeSize,
+// the expected number of occurrences of string length k
+// in random genome of size N = N/(4^k)
+{
+int alphaBetSize;
+if (isProt)
+ {
+ alphaBetSize = 20;
+ genomeSize = genomeSize / 3;
+ }
+else
+ {
+ alphaBetSize = 4;
+ }
+int k = 1;
+double expected = genomeSize;
+for (k=1; k<36; k++)
+ {
+ expected /= alphaBetSize;
+ if (expected < .004)
+ break;
+ }
+return k;
+}
+
+long findGenomeSize(char *database)
+// get genomeSize from database.
+{
+struct sqlConnection *conn = hAllocConn(database);
+char query[256];
+sqlSafef(query, sizeof query, "select sum(size) from chromInfo");
+long genomeSize = sqlQuickLongLong(conn, query);
+hFreeConn(&conn);
+if (genomeSize == 0)
+ {
+ warn("Genome Size not found for %s", database);
+ genomeSize = 3272116950; // substitute human genome size
+ }
+return genomeSize;
+}
+
+long findGenomeSizeFromHub(char *database)
+// fetch genome size by adding up chroms from hub 2bit.
+{
+struct trackHubGenome *genome = trackHubGetGenome(database);
+
+genome->tbf = twoBitOpen(genome->twoBitPath);
+
+long genomeSize = 0;
+struct twoBitIndex *index;
+for (index = genome->tbf->indexList; index != NULL; index = index->next)
+ {
+ genomeSize += twoBitSeqSize(genome->tbf, index->name);
+ }
+
+twoBitClose(&genome->tbf);
+
+return genomeSize;
+}
+
+
+int findGenomeParams(struct serverTable *serve)
+/* Send status message to server arnd report result.
+ * Get tileSize stepSize and minMatch.
+ */
+
+{
+char buf[256];
+int sd = 0;
+int ret = 0;
+
+/* Put together command. */
+sd = gfConnectEx(serve->host, serve->port);
+sprintf(buf, "%sstatus", gfSignature());
+mustWriteFd(sd, buf, strlen(buf));
+
+for (;;)
+ {
+ if (netGetString(sd, buf) == NULL)
+ {
+ warn("Error reading status information from %s:%s",serve->host, serve->port);
+ ret = -1;
+ break;
+ }
+ if (sameString(buf, "end"))
+ break;
+ else
+ {
+ if (startsWith("tileSize ", buf))
+ {
+ serve->tileSize = atoi(buf+strlen("tileSize "));
+ }
+ if (startsWith("stepSize ", buf))
+ {
+ serve->stepSize = atoi(buf+strlen("stepSize "));
+ }
+ if (startsWith("minMatch ", buf))
+ {
+ serve->minMatch = atoi(buf+strlen("minMatch "));
+ }
+ }
+ }
+close(sd);
+return(ret);
+}
+
void blatSeq(char *userSeq, char *organism, char *database, int dbCount)
/* Blat sequence user pasted in. */
{
FILE *f;
struct dnaSeq *seqList = NULL, *seq;
struct tempName pslTn, faTn;
int maxSingleSize, maxTotalSize, maxSeqCount;
-int minSingleSize = minMatchShown;
char *genome, *db;
char *type = cgiString("type");
char *seqLetters = cloneString(userSeq);
struct serverTable *serve;
int conn;
int oneSize, totalSize = 0, seqCount = 0;
boolean isTx = FALSE;
boolean isTxTx = FALSE;
boolean txTxBoth = FALSE;
struct gfOutput *gvo;
boolean qIsProt = FALSE;
enum gfType qType, tType;
struct hash *tFileCache = gfFileCacheNew();
// allGenomes ignores I'm Feeling Lucky for simplicity
boolean feelingLucky = cgiBoolean("Lucky") && !allGenomes;
@@ -1357,122 +1451,171 @@
maxSeqCount = 25;
#endif
char *optionMaxSeqCount = cfgOptionDefault("hgBlat.maxSequenceCount", NULL);
if (isNotEmpty(optionMaxSeqCount))
maxSeqCount = sqlSigned(optionMaxSeqCount);
/* Create temporary file to store sequence. */
trashDirFile(&faTn, "hgSs", "hgSs", ".fa");
faWriteAll(faTn.forCgi, seqList);
/* Create a temporary .psl file with the alignments against genome. */
trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx");
f = mustOpen(pslTn.forCgi, "w");
gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE);
+
+
+/* Strategy for calculating minMatchShown.
+ Calculate the minimum size for filtering based on genome size.
+ The expected numberof occurrences of q = n/(4^k)
+ = N * pow(4,-k); where N is genome size, and k is minimum size required.
+ For protein, use 22 instead of 4.
+ For human, a cutoff of 20, expected number < .003 or 0.3%
+ so the probability of getting it just by chance is low.
+*/
+
+int xlat;
+
serve = findServer(db, isTx);
/* Write header for extended (possibly protein) psl file. */
if (isTx)
{
if (isTxTx)
{
qType = gftDnaX;
tType = gftDnaX;
+ xlat = 3;
}
else
{
qType = gftProt;
tType = gftDnaX;
+ xlat = 1;
}
+ serve->tileSize = 4;
+ serve->stepSize = 4;
+ serve->minMatch = 3;
}
else
{
qType = gftDna;
tType = gftDna;
+ serve->tileSize = 11;
+ serve->stepSize = 5;
+ serve->minMatch = 2;
+ xlat = 1;
}
pslxWriteHead(f, qType, tType);
-if (qType == gftProt)
+
+long genomeSize = 3272116950; // substitute human genome size
+
+// Use with message about recommended length when using short query
+// This is the minimum that can find anything at all.
+int minSuggested = 0;
+
+if (allGenomes)
+ {
+ minMatchShown = 0;
+ }
+else
+ {
+ #ifdef LOWELAB
+ minMatchShown = 14;
+ #else
+ // read genome size
+ if (trackHubDatabase(database))
{
- minSingleSize = 14;
+ genomeSize = findGenomeSizeFromHub(database);
}
-else if (qType == gftDnaX)
+ else
{
- minSingleSize = 36;
+ genomeSize = findGenomeSize(database);
+ }
+ minMatchShown = findMinMatch(genomeSize, qType == gftProt);
+ #endif
+ if (allResults)
+ minMatchShown = 0;
+
+ // read tileZize stepSize minMatch from server status
+ findGenomeParams(serve);
+
+ int minLucky = (serve->minMatch * serve->stepSize + (serve->tileSize - serve->stepSize)) * xlat;
+
+ minSuggested = max(minMatchShown,minLucky);
}
int seqNumber = 0;
/* Loop through each sequence. */
for (seq = seqList; seq != NULL; seq = seq->next)
{
printf(" "); fflush(stdout); /* prevent apache cgi timeout by outputting something */
oneSize = realSeqSize(seq, !isTx);
// Impose half the usual bot delay per sequence
if (dbCount == 0)
hgBotDelayFrac(0.5);
if (++seqCount > maxSeqCount)
{
warn("More than %d input sequences, stopping at %s
(see also: cgi-bin/hg.conf hgBlat.maxSequenceCount setting).",
maxSeqCount, seq->name);
break;
}
if (oneSize > maxSingleSize)
{
warn("Sequence %s is %d letters long (max is %d), skipping",
seq->name, oneSize, maxSingleSize);
continue;
}
- if (oneSize < minSingleSize)
+ if (oneSize < minSuggested)
{
warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)",
- seq->name, oneSize, minSingleSize);
+ seq->name, oneSize, minSuggested);
// we could use "continue;" here to actually enforce skipping,
// but let's give the short sequence a chance, it might work.
// minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein.
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0
continue;
}
totalSize += oneSize;
if (totalSize > maxTotalSize)
{
warn("Sequence %s would take us over the %d letter limit, stopping here.",
seq->name, maxTotalSize);
break;
}
conn = gfConnectEx(serve->host, serve->port);
if (isTx)
{
gvo->reportTargetStrand = TRUE;
if (isTxTx)
{
if (allGenomes)
queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, FALSE, seqNumber);
else
- gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5,
- tFileCache, gvo, !txTxBoth);
+ gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5, tFileCache, gvo, !txTxBoth);
if (txTxBoth)
{
reverseComplement(seq->dna, seq->size);
conn = gfConnectEx(serve->host, serve->port);
if (allGenomes)
queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, TRUE, seqNumber);
else
- gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5,
- tFileCache, gvo, FALSE);
+ gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5, tFileCache, gvo, FALSE);
}
}
else
{
if (allGenomes)
queryServer(conn, db, seq, "protQuery", xType, TRUE, TRUE, FALSE, seqNumber);
else
gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo);
}
}
else
{
if (allGenomes)
queryServer(conn, db, seq, "query", xType, FALSE, FALSE, FALSE, seqNumber);
else
@@ -1510,84 +1653,101 @@
/* JavaScript to update form when org changes */
char *onChangeText = ""
"document.mainForm.changeInfo.value='orgChange';"
"document.mainForm.submit();";
char *userSeq = NULL;
char *type = NULL;
printf(
"