4023e99edfbcb1e936b98326d9f99f1650541bbd
kuhn
Thu Aug 26 18:48:15 2021 -0700
added ref to blat paper
diff --git src/hg/hgBlat/hgBlat.c src/hg/hgBlat/hgBlat.c
index 74510ef..67ee438 100644
--- src/hg/hgBlat/hgBlat.c
+++ src/hg/hgBlat/hgBlat.c
@@ -1,2376 +1,2379 @@
/* hgBlat - CGI-script to manage fast human genome sequence searching. */
/* Copyright (C) 2014 The Regents of the University of California
* See README in this or parent directory for licensing information. */
#include Go back to %s on the Genome Browser. BLAT on DNA is designed to\n"
"quickly find sequences of 95%% and greater similarity of length 25 bases or\n"
"more. It may miss more divergent or shorter sequence alignments. It will find\n"
"perfect sequence matches of 20 bases.\n"
"BLAT on proteins finds sequences of 80%% and greater similarity of length 20 amino\n"
"acids or more. In practice DNA BLAT works well on primates, and protein\n"
"BLAT on land vertebrates."
);
printf("%s",
"\n BLAT is not BLAST. DNA BLAT works by keeping an index of the entire genome\n"
"in memory. The index consists of all overlapping 11-mers stepping by 5 except for\n"
"those heavily involved in repeats. The index takes up about\n"
"2 gigabytes of RAM. RAM can be further reduced to less than 1 GB by increasing step size to 11.\n"
"The genome itself is not kept in memory, allowing\n"
"BLAT to deliver high performance on a reasonably priced Linux box.\n"
"The index is used to find areas of probable homology, which are then\n"
"loaded into memory for a detailed alignment. Protein BLAT works in a similar\n"
"manner, except with 4-mers rather than 11-mers. The protein index takes a little\n"
"more than 2 gigabytes. BLAT was written by Jim Kent.\n"
"Like most of Jim's software, interactive use on this web server is free to all.\n"
"Sources and executables to run batch jobs on your own server are available free\n"
"for academic, personal, and non-profit purposes. Non-exclusive commercial\n"
"licenses are also available. See the \n"
"Kent Informatics\n"
"website for details. For more information on the graphical version of BLAT, click the Help \n"
"button on the top menu bar");
-printf(" or see the Genome Browser FAQ.
\n");
return;
}
pslSortListByVar(&pslList, sort);
if(feelingLucky)
{
/* If we found something jump browser to there. */
if(slCount(pslList) > 0)
printLuckyRedirect(browserUrl, pslList, database, pslName, faName, uiState, unhideTrack);
/* Otherwise call ourselves again not feeling lucky to print empty
results. */
else
{
cartWebStart(cart, database, "%s (%s) BLAT Results", trackHubSkipHubName(organism), database);
showAliPlaces(pslName, faName, customText, database, qType, tType, organism, FALSE);
cartWebEnd();
}
}
else if (pslOut)
{
if (!pslRawOut)
printf("
Sorry, no matches found");
if (!allResults)
printf(" (with a score of at least %d)", minMatchShown);
printf("");
if (!sameString(output, "psl no header"))
pslxWriteHead(stdout, qType, tType);
for (psl = pslList; psl != NULL; psl = psl->next)
pslTabOut(psl, stdout);
if (pslRawOut)
exit(0);
printf("
");
printf("
");
}
else if (jsonOut)
{
webStartText();
pslWriteAllJson(pslList, stdout, database, TRUE);
exit(0);
}
else // hyperlink
{
printf("BLAT Search Results
");
char* posStr = cartOptionalString(cart, "position");
if (posStr != NULL)
printf("");
// find maximum query name size for padding calculations and
// find maximum target chrom name size for padding calculations
int maxQChromNameSize = 0;
int maxTChromNameSize = 0;
for (psl = pslList; psl != NULL; psl = psl->next)
{
int qLen = strlen(psl->qName);
maxQChromNameSize = max(maxQChromNameSize,qLen);
int tLen = strlen(psl->tName);
maxTChromNameSize = max(maxTChromNameSize,tLen);
}
maxQChromNameSize = max(maxQChromNameSize,5);
maxTChromNameSize = max(maxTChromNameSize,5);
printf(" ACTIONS QUERY ");
spaceOut(stdout, maxQChromNameSize - 5);
printf("SCORE START END QSIZE IDENTITY CHROM ");
spaceOut(stdout, maxTChromNameSize - 5);
printf(" STRAND START END SPAN\n");
printf("---------------------------------------------------------------------------------------------");
repeatCharOut(stdout, '-', maxQChromNameSize - 5);
repeatCharOut(stdout, '-', maxTChromNameSize - 5);
printf("\n");
for (psl = pslList; psl != NULL; psl = psl->next)
{
if (customText)
printf("",
browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database,
customText, uiState, unhideTrack);
else
printf("",
browserUrl, psl->tName, psl->tStart + 1, psl->tEnd, database,
pslName, faName, uiState, unhideTrack);
printf("browser ");
printf("",
hgcUrl, psl->tStart, pslName, cgiEncode(faName), psl->qName, psl->tName,
psl->tStart, psl->tEnd, database, uiState);
printf("details ");
printf("%s",psl->qName);
spaceOut(stdout, maxQChromNameSize - strlen(psl->qName));
printf(" %5d %5d %5d %5d %5.1f%% ",
pslScore(psl), psl->qStart+1, psl->qEnd, psl->qSize,
100.0 - pslCalcMilliBad(psl, TRUE) * 0.1);
printf("%s",psl->tName);
spaceOut(stdout, maxTChromNameSize - strlen(psl->tName));
printf(" %-2s %9d %9d %6d",
psl->strand, psl->tStart+1, psl->tEnd,
psl->tEnd - psl->tStart);
// if you modify this, also modify hgPcr.c:doQuery, which implements a similar feature
char *seq = psl->tName;
if (endsWith(seq, "_fix"))
printf(" What is chrom_fix?");
else if (endsWith(seq, "_alt"))
printf(" What is chrom_alt?");
else if (endsWith(seq, "_random"))
printf(" What is chrom_random?");
else if (startsWith(seq, "chrUn"))
printf(" What is a chrUn sequence?");
printf("\n");
}
printf("
\n");
webNewSection("Help");
puts("\n");
puts("
\n", gH->queryRC ? "-" : "+", gH->dnaSize);
/* Put together query command. */
if (gH->isDynamic)
safef(buf, sizeof buf, "%s%s %s %s %d", gfSignature(), gH->type,
conn->genome, conn->genomeDataDir, gH->dnaSize);
else
safef(buf, sizeof buf, "%s%s %d", gfSignature(), gH->type, gH->dnaSize);
gfBeginRequest(conn);
mustWriteFd(conn->fd, buf, strlen(buf));
if (read(conn->fd, buf, 1) < 0)
errAbort("queryServerFinish: read failed: %s", strerror(errno));
if (buf[0] != 'Y')
errAbort("Expecting 'Y' from server, got %c", buf[0]);
mustWriteFd(conn->fd, gH->dna, gH->dnaSize); // Cannot shifted earlier for speed. must wait for Y confirmation.
if (gH->complex)
{
char *s = netRecieveString(conn->fd, buf);
if (!s)
errAbort("expected response from gfServer with tileSize");
dyStringPrintf(gH->dbg,"%s
\n", s); // from server: tileSize 4
}
for (;;)
{
if (netGetString(conn->fd, buf) == NULL)
break;
if (sameString(buf, "end"))
{
dyStringPrintf(gH->dbg,"%d matches
\n", matchCount);
break;
}
else if (startsWith("Error:", buf))
{
errAbort("%s", buf);
break;
}
else
{
dyStringPrintf(gH->dbg,"%s
\n", buf);
// chop the line into words
int numHits = 0;
char *line = buf;
struct gfResult *gfR = NULL;
AllocVar(gfR);
gfR->qStrand = (gH->queryRC ? '-' : '+');
char *word[10];
int fields = chopLine(line,word);
int expectedFields = 6;
if (gH->complex)
{
if (gH->isProt)
expectedFields = 8;
else
expectedFields = 9;
}
if (fields != expectedFields)
errAbort("expected %d fields, got %d fields", expectedFields, fields);
gfR->qStart = sqlUnsigned(word[0]); // e.g. 3139
gfR->qEnd = sqlUnsigned(word[1]); // e.g. 3220
// e.g. hg38.2bit:chr1
char *colon = strchr(word[2], ':');
if (colon)
{
gfR->chrom = cloneString(colon+1);
}
else
{ // e.g. chr10.nib
char *dot = strchr(word[2], '.');
if (dot)
{
*dot = 0;
gfR->chrom = cloneString(word[2]);
}
else
errAbort("Expecting colon or period in the 3rd field of gfServer response");
}
gfR->tStart = sqlUnsigned(word[3]); // e.g. 173515
gfR->tEnd = sqlUnsigned(word[4]); // e.g. 173586
numHits = sqlUnsigned(word[5]); // e.g. 14
// Frustrated with weird little results with vastly exaggerated hit-score,
// I have flattened that out so it maxes out at #tiles that could fit
// It seems to still work just fine. I am going to leave it for now.
// One minor note, by suppressing extra scores for short exons,
// it will not prefer alignments with the same number of hits, but more exons.
if (!gH->complex) // dna xType
{
// maximum tiles that could fit in qSpan
int limit = ((gfR->qEnd - gfR->qStart) - 6)/5; // stepSize=5
++limit; // for a little extra.
if (numHits > limit)
numHits = limit;
}
gfR->numHits = numHits;
if (gH->complex)
{
gfR->tStrand = word[6][0]; // e.g. + or -
gfR->tFrame = sqlUnsigned(word[7]); // e.g. 0,1,2
if (!gH->isProt)
{
gfR->qFrame = sqlUnsigned(word[8]); // e.g. 0,1,2
}
}
else
{
gfR->tStrand = '+'; // dna search only on + target strand
}
if (gH->complex)
{
char *s = netGetLongString(conn->fd);
if (s == NULL)
break;
dyStringPrintf(gH->dbg,"%s
\n", s); //dumps out qstart1 tstart1 qstart2 tstart2 ...
freeMem(s);
}
slAddHead(&gH->gfList, gfR);
}
++matchCount;
}
slReverse(&gH->gfList);
unTranslateCoordinates(gH); // convert back to untranslated coordinates
slSort(&gH->gfList, gfResultsCmp); // sort by tStrand, chrom, tStart
struct qFrameResults
/* Information about hits on a genome assembly */
{
int maxGeneHits; /* Highest gene hit-count */
char *maxGeneChrom; /* Target Chrom for gene with max gene hits */
int maxGeneTStart; /* Target Start Coordinate for gene with max hits */
int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/
int maxGeneExons; /* Number of Exons in gene with max hits */
char maxGeneTStrand;/* + or - TStrand for gene with max hits */
};
int qFrame = 0;
int qFrameStart = 0;
int qFrameEnd = 0;
if (gH->complex && !gH->isProt) // rnax, dnax
{
qFrameEnd = 2;
}
struct qFrameResults r[3];
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
findBestGene(gH, qFrame); // find best gene-like thing with most hits
r[qFrame].maxGeneHits = gH->maxGeneHits;
r[qFrame].maxGeneChrom = cloneString(gH->maxGeneChrom);
r[qFrame].maxGeneTStart = gH->maxGeneTStart;
r[qFrame].maxGeneTEnd = gH->maxGeneTEnd;
r[qFrame].maxGeneExons = gH->maxGeneExons;
r[qFrame].maxGeneTStrand = gH->maxGeneTStrand;
}
// combine the results from all 3 qFrames
if (gH->complex && !gH->isProt) // rnax, dnax
{
int biggestHit = -1;
int bigQFrame = -1;
// find the qFrame with the biggest hits
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
if (r[qFrame].maxGeneHits > biggestHit)
{
bigQFrame = qFrame;
biggestHit = r[qFrame].maxGeneHits;
}
}
// save combined final answers back in gH
gH->maxGeneHits = 0;
gH->maxGeneTStrand = r[bigQFrame].maxGeneTStrand;
gH->maxGeneChrom = cloneString(r[bigQFrame].maxGeneChrom);
gH->maxGeneExons = r[bigQFrame].maxGeneExons;
gH->maxGeneTStart = r[bigQFrame].maxGeneTStart;
gH->maxGeneTEnd = r[bigQFrame].maxGeneTEnd;
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
// ignore frames that do not have same tStrand, chrom, and overlap with the best
if (!(
(r[qFrame].maxGeneTStrand == r[bigQFrame].maxGeneTStrand) &&
sameOk(r[qFrame].maxGeneChrom, r[bigQFrame].maxGeneChrom) &&
// overlap start,end between this and bigQFrame
(
(r[qFrame].maxGeneTEnd > r[bigQFrame].maxGeneTStart) &&
(r[bigQFrame].maxGeneTEnd > r[qFrame].maxGeneTStart)
)
))
{
continue; // incompatible with best result, skip it
}
// Add total hits
gH->maxGeneHits += r[qFrame].maxGeneHits; // should be pretty accurate
// Find biggest exon count
if (gH->maxGeneExons < r[qFrame].maxGeneExons) // often underestimates actual exon count.
{
gH->maxGeneExons = r[qFrame].maxGeneExons;
}
// Adjust tStart
if (gH->maxGeneTStart > r[qFrame].maxGeneTStart)
{
gH->maxGeneTStart = r[qFrame].maxGeneTStart;
}
// Adjust tEnd
if (gH->maxGeneTEnd < r[qFrame].maxGeneTEnd)
{
gH->maxGeneTEnd = r[qFrame].maxGeneTEnd;
}
}
gH->maxGeneHits /= 3; // average over 3 frames.
char qStrand = (gH->queryRC ? '-' : '+');
safef(gH->maxGeneStrand, sizeof gH->maxGeneStrand, "%c%c", qStrand, gH->maxGeneTStrand);
}
gfDisconnect(&conn);
}
int findMinMatch(long genomeSize, boolean isProt)
// Return default minMatch for genomeSize,
// the expected number of occurrences of string length k
// in random genome of size N = N/(4^k)
{
int alphaBetSize;
if (isProt)
{
alphaBetSize = 20;
genomeSize = genomeSize / 3;
}
else
{
alphaBetSize = 4;
}
int k = 1;
double expected = genomeSize;
for (k=1; k<36; k++)
{
expected /= alphaBetSize;
if (expected < .004)
break;
}
return k;
}
long findGenomeSize(char *database)
// get genomeSize from database.
{
struct sqlConnection *conn = hAllocConn(database);
char query[256];
sqlSafef(query, sizeof query, "select sum(size) from chromInfo");
long genomeSize = sqlQuickLongLong(conn, query);
hFreeConn(&conn);
if (genomeSize == 0)
{
warn("Genome Size not found for %s", database);
genomeSize = 3272116950; // substitute human genome size
}
return genomeSize;
}
long findGenomeSizeFromHub(char *database)
// fetch genome size by adding up chroms from hub 2bit.
{
struct trackHubGenome *genome = trackHubGetGenome(database);
genome->tbf = twoBitOpen(genome->twoBitPath);
long genomeSize = 0;
struct twoBitIndex *index;
for (index = genome->tbf->indexList; index != NULL; index = index->next)
{
genomeSize += twoBitSeqSize(genome->tbf, index->name);
}
twoBitClose(&genome->tbf);
return genomeSize;
}
int findGenomeParams(struct gfConnection *conn, struct serverTable *serve)
/* Send status message to server arnd report result.
* Get tileSize stepSize and minMatch.
*/
{
char buf[256];
int ret = 0;
/* Put together command. */
gfBeginRequest(conn);
if (serve->isDynamic)
sprintf(buf, "%s%s %s %s", gfSignature(), (serve->isTrans ? "transInfo" : "untransInfo"),
conn->genome, conn->genomeDataDir);
else
sprintf(buf, "%sstatus", gfSignature());
mustWriteFd(conn->fd, buf, strlen(buf));
for (;;)
{
if (netGetString(conn->fd, buf) == NULL)
{
long et = clock1000() - enteredMainTime;
if (serve->isDynamic)
{
if (et > NET_TIMEOUT_MS)
warn("the dynamic blat service is taking too long to respond, probably overloaded at this time, try again later. Error reading status information from %s:%s",
serve->host, serve->port);
else if (et < NET_QUICKEXIT_MS)
warn("the dynamic blat service is returning an error immediately. it is probably overloaded at this time, try again later. Error reading status information from %s:%s",
serve->host, serve->port);
else
warn("the dynamic blat service is returning an error at this time, try again later. Error reading status information from %s:%s",
serve->host, serve->port);
}
else
{
warn("Error reading status information from %s:%s; gfServer maybe down or misconfigured, see system logs for details)",
serve->host, serve->port);
}
ret = -1;
break;
}
if (sameString(buf, "end"))
break;
else
{
if (startsWith("tileSize ", buf))
{
serve->tileSize = atoi(buf+strlen("tileSize "));
}
if (startsWith("stepSize ", buf))
{
serve->stepSize = atoi(buf+strlen("stepSize "));
}
if (startsWith("minMatch ", buf))
{
serve->minMatch = atoi(buf+strlen("minMatch "));
}
}
}
gfEndRequest(conn);
return(ret);
}
void blatSeq(char *userSeq, char *organism, char *database, int dbCount)
/* Blat sequence user pasted in. */
{
FILE *f;
struct dnaSeq *seqList = NULL, *seq;
struct tempName pslTn, faTn;
int maxSingleSize, maxTotalSize, maxSeqCount;
char *genome, *db;
char *type = cgiString("type");
char *seqLetters = cloneString(userSeq);
struct serverTable *serve;
struct gfConnection *conn = NULL;
int oneSize, totalSize = 0, seqCount = 0;
boolean isTx = FALSE;
boolean isTxTx = FALSE;
boolean txTxBoth = FALSE;
struct gfOutput *gvo;
boolean qIsProt = FALSE;
enum gfType qType, tType;
struct hash *tFileCache = gfFileCacheNew();
// allGenomes ignores I'm Feeling Lucky for simplicity
boolean feelingLucky = cgiBoolean("Lucky") && !allGenomes;
char *xType = NULL;
if (allGenomes)
{
db = database;
genome = organism;
}
else
getDbAndGenome(cart, &db, &genome, oldVars);
char *output = cgiOptionalString("output");
boolean isJson= sameWordOk(output, "json");
boolean isPslRaw= sameWordOk(output, "pslRaw");
if (!feelingLucky && !allGenomes && !isJson && !isPslRaw)
cartWebStart(cart, db, "%s (%s) BLAT Results", trackHubSkipHubName(organism), db);
/* Load user sequence and figure out if it is DNA or protein. */
if (sameWord(type, "DNA"))
{
seqList = faSeqListFromMemText(seqLetters, TRUE);
uToT(seqList);
isTx = FALSE;
xType = "dna";
}
else if (sameWord(type, "translated RNA") || sameWord(type, "translated DNA"))
{
seqList = faSeqListFromMemText(seqLetters, TRUE);
uToT(seqList);
isTx = TRUE;
isTxTx = TRUE;
xType = "rnax";
txTxBoth = sameWord(type, "translated DNA");
if (txTxBoth)
xType = "dnax";
}
else if (sameWord(type, "protein"))
{
seqList = faSeqListFromMemText(seqLetters, FALSE);
isTx = TRUE;
qIsProt = TRUE;
xType = "prot";
}
else // BLAT's Guess
{
seqList = faSeqListFromMemTextRaw(seqLetters);
if (seqList)
{
isTx = !seqIsDna(seqList); // only tests first element, assumes the rest are the same type.
if (!isTx)
{
xType = "dna";
for (seq = seqList; seq != NULL; seq = seq->next)
{
seq->size = dnaFilteredSize(seq->dna);
dnaFilter(seq->dna, seq->dna);
toLowerN(seq->dna, seq->size);
subChar(seq->dna, 'u', 't');
}
}
else
{
for (seq = seqList; seq != NULL; seq = seq->next)
{
seq->size = aaFilteredSize(seq->dna);
aaFilter(seq->dna, seq->dna);
toUpperN(seq->dna, seq->size);
}
qIsProt = TRUE;
xType = "prot";
}
}
}
if (seqList != NULL && seqList->name[0] == 0)
{
freeMem(seqList->name);
seqList->name = cloneString("YourSeq");
}
trimUniq(seqList);
/* If feeling lucky only do the first one. */
if(feelingLucky && seqList != NULL)
{
seqList->next = NULL;
}
/* Figure out size allowed. */
maxSingleSize = (isTx ? 10000 : 75000);
maxTotalSize = maxSingleSize * 2.5;
#ifdef LOWELAB
maxSeqCount = 200;
#else
maxSeqCount = 25;
#endif
char *optionMaxSeqCount = cfgOptionDefault("hgBlat.maxSequenceCount", NULL);
if (isNotEmpty(optionMaxSeqCount))
maxSeqCount = sqlSigned(optionMaxSeqCount);
/* Create temporary file to store sequence. */
trashDirFile(&faTn, "hgSs", "hgSs", ".fa");
faWriteAll(faTn.forCgi, seqList);
/* Create a temporary .psl file with the alignments against genome. */
trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx");
f = mustOpen(pslTn.forCgi, "w");
gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE);
/* Strategy for calculating minMatchShown.
Calculate the minimum size for filtering based on genome size.
The expected numberof occurrences of q = n/(4^k)
= N * pow(4,-k); where N is genome size, and k is minimum size required.
For protein, use 22 instead of 4.
For human, a cutoff of 20, expected number < .003 or 0.3%
so the probability of getting it just by chance is low.
*/
int xlat;
serve = findServer(db, isTx);
/* Write header for extended (possibly protein) psl file. */
if (isTx)
{
if (isTxTx)
{
qType = gftDnaX;
tType = gftDnaX;
xlat = 3;
}
else
{
qType = gftProt;
tType = gftDnaX;
xlat = 1;
}
serve->tileSize = 4;
serve->stepSize = 4;
serve->minMatch = 3;
}
else
{
qType = gftDna;
tType = gftDna;
serve->tileSize = 11;
serve->stepSize = 5;
serve->minMatch = 2;
xlat = 1;
}
pslxWriteHead(f, qType, tType);
long genomeSize = 3272116950; // substitute human genome size
// Use with message about recommended length when using short query
// This is the minimum that can find anything at all.
int minSuggested = 0;
if (allGenomes)
{
minMatchShown = 0;
}
else
{
#ifdef LOWELAB
minMatchShown = 14;
#else
// read genome size
if (trackHubDatabase(database))
{
genomeSize = findGenomeSizeFromHub(database);
}
else
{
genomeSize = findGenomeSize(database);
}
minMatchShown = findMinMatch(genomeSize, qType == gftProt);
#endif
if (allResults)
minMatchShown = 0;
conn = gfConnect(serve->host, serve->port, trackHubDatabaseToGenome(serve->db), serve->genomeDataDir);
// read tileSize stepSize minMatch from server status
findGenomeParams(conn, serve);
int minLucky = (serve->minMatch * serve->stepSize + (serve->tileSize - serve->stepSize)) * xlat;
minSuggested = max(minMatchShown,minLucky);
}
int seqNumber = 0;
/* Loop through each sequence. */
for (seq = seqList; seq != NULL; seq = seq->next)
{
printf(" "); fflush(stdout); /* prevent apache cgi timeout by outputting something */
oneSize = realSeqSize(seq, !isTx);
// Impose half the usual bot delay per sequence
if (dbCount == 0 && issueBotWarning)
{
char *ip = getenv("REMOTE_ADDR");
botDelayMessage(ip, botDelayMillis);
}
if (++seqCount > maxSeqCount)
{
warn("More than %d input sequences, stopping at %s
(see also: cgi-bin/hg.conf hgBlat.maxSequenceCount setting).",
maxSeqCount, seq->name);
break;
}
if (oneSize > maxSingleSize)
{
warn("Sequence %s is %d letters long (max is %d), skipping",
seq->name, oneSize, maxSingleSize);
continue;
}
if (oneSize < minSuggested)
{
warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)",
seq->name, oneSize, minSuggested);
// we could use "continue;" here to actually enforce skipping,
// but let's give the short sequence a chance, it might work.
// minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein.
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0
continue;
}
totalSize += oneSize;
if (totalSize > maxTotalSize)
{
warn("Sequence %s would take us over the %d letter limit, stopping here.",
seq->name, maxTotalSize);
break;
}
if (isTx)
{
gvo->reportTargetStrand = TRUE;
if (isTxTx)
{
if (allGenomes)
queryServer(serve->host, serve->port, db, seq, "transQuery", xType, TRUE, FALSE, FALSE, seqNumber,
serve->genomeDataDir);
else
{
gfAlignTransTrans(conn, serve->nibDir, seq, FALSE, 5, tFileCache, gvo, !txTxBoth);
}
if (txTxBoth)
{
reverseComplement(seq->dna, seq->size);
if (allGenomes)
queryServer(serve->host, serve->port, db, seq, "transQuery", xType, TRUE, FALSE, TRUE, seqNumber,
serve->genomeDataDir);
else
{
gfAlignTransTrans(conn, serve->nibDir, seq, TRUE, 5, tFileCache, gvo, FALSE);
}
}
}
else
{
if (allGenomes)
queryServer(serve->host, serve->port, db, seq, "protQuery", xType, TRUE, TRUE, FALSE, seqNumber,
serve->genomeDataDir);
else
{
gfAlignTrans(conn, serve->nibDir, seq, 5, tFileCache, gvo);
}
}
}
else
{
if (allGenomes)
queryServer(serve->host, serve->port, db, seq, "query", xType, FALSE, FALSE, FALSE, seqNumber,
serve->genomeDataDir);
else
{
gfAlignStrand(conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo);
}
reverseComplement(seq->dna, seq->size);
if (allGenomes)
queryServer(serve->host, serve->port, db, seq, "query", xType, FALSE, FALSE, TRUE, seqNumber,
serve->genomeDataDir);
else
{
gfAlignStrand(conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo);
}
}
gfOutputQuery(gvo, f);
++seqNumber;
}
carefulClose(&f);
if (!allGenomes)
{
showAliPlaces(pslTn.forCgi, faTn.forCgi, NULL, serve->db, qType, tType,
organism, feelingLucky);
}
if(!feelingLucky && !allGenomes)
cartWebEnd();
gfFileCacheFree(&tFileCache);
}
void askForSeq(char *organism, char *db)
/* Put up a little form that asks for sequence.
* Call self.... */
{
/* ignore struct serverTable* return, but can error out if not found */
findServer(db, FALSE);
/* JavaScript to update form when org changes */
char *onChangeText = ""
"document.mainForm.changeInfo.value='orgChange';"
"document.mainForm.submit();";
char *userSeq = NULL;
char *type = NULL;
printf(
"\n");
webNewSection("About BLAT");
printf(
"
Kent WJ. BLAT - the BLAST-like alignment tool. \n" +"Genome Res. 2002 Apr;12(4):656-64. PMID: 11932250
"); } void fakeAskForSeqForm(char *organism, char *db) /* Put up a hidden form that asks for sequence. * Call self.... */ { char *userSeq = NULL; char *type = NULL; char *sort = NULL; char *output = NULL; printf("\n"); } void hideWeakerOfQueryRcPairs(struct genomeHits* gH1) /* hide the weaker of the pair of rc'd query results * so users sees only one strand with the best gene hit. * Input must be sorted already into the pairs. */ { struct genomeHits* gH2 = NULL; for (;gH1; gH1 = gH2->next) { gH2 = gH1->next; if (!gH2) errAbort("Hiding weaker of pairs found one without sibling."); if (!((gH1->seqNumber == gH2->seqNumber) && sameString(gH1->db, gH2->db) && (gH1->queryRC != gH2->queryRC))) errAbort("Error matching pairs, sibling does not match seqNumber and db."); // check if one or the other had an error if (gH1->error && gH2->error) gH2->hide = TRUE; // arbitrarily else if (gH1->error && !gH2->error) gH1->hide = TRUE; else if (!gH1->error && gH2->error) gH2->hide = TRUE; else // keep the best scoring of the pair, hide the other { if (gH2->maxGeneHits > gH1->maxGeneHits) gH1->hide = TRUE; else gH2->hide = TRUE; } } } void changeMaxGenePositionToPositiveStrandCoords(struct genomeHits *gH) /* convert negative strand coordinates to positive strand coordinates if TStrand=='-' */ { for (;gH; gH = gH->next) { if (gH->hide) continue; if (gH->error) continue; if (gH->maxGeneTStrand == '-') // convert to pos strand coordinates { int chromSize = 0; char temp[1024]; safef(temp, sizeof temp, "%s", ""); if (trackHubDatabase(gH->db)) // if not hub db, make sure it is the default assembly. { struct chromInfo *ci = trackHubMaybeChromInfo(gH->db, gH->maxGeneChrom); if (ci) chromSize = ci->size; else { warn("chromosome %s missing from %s .2bit (%s).", gH->maxGeneChrom, gH->db, gH->genome); safef(temp, sizeof temp, "chromosome %s missing from %s .2bit.", gH->maxGeneChrom, gH->db); } } else { struct sqlConnection *conn = hAllocConn(gH->db); char query[256]; sqlSafef(query, sizeof query, "select size from chromInfo where chrom='%s'", gH->maxGeneChrom); chromSize = sqlQuickNum(conn, query); hFreeConn(&conn); if (chromSize == 0) { warn("chromosome %s missing from %s.chromInfo (%s)", gH->maxGeneChrom, gH->db, gH->genome); safef(temp, sizeof temp, "chromosome %s missing from %s.chromInfo", gH->maxGeneChrom, gH->db); } } if (chromSize == 0) { gH->error = TRUE; gH->networkErrMsg = cloneString(temp); } else { gH->maxGeneChromSize = chromSize; int tempTStart = gH->maxGeneTStart; gH->maxGeneTStart = chromSize - gH->maxGeneTEnd; gH->maxGeneTEnd = chromSize - tempTStart; } } } } void printDebugging() /* print detailed gfServer response info */ { struct genomeHits *gH; printf("\n"); for (gH = pfdDone; gH; gH = gH->next) { if (gH->hide) // hide weaker of pairs for dna and dnax with reverse-complimented queries. continue; if (gH->error) printf("%s %s %s %s %d %s %s\n", gH->faName, gH->genome, gH->db, gH->networkErrMsg, gH->queryRC, gH->type, gH->xType); else { int tStart = gH->maxGeneTStart; int tEnd = gH->maxGeneTEnd; // convert back to neg strand coordinates for debug display if (gH->complex && (gH->maxGeneTStrand == '-')) { int tempTStart = tStart; tStart = gH->maxGeneChromSize - tEnd; tEnd = gH->maxGeneChromSize - tempTStart; } printf("%s %s %s %s %d %d %s %s %d %d\n", gH->faName, gH->genome, gH->db, gH->maxGeneChrom, gH->maxGeneHits, gH->queryRC, gH->type, gH->xType, tStart, tEnd); printf("%s", gH->dbg->string); struct gfResult *gfR = NULL; for(gfR=gH->gfList; gfR; gfR=gfR->next) { printf("(%3d)\t%4d\t%4d\t%s\t(%3d)\t%9d\t%9d\t%3d\t", (gfR->qEnd-gfR->qStart), gfR->qStart, gfR->qEnd, gfR->chrom, (gfR->tEnd-gfR->tStart), gfR->tStart, gfR->tEnd, gfR->numHits); if (gH->complex) { printf("%c\t%d\t", gfR->tStrand, gfR->tFrame); if (!gH->isProt) { printf("%d\t", gfR->qFrame); } } printf("\n"); } } printf("\n"); } printf("\n"); } void doMiddle(struct cart *theCart) /* Write header and body of html page. */ { char *userSeq; char *db, *organism; boolean clearUserSeq = cgiBoolean("Clear"); allGenomes = cgiVarExists("allGenomes"); cart = theCart; dnaUtilOpen(); orgChange = sameOk(cgiOptionalString("changeInfo"),"orgChange"); if (orgChange) { cgiVarSet("db", hDefaultDbForGenome(cgiOptionalString("org"))); } getDbAndGenome(cart, &db, &organism, oldVars); char *oldDb = cloneString(db); // n.b. this changes to default db if db doesn't have BLAT findClosestServer(&db, &organism); allResults = cartUsualBoolean(cart, "allResults", allResults); /* Get sequence - from userSeq variable, or if * that is empty from a file. */ if (clearUserSeq) { cartSetString(cart, "userSeq", ""); cartSetString(cart, "seqFile", ""); } userSeq = cartUsualString(cart, "userSeq", ""); if (isEmpty(userSeq)) { userSeq = cartOptionalString(cart, "seqFile"); } if (isEmpty(userSeq) || orgChange) { cartWebStart(theCart, db, "%s BLAT Search", trackHubSkipHubName(organism)); if (differentString(oldDb, db)) printf("
Note: BLAT search is not available for %s %s; " "defaulting to %s %s
Name | " "Genome | " "Assembly | " ); printf( "Tiles | " "Chrom | " ); if (debuggingGfResults) { printf( "Pos | " "Strand | " "Exons | " "Query RC'd | " "Type | " ); } printf("\n"); printf("|||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
%s | %s | %s | %s | ", gH->faName, trackHubSkipHubName(gH->genome), trackHubSkipHubName(gH->db), gH->networkErrMsg); if (debuggingGfResults) printf(" | %d | %s | ", gH->queryRC, gH->type); printf("\n"); } else { char pos[256]; safef(pos, sizeof pos, "%s:%d-%d", gH->maxGeneChrom, gH->maxGeneTStart+1, gH->maxGeneTEnd); // 1-based closed coord if (!gH->maxGeneChrom) // null pos[0] = 0; // empty string safef(id, sizeof id, "res%d", idCount); printf(" | %s | %s | " "%s | " , gH->faName, trackHubSkipHubName(gH->genome), id, trackHubSkipHubName(gH->db)); printf("%d | %s | ", gH->maxGeneHits, gH->maxGeneChrom ? gH->maxGeneChrom : ""); if (debuggingGfResults) { printf("%s | %s | ", pos, gH->maxGeneStrand); printf( "%d | %d | %s | ", gH->maxGeneExons, gH->queryRC, gH->xType); } printf("\n"); jsOnEventByIdF("click", id, "document.mainForm.org.value=\"%s\";" // some have single-quotes in their value. "document.mainForm.db.value='%s';" "document.mainForm.submit();" "return false;" // cancel the default link url , gH->genome, gH->db ); idCount++; } printf("