\n");
}
pslFreeList(&pslList);
}
void trimUniq(bioSeq *seqList)
/* Check that all seq's in list have a unique name. Try and
* abbreviate longer sequence names. */
{
struct hash *hash = newHash(0);
bioSeq *seq;
for (seq = seqList; seq != NULL; seq = seq->next)
{
char *saferString = needMem(strlen(seq->name)+1);
char *c, *s;
/* Some chars are safe to allow through, other chars cause
* problems. It isn't necessarily a URL safe string that is
* being calculated here. The original problem was a user had
* the fasta header line of:
* chr8|59823648:59825047|+
* The plus sign was being taken as the query name and this
* created problems as that name was passed on to hgc via
* the ss cart variable. The + sign became part of a URL
* eventually. This loop allows only isalnum and =_/.:;_|
* to get through as part of the header name. These characters
* all proved to be safe as single character names, or all
* together.
*/
s = saferString;
for (c = seq->name; *c != '\0'; ++c)
{
if (c && (*c != '\0'))
{
if ( isalnum(*c) || (*c == '=') || (*c == '-') || (*c == '/') ||
(*c == '.') || (*c == ':') || (*c == ';') || (*c == '_') ||
(*c == '|') )
*s++ = *c;
}
}
*s = '\0';
freeMem(seq->name);
if (*saferString == '\0')
{
freeMem(saferString);
saferString = cloneString("YourSeq");
}
seq->name = saferString;
if (strlen(seq->name) > 14) /* Try and get rid of long NCBI .fa cruft. */
{
char *nameClone = NULL;
char *abbrv = NULL;
char *words[32];
int wordCount;
boolean isEns = (stringIn("ENSEMBL:", seq->name) != NULL);
nameClone = cloneString(seq->name);
wordCount = chopString(nameClone, "|", words, ArraySize(words));
if (wordCount > 1) /* Looks like it's an Ensembl/NCBI
* long name alright. */
{
if (isEns)
{
abbrv = words[0];
if (abbrv[0] == 0) abbrv = words[1];
}
else if (sameString(words[1], "dbSNP"))
{
if (wordCount > 2)
abbrv = words[2];
else
abbrv = nameClone;
}
else
{
abbrv = words[wordCount-1];
if (abbrv[0] == 0) abbrv = words[wordCount-2];
}
if (hashLookup(hash, abbrv) == NULL)
{
freeMem(seq->name);
seq->name = cloneString(abbrv);
}
freez(&nameClone);
}
}
hashAddUnique(hash, seq->name, hash);
}
freeHash(&hash);
}
int realSeqSize(bioSeq *seq, boolean isDna)
/* Return size of sequence without N's or (for proteins)
* X's. */
{
char unknown = (isDna ? 'n' : 'X');
int i, size = seq->size, count = 0;
char *s = seq->dna;
for (i=0; inext)
subChar(seq->dna, 'u', 't');
}
static struct slName *namesInPsl(struct psl *psl)
/* Find all the unique names in a list of psls. */
{
struct hash *hash = newHash(3);
struct slName *nameList = NULL;
struct slName *name;
for(; psl; psl = psl->next)
{
if (hashLookup(hash, psl->qName) == NULL)
{
name = slNameNew(psl->qName);
slAddHead(&nameList, name);
hashStore(hash, psl->qName);
}
}
slReverse(&nameList);
return nameList;
}
static char *makeNameUnique(char *name, char *database, struct cart *cart)
/* Make sure track name will create a unique custom track. */
{
struct slName *browserLines = NULL;
struct customTrack *ctList = customTracksParseCart(database, cart, &browserLines, NULL);
struct customTrack *ct;
int count = 0;
char buffer[4096];
safef(buffer, sizeof buffer, "%s", name);
for(;;count++)
{
char *customName = customTrackTableFromLabel(buffer);
for (ct=ctList;
ct != NULL;
ct=ct->next)
{
if (startsWith(customName, ct->tdb->track))
// Found a track with this name.
break;
}
if (ct == NULL)
break;
safef(buffer, sizeof buffer, "%s (%d)", name, count + 1);
}
return cloneString(buffer);
}
static void getCustomName(char *database, struct cart *cart, struct psl *psl, char **pName, char **pDescription)
// Find a track name that isn't currently a custom track. Also fill in description.
{
struct slName *names = namesInPsl(psl);
char shortName[4096];
char description[4096];
unsigned count = slCount(names);
if (count == 1)
{
safef(shortName, sizeof shortName, "blat %s", names->name);
safef(description, sizeof description, "blat on %s", names->name);
}
else if (count == 2)
{
safef(shortName, sizeof shortName, "blat %s+%d", names->name, count - 1);
safef(description, sizeof description, "blat on %d queries (%s, %s)", count, names->name, names->next->name);
}
else
{
safef(shortName, sizeof shortName, "blat %s+%d", names->name, count - 1);
safef(description, sizeof description, "blat on %d queries (%s, %s, ...)", count, names->name, names->next->name);
}
*pName = makeNameUnique(shortName, database, cart);
*pDescription = cloneString(description);
}
void queryServer(int conn, char *db, struct dnaSeq *seq, char *type, char *xType,
boolean complex, boolean isProt, boolean queryRC, int seqNumber)
/* Send simple query to server and report results.
* queryRC is true when the query has been reverse-complemented */
{
struct genomeHits *gH;
AllocVar(gH);
gH->db = cloneString(db);
gH->genome = cloneString(hGenome(db));
gH->seqNumber = seqNumber;
gH->faName = cloneString(seq->name);
gH->dna = cloneString(seq->dna);
gH->dnaSize = seq->size;
gH->type = cloneString(type);
gH->xType = cloneString(xType);
gH->queryRC = queryRC;
gH->complex = complex;
gH->isProt = isProt;
gH->sd = conn;
if (gH->sd == -1)
{
gH->error = TRUE;
gH->networkErrMsg = "Connection to gfServer failed.";
}
gH->dbg = dyStringNew(256);
slAddHead(&pfdList, gH);
}
void findBestGene(struct genomeHits *gH, int queryFrame)
/* Find best gene-like object with multiple linked-features.
* Remember chrom start end of best gene found and total hits in the gene.
* Should sort the gfResults by tStrand, chrom, tStart.
* Filters on queryFrame */
{
char *bestChrom = NULL;
int bestHits = 0;
int bestTStart = 0;
int bestTEnd = 0;
int bestExons = 0;
char bestTStrand = ' ';
char bestQStrand = ' ';
char *thisChrom = NULL;
int thisHits = 0;
int thisTStart = 0;
int thisTEnd = 0;
int thisExons = 0;
char thisTStrand = ' ';
char thisQStrand = ' ';
int geneGap = 1000000; // about a million bases is a cutoff for genes.
// TODO should this really be 750K?
int qSlop = 5; // forgive 5 bases, about dna stepSize = 5.
if (gH->complex)
qSlop = 4; // reduce for translated with tileSize/stepSize = 4.
struct gfResult *gfR = NULL, *gfLast = NULL;
for(gfR=gH->gfList; gfR; gfR=gfR->next)
{
// filter on given queryFrame
if (gfR->qFrame != queryFrame)
continue;
if (gfLast && (gfR->tStrand == gfLast->tStrand) && sameString(gfR->chrom,gfLast->chrom) &&
(gfR->qStart >= (gfLast->qEnd - qSlop)) && (gfR->tStart >= gfLast->tEnd) && ((gfR->tStart - gfLast->tEnd) < geneGap))
{
thisHits += gfR->numHits;
thisTEnd = gfR->tEnd;
thisExons += 1;
//if (gH->complex)
//{
// gfR->tStrand, gfR->tFrame
//if (gH->complex)
//{
//gfR->qFrame
//}
//}
}
else
{
thisHits = gfR->numHits;
thisChrom = gfR->chrom;
thisTStart = gfR->tStart;
thisTEnd = gfR->tEnd;
thisExons = 1;
thisTStrand = gfR->tStrand;
thisQStrand = gfR->qStrand;
}
if (thisHits > bestHits)
{
bestHits = thisHits;
bestExons = thisExons;
bestChrom = thisChrom;
bestTStart = thisTStart;
bestTEnd = thisTEnd;
bestTStrand = thisTStrand;
bestQStrand = thisQStrand;
}
gfLast = gfR;
}
// save best gene with max hits
gH->maxGeneHits = bestHits;
gH->maxGeneChrom = cloneString(bestChrom);
gH->maxGeneTStart = bestTStart;
gH->maxGeneTEnd = bestTEnd;
gH->maxGeneExons = bestExons;
gH->maxGeneTStrand = bestTStrand; // tells if we need to change to get pos strand coords.
if (gH->complex)
{
safef(gH->maxGeneStrand, sizeof gH->maxGeneStrand, "%c%c", bestQStrand, bestTStrand);
}
else
{
safef(gH->maxGeneStrand, sizeof gH->maxGeneStrand, "%c", bestQStrand);
}
}
void unTranslateCoordinates(struct genomeHits *gH)
/* convert translated coordinates back to dna for all but prot query */
{
if (!gH->complex)
return;
int qFactor = 3;
int tFactor = 3;
if (gH->isProt)
qFactor = 1;
struct gfResult *gfR = NULL;
for(gfR=gH->gfList; gfR; gfR=gfR->next)
{
gfR->qStart = gfR->qStart * qFactor + gfR->qFrame;
gfR->qEnd = gfR->qEnd * qFactor + gfR->qFrame;
gfR->tStart = gfR->tStart * tFactor + gfR->tFrame;
gfR->tEnd = gfR->tEnd * tFactor + gfR->tFrame;
}
}
void queryServerFinish(struct genomeHits *gH)
/* Report results from gfServer. */
{
char buf[256];
int matchCount = 0;
dyStringPrintf(gH->dbg,"query strand %s qsize %d \n", gH->queryRC ? "-" : "+", gH->dnaSize);
/* Put together query command. */
safef(buf, sizeof buf, "%s%s %d", gfSignature(), gH->type, gH->dnaSize);
mustWriteFd(gH->sd, buf, strlen(buf));
if (read(gH->sd, buf, 1) < 0)
errAbort("queryServerFinish: read failed: %s", strerror(errno));
if (buf[0] != 'Y')
errAbort("Expecting 'Y' from server, got %c", buf[0]);
mustWriteFd(gH->sd, gH->dna, gH->dnaSize);
if (gH->complex)
{
char *s = netRecieveString(gH->sd, buf);
if (!s)
errAbort("expected response from gfServer with tileSize");
dyStringPrintf(gH->dbg,"%s \n", s); // from server: tileSize 4
}
for (;;)
{
if (netGetString(gH->sd, buf) == NULL)
break;
if (sameString(buf, "end"))
{
dyStringPrintf(gH->dbg,"%d matches \n", matchCount);
break;
}
else if (startsWith("Error:", buf))
{
errAbort("%s", buf);
break;
}
else
{
dyStringPrintf(gH->dbg,"%s \n", buf);
// chop the line into words
int numHits = 0;
char *line = buf;
char *word=NULL;
int i=0;
struct gfResult *gfR = NULL;
AllocVar(gfR);
gfR->qStrand = (gH->queryRC ? '-' : '+');
while ((word = nextWord(&line)) != NULL)
{
if (i == 0)
{ // e.g. 3139
gfR->qStart = sqlUnsigned(word);
}
if (i == 1)
{ // e.g. 3220
gfR->qEnd = sqlUnsigned(word);
}
if (i == 2)
{ // e.g. hg38.2bit:chr1
char *colon = strchr(word, ':');
if (colon)
{
gfR->chrom = cloneString(colon+1);
}
else
{ // e.g. chr10.nib
char *dot = strchr(word, '.');
if (dot)
{
*dot = 0;
gfR->chrom = cloneString(word);
}
else
errAbort("Expecting colon or period in the 3rd field of gfServer response");
}
}
if (i == 3)
{ // e.g. 173515
gfR->tStart = sqlUnsigned(word);
}
if (i == 4)
{ // e.g. 173586
gfR->tEnd = sqlUnsigned(word);
}
if (i == 5)
{ // e.g. 14
numHits = sqlUnsigned(word);
// Frustrated with weird little results with vastly exaggerated hit-score,
// I have flattened that out so it maxes out at #tiles that could fit
// It seems to still work just fine. I am going to leave it for now.
// One minor note, by suppressing extra scores for short exons,
// it will not prefer alignments with the same number of hits, but more exons.
if (!gH->complex) // dna xType
{
// maximum tiles that could fit in qSpan
int limit = ((gfR->qEnd - gfR->qStart) - 6)/5;
++limit; // for a little extra.
if (numHits > limit)
numHits = limit;
}
gfR->numHits = numHits;
}
if (gH->complex)
{
if (i == 6)
{ // e.g. + or -
gfR->tStrand = word[0];
}
if (i == 7)
{ // e.g. 0,1,2
gfR->tFrame = sqlUnsigned(word);
}
if (!gH->isProt)
{
if (i == 8)
{ // e.g. 0,1,2
gfR->qFrame = sqlUnsigned(word);
}
}
}
else
{
gfR->tStrand = '+'; // dna search only on + target strand
}
++i;
}
if (gH->complex)
{
char *s = netGetLongString(gH->sd);
if (s == NULL)
break;
dyStringPrintf(gH->dbg,"%s \n", s); //dumps out qstart1 start1 qstart2 tstart2 ...
freeMem(s);
}
slAddHead(&gH->gfList, gfR);
}
++matchCount;
}
slReverse(&gH->gfList);
unTranslateCoordinates(gH); // convert back to untranslated coordinates
slSort(&gH->gfList, slGfResultsCmp); // sort by tStrand, chrom, tStart
struct qFrameResults
/* Information about hits on a genome assembly */
{
int maxGeneHits; /* Highest gene hit-count */
char *maxGeneChrom; /* Target Chrom for gene with max gene hits */
int maxGeneTStart; /* Target Start Coordinate for gene with max hits */
int maxGeneTEnd; /* Target End Coordinate for gene with max hits*/
int maxGeneExons; /* Number of Exons in gene with max hits */
char maxGeneTStrand;/* + or - TStrand for gene with max hits */
};
int qFrame = 0;
int qFrameStart = 0;
int qFrameEnd = 0;
if (gH->complex && !gH->isProt) // rnax, dnax
{
qFrameEnd = 2;
}
struct qFrameResults r[3];
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
findBestGene(gH, qFrame); // find best gene-line thing with most hits
r[qFrame].maxGeneHits = gH->maxGeneHits;
r[qFrame].maxGeneChrom = cloneString(gH->maxGeneChrom);
r[qFrame].maxGeneTStart = gH->maxGeneTStart;
r[qFrame].maxGeneTEnd = gH->maxGeneTEnd;
r[qFrame].maxGeneExons = gH->maxGeneExons;
r[qFrame].maxGeneTStrand = gH->maxGeneTStrand;
}
// combine the results from all 3 qFrames
if (gH->complex && !gH->isProt) // rnax, dnax
{
int biggestHit = -1;
int bigQFrame = -1;
// find the qFrame with the biggest hits
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
if (r[qFrame].maxGeneHits > biggestHit)
{
bigQFrame = qFrame;
biggestHit = r[qFrame].maxGeneHits;
}
}
// save combined final answers back in gH
gH->maxGeneHits = 0;
gH->maxGeneTStrand = r[bigQFrame].maxGeneTStrand;
gH->maxGeneChrom = cloneString(r[bigQFrame].maxGeneChrom);
gH->maxGeneExons = r[bigQFrame].maxGeneExons;
gH->maxGeneTStart = r[bigQFrame].maxGeneTStart;
gH->maxGeneTEnd = r[bigQFrame].maxGeneTEnd;
for(qFrame = qFrameStart; qFrame <= qFrameEnd; ++qFrame)
{
// ignore frames that do not have same tStrand, chrom, and overlap with the best
if (!(
(r[qFrame].maxGeneTStrand == r[bigQFrame].maxGeneTStrand) &&
sameOk(r[qFrame].maxGeneChrom, r[bigQFrame].maxGeneChrom) &&
// overlap start,end between this and bigQFrame
(
(r[qFrame].maxGeneTEnd > r[bigQFrame].maxGeneTStart) &&
(r[bigQFrame].maxGeneTEnd > r[qFrame].maxGeneTStart)
)
))
{
continue; // incompatible with best result, skip it
}
// Add total hits
gH->maxGeneHits += r[qFrame].maxGeneHits; // should be pretty accurate
// Find biggest exon count
if (gH->maxGeneExons < r[qFrame].maxGeneExons) // often underestimates actual exon count.
{
gH->maxGeneExons = r[qFrame].maxGeneExons;
}
// Adjust tStart
if (gH->maxGeneTStart > r[qFrame].maxGeneTStart)
{
gH->maxGeneTStart = r[qFrame].maxGeneTStart;
}
// Adjust tEnd
if (gH->maxGeneTEnd < r[qFrame].maxGeneTEnd)
{
gH->maxGeneTEnd = r[qFrame].maxGeneTEnd;
}
}
gH->maxGeneHits /= 3; // average over 3 frames.
char qStrand = (gH->queryRC ? '-' : '+');
safef(gH->maxGeneStrand, sizeof gH->maxGeneStrand, "%c%c", qStrand, gH->maxGeneTStrand);
}
close(gH->sd);
}
int gfConnectEx(char *host, char *port)
/* Try to connect to gfServer */
{
int conn = -1;
if (allGenomes)
conn = gfMayConnect(host, port); // returns -1 on failure
else
conn = gfConnect(host, port); // errAborts on failure.
return conn;
}
void blatSeq(char *userSeq, char *organism, char *database, int dbCount)
/* Blat sequence user pasted in. */
{
FILE *f;
struct dnaSeq *seqList = NULL, *seq;
struct tempName pslTn, faTn;
int maxSingleSize, maxTotalSize, maxSeqCount;
int minSingleSize = minMatchShown;
char *genome, *db;
char *type = cgiString("type");
char *seqLetters = cloneString(userSeq);
struct serverTable *serve;
int conn;
int oneSize, totalSize = 0, seqCount = 0;
boolean isTx = FALSE;
boolean isTxTx = FALSE;
boolean txTxBoth = FALSE;
struct gfOutput *gvo;
boolean qIsProt = FALSE;
enum gfType qType, tType;
struct hash *tFileCache = gfFileCacheNew();
// allGenomes ignores I'm Feeling Lucky for simplicity
boolean feelingLucky = cgiBoolean("Lucky") && !allGenomes;
char *xType = NULL;
if (allGenomes)
{
db = database;
genome = organism;
}
else
getDbAndGenome(cart, &db, &genome, oldVars);
if(!feelingLucky && !allGenomes)
cartWebStart(cart, db, "%s (%s) BLAT Results", trackHubSkipHubName(organism), db);
/* Load user sequence and figure out if it is DNA or protein. */
if (sameWord(type, "DNA"))
{
seqList = faSeqListFromMemText(seqLetters, TRUE);
uToT(seqList);
isTx = FALSE;
xType = "dna";
}
else if (sameWord(type, "translated RNA") || sameWord(type, "translated DNA"))
{
seqList = faSeqListFromMemText(seqLetters, TRUE);
uToT(seqList);
isTx = TRUE;
isTxTx = TRUE;
xType = "rnax";
txTxBoth = sameWord(type, "translated DNA");
if (txTxBoth)
xType = "dnax";
}
else if (sameWord(type, "protein"))
{
seqList = faSeqListFromMemText(seqLetters, FALSE);
isTx = TRUE;
qIsProt = TRUE;
xType = "prot";
}
else // BLAT's Guess
{
seqList = faSeqListFromMemTextRaw(seqLetters);
isTx = !seqIsDna(seqList); // only tests first element, assumes the rest are the same type.
if (!isTx)
{
xType = "dna";
for (seq = seqList; seq != NULL; seq = seq->next)
{
seq->size = dnaFilteredSize(seq->dna);
dnaFilter(seq->dna, seq->dna);
toLowerN(seq->dna, seq->size);
subChar(seq->dna, 'u', 't');
}
}
else
{
for (seq = seqList; seq != NULL; seq = seq->next)
{
seq->size = aaFilteredSize(seq->dna);
aaFilter(seq->dna, seq->dna);
toUpperN(seq->dna, seq->size);
}
qIsProt = TRUE;
xType = "prot";
}
}
if (seqList != NULL && seqList->name[0] == 0)
{
freeMem(seqList->name);
seqList->name = cloneString("YourSeq");
}
trimUniq(seqList);
/* If feeling lucky only do the first one. */
if(feelingLucky && seqList != NULL)
{
seqList->next = NULL;
}
/* Figure out size allowed. */
maxSingleSize = (isTx ? 10000 : 75000);
maxTotalSize = maxSingleSize * 2.5;
#ifdef LOWELAB
maxSeqCount = 200;
#else
maxSeqCount = 25;
#endif
char *optionMaxSeqCount = cfgOptionDefault("hgBlat.maxSequenceCount", NULL);
if (isNotEmpty(optionMaxSeqCount))
maxSeqCount = sqlSigned(optionMaxSeqCount);
/* Create temporary file to store sequence. */
trashDirFile(&faTn, "hgSs", "hgSs", ".fa");
faWriteAll(faTn.forCgi, seqList);
/* Create a temporary .psl file with the alignments against genome. */
trashDirFile(&pslTn, "hgSs", "hgSs", ".pslx");
f = mustOpen(pslTn.forCgi, "w");
gvo = gfOutputPsl(0, qIsProt, FALSE, f, FALSE, TRUE);
serve = findServer(db, isTx);
/* Write header for extended (possibly protein) psl file. */
if (isTx)
{
if (isTxTx)
{
qType = gftDnaX;
tType = gftDnaX;
}
else
{
qType = gftProt;
tType = gftDnaX;
}
}
else
{
qType = gftDna;
tType = gftDna;
}
pslxWriteHead(f, qType, tType);
if (qType == gftProt)
{
minSingleSize = 14;
}
else if (qType == gftDnaX)
{
minSingleSize = 36;
}
int seqNumber = 0;
/* Loop through each sequence. */
for (seq = seqList; seq != NULL; seq = seq->next)
{
printf(" "); fflush(stdout); /* prevent apache cgi timeout by outputting something */
oneSize = realSeqSize(seq, !isTx);
// Impose half the usual bot delay per sequence
if (dbCount == 0)
hgBotDelayFrac(0.5);
if (++seqCount > maxSeqCount)
{
warn("More than %d input sequences, stopping at %s (see also: cgi-bin/hg.conf hgBlat.maxSequenceCount setting).",
maxSeqCount, seq->name);
break;
}
if (oneSize > maxSingleSize)
{
warn("Sequence %s is %d letters long (max is %d), skipping",
seq->name, oneSize, maxSingleSize);
continue;
}
if (oneSize < minSingleSize)
{
warn("Warning: Sequence %s is only %d letters long (%d is the recommended minimum)",
seq->name, oneSize, minSingleSize);
// we could use "continue;" here to actually enforce skipping,
// but let's give the short sequence a chance, it might work.
// minimum possible length = tileSize+stepSize, so mpl=16 for dna stepSize=5, mpl=10 for protein.
if (qIsProt && oneSize < 1) // protein does not tolerate oneSize==0
continue;
}
totalSize += oneSize;
if (totalSize > maxTotalSize)
{
warn("Sequence %s would take us over the %d letter limit, stopping here.",
seq->name, maxTotalSize);
break;
}
conn = gfConnectEx(serve->host, serve->port);
if (isTx)
{
gvo->reportTargetStrand = TRUE;
if (isTxTx)
{
if (allGenomes)
queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, FALSE, seqNumber);
else
gfAlignTransTrans(&conn, serve->nibDir, seq, FALSE, 5,
tFileCache, gvo, !txTxBoth);
if (txTxBoth)
{
reverseComplement(seq->dna, seq->size);
conn = gfConnectEx(serve->host, serve->port);
if (allGenomes)
queryServer(conn, db, seq, "transQuery", xType, TRUE, FALSE, TRUE, seqNumber);
else
gfAlignTransTrans(&conn, serve->nibDir, seq, TRUE, 5,
tFileCache, gvo, FALSE);
}
}
else
{
if (allGenomes)
queryServer(conn, db, seq, "protQuery", xType, TRUE, TRUE, FALSE, seqNumber);
else
gfAlignTrans(&conn, serve->nibDir, seq, 5, tFileCache, gvo);
}
}
else
{
if (allGenomes)
queryServer(conn, db, seq, "query", xType, FALSE, FALSE, FALSE, seqNumber);
else
gfAlignStrand(&conn, serve->nibDir, seq, FALSE, minMatchShown, tFileCache, gvo);
reverseComplement(seq->dna, seq->size);
conn = gfConnectEx(serve->host, serve->port);
if (allGenomes)
queryServer(conn, db, seq, "query", xType, FALSE, FALSE, TRUE, seqNumber);
else
gfAlignStrand(&conn, serve->nibDir, seq, TRUE, minMatchShown, tFileCache, gvo);
}
gfOutputQuery(gvo, f);
++seqNumber;
}
carefulClose(&f);
if (!allGenomes)
{
showAliPlaces(pslTn.forCgi, faTn.forCgi, NULL, serve->db, qType, tType,
organism, feelingLucky);
}
if(!feelingLucky && !allGenomes)
cartWebEnd();
gfFileCacheFree(&tFileCache);
}
void askForSeq(char *organism, char *db)
/* Put up a little form that asks for sequence.
* Call self.... */
{
/* ignore struct serverTable* return, but can error out if not found */
findServer(db, FALSE);
/* JavaScript to update form when org changes */
char *onChangeText = ""
"document.mainForm.changeInfo.value='orgChange';"
"document.mainForm.submit();";
char *userSeq = NULL;
char *type = NULL;
printf(
"\n");
webNewSection("About BLAT");
printf(
"
BLAT on DNA is designed to\n"
"quickly find sequences of 95%% and greater similarity of length 25 bases or\n"
"more. It may miss more divergent or shorter sequence alignments. It will find\n"
"perfect sequence matches of 20 bases.\n"
"BLAT on proteins finds sequences of 80%% and greater similarity of length 20 amino\n"
"acids or more. In practice DNA BLAT works well on primates, and protein\n"
-"blat on land vertebrates."
+"BLAT on land vertebrates."
);
printf("%s",
"\n
BLAT is not BLAST. DNA BLAT works by keeping an index of the entire genome\n"
"in memory. The index consists of all overlapping 11-mers stepping by 5 except for\n"
"those heavily involved in repeats. The index takes up about\n"
"2 gigabytes of RAM. RAM can be further reduced to less than 1 GB by increasing step size to 11.\n"
"The genome itself is not kept in memory, allowing\n"
"BLAT to deliver high performance on a reasonably priced Linux box.\n"
"The index is used to find areas of probable homology, which are then\n"
"loaded into memory for a detailed alignment. Protein BLAT works in a similar\n"
"manner, except with 4-mers rather than 11-mers. The protein index takes a little\n"
"more than 2 gigabytes.
\n"
"
BLAT was written by Jim Kent.\n"
"Like most of Jim's software, interactive use on this web server is free to all.\n"
"Sources and executables to run batch jobs on your own server are available free\n"
"for academic, personal, and non-profit purposes. Non-exclusive commercial\n"
"licenses are also available. See the \n"
"Kent Informatics\n"
"website for details.
\n"
"\n"
"
For more information on the graphical version of BLAT, click the Help \n"
"button on the top menu bar");
if (hIsGsidServer())
printf(".
\n");
else
printf(" or see the Genome Browser FAQ.
\n");
}
void fakeAskForSeqForm(char *organism, char *db)
/* Put up a hidden form that asks for sequence.
* Call self.... */
{
char *userSeq = NULL;
char *type = NULL;
char *sort = NULL;
char *output = NULL;
printf("\n");
}
void hideWeakerOfQueryRcPairs(struct genomeHits* gH1)
/* hide the weaker of the pair of rc'd query results
* so users sees only one strand with the best gene hit.
* Input must be sorted already into the pairs. */
{
struct genomeHits* gH2 = NULL;
for (;gH1; gH1 = gH2->next)
{
gH2 = gH1->next;
if (!gH2)
errAbort("Hiding weaker of pairs found one without sibling.");
if (!((gH1->seqNumber == gH2->seqNumber) && sameString(gH1->db, gH2->db) && (gH1->queryRC != gH2->queryRC)))
errAbort("Error matching pairs, sibling does not match seqNumber and db.");
// check if one or the other had an error
if (gH1->error && gH2->error)
gH2->hide = TRUE; // arbitrarily
else if (gH1->error && !gH2->error)
gH1->hide = TRUE;
else if (!gH1->error && gH2->error)
gH2->hide = TRUE;
else // keep the best scoring or the pair, hide the other
{
if (gH2->maxGeneHits > gH1->maxGeneHits)
gH1->hide = TRUE;
else
gH2->hide = TRUE;
}
}
}
void changeMaxGenePositionToPositiveStrandCoords(struct genomeHits *gH)
/* convert negative strand coordinates to positive strand coordinates if TStrand=='-' */
{
for (;gH; gH = gH->next)
{
if (gH->hide)
continue;
if (gH->error)
continue;
if (gH->maxGeneTStrand == '-') // convert to pos strand coordinates
{
+ int chromSize = 0;
+ char temp[1024];
+ safef(temp, sizeof temp, "%s", "");
+ if (trackHubDatabase(gH->db)) // if not hub db, make sure it is the default assembly.
+ {
+
+ struct chromInfo *ci = trackHubMaybeChromInfo(gH->db, gH->maxGeneChrom);
+ if (ci)
+ chromSize = ci->size;
+ else
+ {
+ warn("chromosome %s missing from %s .2bit (%s).", gH->maxGeneChrom, gH->db, gH->genome);
+ safef(temp, sizeof temp, "chromosome %s missing from %s .2bit.", gH->maxGeneChrom, gH->db);
+ }
+ }
+ else
+ {
struct sqlConnection *conn = hAllocConn(gH->db);
char query[256];
sqlSafef(query, sizeof query, "select size from chromInfo where chrom='%s'", gH->maxGeneChrom);
- int chromSize = sqlQuickNum(conn, query);
+ chromSize = sqlQuickNum(conn, query);
hFreeConn(&conn);
if (chromSize == 0)
{
warn("chromosome %s missing from %s.chromInfo (%s)", gH->maxGeneChrom, gH->db, gH->genome);
- char temp[256];
safef(temp, sizeof temp, "chromosome %s missing from %s.chromInfo", gH->maxGeneChrom, gH->db);
+ }
+ }
+ if (chromSize == 0)
+ {
gH->error = TRUE;
gH->networkErrMsg = cloneString(temp);
}
else
{
gH->maxGeneChromSize = chromSize;
int tempTStart = gH->maxGeneTStart;
gH->maxGeneTStart = chromSize - gH->maxGeneTEnd;
gH->maxGeneTEnd = chromSize - tempTStart;
}
}
}
}
void printDebugging()
/* print detailed gfServer response info */
{
struct genomeHits *gH;
printf("
\n");
for (gH = pfdDone; gH; gH = gH->next)
{
if (gH->hide) // hide weaker of pairs for dna and dnax with reverse-complimented queries.
continue;
if (gH->error)
printf("%s %s %s %s %d %s %s\n",
gH->faName, gH->genome, gH->db, gH->networkErrMsg, gH->queryRC, gH->type, gH->xType);
else
{
int tStart = gH->maxGeneTStart;
int tEnd = gH->maxGeneTEnd;
// convert back to neg strand coordinates for debug display
if (gH->complex && (gH->maxGeneTStrand == '-'))
{
int tempTStart = tStart;
tStart = gH->maxGeneChromSize - tEnd;
tEnd = gH->maxGeneChromSize - tempTStart;
}
printf("%s %s %s %s %d %d %s %s %d %d\n",
gH->faName, gH->genome,
gH->db,
gH->maxGeneChrom, gH->maxGeneHits, gH->queryRC, gH->type, gH->xType, tStart, tEnd);
printf("%s", gH->dbg->string);
struct gfResult *gfR = NULL;
for(gfR=gH->gfList; gfR; gfR=gfR->next)
{
printf("(%3d)\t%4d\t%4d\t%s\t(%3d)\t%9d\t%9d\t%3d\t",
(gfR->qEnd-gfR->qStart),
gfR->qStart, gfR->qEnd, gfR->chrom,
(gfR->tEnd-gfR->tStart),
gfR->tStart, gfR->tEnd, gfR->numHits);
if (gH->complex)
{
printf("%c\t%d\t", gfR->tStrand, gfR->tFrame);
if (!gH->isProt)
{
printf("%d\t", gfR->qFrame);
}
}
printf("\n");
}
}
printf("\n");
}
printf("
\n");
}
void doMiddle(struct cart *theCart)
/* Write header and body of html page. */
{
char *userSeq;
char *db, *organism;
boolean clearUserSeq = cgiBoolean("Clear");
allGenomes = cgiVarExists("allGenomes");
cart = theCart;
dnaUtilOpen();
orgChange = sameOk(cgiOptionalString("changeInfo"),"orgChange");
if (orgChange)
{
cgiVarSet("db", hDefaultDbForGenome(cgiOptionalString("org")));
}
getDbAndGenome(cart, &db, &organism, oldVars);
char *oldDb = cloneString(db);
findClosestServer(&db, &organism);
/* Get sequence - from userSeq variable, or if
* that is empty from a file. */
if (clearUserSeq)
{
cartSetString(cart, "userSeq", "");
cartSetString(cart, "seqFile", "");
}
userSeq = cartUsualString(cart, "userSeq", "");
if (isEmpty(userSeq))
{
userSeq = cartOptionalString(cart, "seqFile");
}
if (isEmpty(userSeq) || orgChange)
{
cartWebStart(theCart, db, "%s BLAT Search", trackHubSkipHubName(organism));
if (differentString(oldDb, db))
printf("
Note: BLAT search is not available for %s %s; "
"defaulting to %s %s
\n",
hGenome(oldDb), hFreezeDate(oldDb), organism, hFreezeDate(db));
askForSeq(organism,db);
cartWebEnd();
}
else
{
if (allGenomes)
{
cartWebStart(cart, db, "ALL Genomes BLAT Results");
struct dbDb *dbList = hGetBlatIndexedDatabases();
struct dbDb *this = NULL;
char *saveDb = db;
char *saveOrg = organism;
struct sqlConnection *conn = hConnectCentral();
int dbCount = 0;
for(this = dbList; this; this = this->next)
{
db = this->name;
organism = hGenome(db);
+ if (!trackHubDatabase(db)) // if not hub db, make sure it is the default assembly.
+ {
char query[256];
sqlSafef(query, sizeof query, "select name from defaultDb where genome='%s'", organism);
char *defaultDb = sqlQuickString(conn, query);
if (!sameOk(defaultDb, db))
continue; // skip non-default dbs
+ }
blatSeq(skipLeadingSpaces(userSeq), organism, db, dbCount);
++dbCount;
}
dbDbFreeList(&dbList);
db = saveDb;
organism = saveOrg;
hDisconnectCentral(&conn);
// Loop over each org's default assembly
/* pre-load remote tracks in parallel */
int ptMax = atoi(cfgOptionDefault("parallelFetch.threads", "20")); // default number of threads for parallel fetch.
int pfdListCount = 0;
pthread_t *threads = NULL;
pfdListCount = slCount(pfdList);
/* launch parallel threads */
ptMax = min(ptMax, pfdListCount);
if (ptMax > 0)
{
AllocArray(threads, ptMax);
/* Create threads */
int pt;
for (pt = 0; pt < ptMax; ++pt)
{
int rc = pthread_create(&threads[pt], NULL, remoteParallelLoad, &threads[pt]);
if (rc)
{
errAbort("Unexpected error %d from pthread_create(): %s",rc,strerror(rc));
}
}
}
if (ptMax > 0)
{
/* wait for remote parallel load to finish */
remoteParallelLoadWait(atoi(cfgOptionDefault("parallelFetch.timeout", "90"))); // wait up to default 90 seconds.
}
// Should continue with pfdDone since threads could still be running that might access pdfList ?
// Hide weaker of RC'd query pairs, if not debugging.
// Combine pairs with a query and its RC.
if (!(debuggingGfResults) &&
(sameString(pfdDone->xType,"dna") || sameString(pfdDone->xType,"dnax")))
{
slSort(&pfdDone, slRcPairsCmp);
hideWeakerOfQueryRcPairs(pfdDone);
}
// requires db for chromSize, do database after multi-threading done.
changeMaxGenePositionToPositiveStrandCoords(pfdDone);
// sort by maximum hits
slSort(&pfdDone, slHitsCmp);
// Print instructions
printf("Click the link in the Assembly column to see the full BLAT output for that Genome.
\n");
// Print report // TODO move to final report at the end of ALL Assemblies
int lastSeqNumber = -1;
int idCount = 0;
char id[256];
struct genomeHits *gH = NULL;
for (gH = pfdDone; gH; gH = gH->next)
{
if (lastSeqNumber != gH->seqNumber)
{
if (lastSeqNumber != -1) // end previous table
{
printf("