2197f6d5208aff4c48ccbe42e61a116d988ac392 max Tue May 19 08:23:54 2026 -0700 hubApi: add /blat endpoint with apiKey gating, format=hgblat, and known-agent bypass New src/hg/hubApi/blat.c implements /blat/<type> (dna, protein, transRna, transDna, guess) backed by the same gfServer logic as hgBlat. Key details: - Requires an apiKey for rate-limiting; botException() and botExceptionUserAgent() exempt IPs/user-agents in hg.conf (same policy as captcha bypass elsewhere in the browser stack). - Invalid apiKey returns a clean JSON 403 rather than an HTML 500 (pre-validated in hubApi.c main() before hgBotDelayTimeFrac runs). - Extra bot-delay fraction (default 0.3, 10x hubApi default) is configurable via hubApi.blatDelayFraction in hg.conf. - format=text/psl -> PSL text; format=hgblat -> byte-for-byte hgBlat?output=json shape; jsonOutputArrays=1 -> hubApi envelope with arrays (parallel to getData behaviour); default -> objects. - botExceptionUserAgent() carved out of cart.c's static isUserAgentException() into botDelay.c so non-cart callers can use it. - Cross-reference comments added in hgBlat.c and blat.c noting the shared logic so fixes get applied to both. refs #36315 Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> diff --git src/hg/hubApi/blat.c src/hg/hubApi/blat.c new file mode 100644 index 00000000000..81ee7f2b58c --- /dev/null +++ src/hg/hubApi/blat.c @@ -0,0 +1,427 @@ +/* blat - /blat endpoint: run a BLAT against an assembly's gfServer and + * return PSL hits as JSON (or PSL text). This is the API-callable twin + * of hgBlat?output=json; hgBlat itself remains unchanged for backwards + * compatibility. + * + * NOTE: Much of the alignment logic here (server lookup, sequence filtering, + * gfAlign* calls, temp-file round-trip) is derived from hgBlat.c. If you + * fix a bug or change behaviour there, check whether this file needs the + * same fix. See also the reciprocal note in hgBlat.c. */ + +#include "dataApi.h" +#include "blatServers.h" +#include "fa.h" +#include "dnautil.h" +#include "dnaseq.h" +#include "psl.h" +#include "trashDir.h" +#include "genoFind.h" +#include "trackHub.h" +#include "hubConnect.h" +#include "hdb.h" +#include "fuzzyFind.h" +#include "botDelay.h" + +/* Default BLAT bottleneck fraction -- 10x the hubApi-wide default of 0.03. + * Overridable in hg.conf with hubApi.blatDelayFraction=<value>. */ +#define blatDelayFractionDefault 0.3 + +/* Server lookup is provided by findBlatServer() in hg/lib/blatServers.c, + * which is also used by hgBlat.c. */ + +/* Validated query-type token derived from the user's "type" arg. */ +struct blatType + { + boolean isTx; /* translated query */ + boolean isTxTx; /* both query and target translated (dna vs dnax) */ + boolean txTxBoth; /* dnax query -- search both strands */ + boolean qIsProt; + enum gfType qType; + enum gfType tType; + }; + +static void parseTypeArg(char *type, struct dnaSeq *firstSeq, struct blatType *bt) +/* Translate the URL path subcommand into gfType flags. Caller passes the + * lowercase token taken from /blat/<type>. "guess" means infer from the + * first sequence (matches hgBlat's "BLAT's guess"). */ +{ +ZeroVar(bt); +if (isEmpty(type)) + apiErrAbort(err400, err400Msg, + "/blat requires a query-type subcommand: " + "/blat/dna, /blat/protein, /blat/transRna, /blat/transDna, or /blat/guess"); +else if (sameWord(type, "dna")) + ; /* defaults */ +else if (sameWord(type, "protein")) + { + bt->isTx = TRUE; + bt->qIsProt = TRUE; + } +else if (sameWord(type, "transRna")) + { + bt->isTx = TRUE; + bt->isTxTx = TRUE; + } +else if (sameWord(type, "transDna")) + { + bt->isTx = TRUE; + bt->isTxTx = TRUE; + bt->txTxBoth = TRUE; + } +else if (sameWord(type, "guess")) + { + if (firstSeq != NULL) + { + bt->isTx = !seqIsDna(firstSeq); + bt->qIsProt = bt->isTx; + } + } +else + apiErrAbort(err400, err400Msg, + "do not recognize endpoint function: '/blat/%s' " + "(use dna, protein, transRna, transDna, or guess)", type); +if (bt->isTx) + { + if (bt->isTxTx) + { + bt->qType = gftDnaX; + bt->tType = gftDnaX; + } + else + { + bt->qType = gftProt; + bt->tType = gftDnaX; + } + } +else + { + bt->qType = gftDna; + bt->tType = gftDna; + } +} + +static void filterSequences(struct dnaSeq *seqList, struct blatType *bt) +/* Apply the same per-seq filtering hgBlat does before submitting to the server. */ +{ +struct dnaSeq *seq; +if (bt->isTx && !bt->isTxTx) + { + for (seq = seqList; seq != NULL; seq = seq->next) + { + seq->size = aaFilteredSize(seq->dna); + aaFilter(seq->dna, seq->dna); + toUpperN(seq->dna, seq->size); + } + } +else + { + for (seq = seqList; seq != NULL; seq = seq->next) + { + seq->size = dnaFilteredSize(seq->dna); + dnaFilter(seq->dna, seq->dna); + toLowerN(seq->dna, seq->size); + subChar(seq->dna, 'u', 't'); + } + } +if (seqList != NULL && seqList->name[0] == 0) + { + freeMem(seqList->name); + seqList->name = cloneString("YourSeq"); + } +} + +static void writePslOutput(struct psl *pslList, struct blatType *bt) +/* PSL text output path (output=psl). */ +{ +hPrintDisable(); +puts("Content-Type: text/plain\n"); +pslxWriteHead(stdout, bt->qType, bt->tType); +struct psl *psl; +int n = 0; +for (psl = pslList; psl != NULL && n < maxItemsOutput; psl = psl->next, ++n) + pslTabOut(psl, stdout); +} + +static void writeLegacyJsonOutput(struct psl *pslList, char *db) +/* Byte-for-byte the same JSON shape hgBlat?output=json emits: a top-level + * object with "track":"blat", "genome", a "fields" header array, and "blat" + * as an array of arrays (one row per PSL). + * Triggered by format=hgblat or jsonOutputArrays=1. */ +{ +hPrintDisable(); +puts("Content-Type: text/plain\n"); +pslWriteAllJson(pslList, stdout, db, TRUE); +} + +static void writePslAsObject(struct jsonWrite *jw, struct psl *psl) +/* Write one PSL hit as a JSON object with named keys. */ +{ +int b; +jsonWriteObjectStart(jw, NULL); +jsonWriteNumber(jw, "matches", psl->match); +jsonWriteNumber(jw, "misMatches", psl->misMatch); +jsonWriteNumber(jw, "repMatches", psl->repMatch); +jsonWriteNumber(jw, "nCount", psl->nCount); +jsonWriteNumber(jw, "qNumInsert", psl->qNumInsert); +jsonWriteNumber(jw, "qBaseInsert", psl->qBaseInsert); +jsonWriteNumber(jw, "tNumInsert", psl->tNumInsert); +jsonWriteNumber(jw, "tBaseInsert", psl->tBaseInsert); +jsonWriteStringf(jw, "strand", "%s", psl->strand); +jsonWriteString(jw, "qName", psl->qName); +jsonWriteNumber(jw, "qSize", psl->qSize); +jsonWriteNumber(jw, "qStart", psl->qStart); +jsonWriteNumber(jw, "qEnd", psl->qEnd); +jsonWriteString(jw, "tName", psl->tName); +jsonWriteNumber(jw, "tSize", psl->tSize); +jsonWriteNumber(jw, "tStart", psl->tStart); +jsonWriteNumber(jw, "tEnd", psl->tEnd); +jsonWriteNumber(jw, "blockCount", psl->blockCount); +jsonWriteListStart(jw, "blockSizes"); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->blockSizes[b]); +jsonWriteListEnd(jw); +jsonWriteListStart(jw, "qStarts"); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->qStarts[b]); +jsonWriteListEnd(jw); +jsonWriteListStart(jw, "tStarts"); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->tStarts[b]); +jsonWriteListEnd(jw); +jsonWriteObjectEnd(jw); +} + +static void writePslAsArray(struct jsonWrite *jw, struct psl *psl) +/* Write one PSL hit as a JSON array (jsonOutputArrays mode). + * Block arrays are integer arrays, matching hubApi conventions. */ +{ +int b; +jsonWriteListStart(jw, NULL); +jsonWriteNumber(jw, NULL, psl->match); +jsonWriteNumber(jw, NULL, psl->misMatch); +jsonWriteNumber(jw, NULL, psl->repMatch); +jsonWriteNumber(jw, NULL, psl->nCount); +jsonWriteNumber(jw, NULL, psl->qNumInsert); +jsonWriteNumber(jw, NULL, psl->qBaseInsert); +jsonWriteNumber(jw, NULL, psl->tNumInsert); +jsonWriteNumber(jw, NULL, psl->tBaseInsert); +jsonWriteString(jw, NULL, psl->strand); +jsonWriteString(jw, NULL, psl->qName); +jsonWriteNumber(jw, NULL, psl->qSize); +jsonWriteNumber(jw, NULL, psl->qStart); +jsonWriteNumber(jw, NULL, psl->qEnd); +jsonWriteString(jw, NULL, psl->tName); +jsonWriteNumber(jw, NULL, psl->tSize); +jsonWriteNumber(jw, NULL, psl->tStart); +jsonWriteNumber(jw, NULL, psl->tEnd); +jsonWriteNumber(jw, NULL, psl->blockCount); +jsonWriteListStart(jw, NULL); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->blockSizes[b]); +jsonWriteListEnd(jw); +jsonWriteListStart(jw, NULL); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->qStarts[b]); +jsonWriteListEnd(jw); +jsonWriteListStart(jw, NULL); +for (b = 0; b < psl->blockCount; ++b) + jsonWriteNumber(jw, NULL, psl->tStarts[b]); +jsonWriteListEnd(jw); +jsonWriteListEnd(jw); +} + +static char *pslFieldNames[] = { + "matches", "misMatches", "repMatches", "nCount", + "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert", + "strand", "qName", "qSize", "qStart", "qEnd", + "tName", "tSize", "tStart", "tEnd", + "blockCount", "blockSizes", "qStarts", "tStarts", + NULL +}; + +static void writeJsonOutput(struct psl *pslList, char *db, char *hubUrl, + char *type, struct blatType *bt) +/* JSON output path -- standard hubApi envelope plus a blat[] array. + * When jsonOutputArrays is set, each hit is an array and a "fields" key + * lists the column names (matching getData's jsonOutputArrays behaviour). + * Otherwise each hit is a named-key object. */ +{ +struct jsonWrite *jw = apiStartOutput(); +jsonWriteString(jw, "genome", db); +if (isNotEmpty(hubUrl)) + jsonWriteString(jw, "hubUrl", hubUrl); +if (isNotEmpty(type)) + jsonWriteString(jw, "type", type); +jsonWriteString(jw, "qType", + bt->qType == gftProt ? "protein" : (bt->qType == gftDnaX ? "dnax" : "dna")); +jsonWriteString(jw, "tType", + bt->tType == gftDnaX ? "dnax" : "dna"); + +if (jsonOutputArrays) + { + jsonWriteListStart(jw, "fields"); + char **fp; + for (fp = pslFieldNames; *fp != NULL; ++fp) + jsonWriteString(jw, NULL, *fp); + jsonWriteListEnd(jw); + } + +jsonWriteListStart(jw, "blat"); +struct psl *psl; +long long count = 0; +for (psl = pslList; psl != NULL && count < maxItemsOutput; psl = psl->next, ++count) + { + if (jsonOutputArrays) + writePslAsArray(jw, psl); + else + writePslAsObject(jw, psl); + } +jsonWriteListEnd(jw); +itemsReturned = count; +jsonWriteNumber(jw, "itemsReturned", (long long)count); +apiFinishOutput(0, NULL, jw); +} + +void apiBlat(char *words[MAX_PATH_INFO]) +/* '/blat' endpoint: run a BLAT alignment of userSeq against the requested + * assembly's gfServer and return PSL hits as JSON. */ +{ +char *extraArgs = verifyLegalArgs(argBlat); +if (extraArgs) + apiErrAbort(err400, err400Msg, + "extraneous arguments found for function /blat '%s'", extraArgs); + +/* /blat is gated on an apiKey -- both for attribution and to anchor the + * bot-bottleneck on something more stable than IP. Known programmatic + * clients (IGV, rtracklayer, etc.) identified via bottleneck.except IPs or + * noCaptchaAgent. user-agent patterns in hg.conf are exempt. + * For everyone else, an apiKey must be present; validity was already + * checked in main(). */ +char *apiKey = cgiOptionalString(argApiKey); +if (isEmpty(apiKey) && !botException() && !botExceptionUserAgent()) + apiErrAbort(err403, err403Msg, + "/blat requires an '%s' URL parameter. " + "Generate one under My Data > My Track Hubs > Hub Development: API Key, " + "then add it to this API call as apiKey=xxxxx. " + "Contact us if you need assistance.", argApiKey); + +/* Apply a per-BLAT bottleneck penalty on top of the global hubApi delay + * already paid in main(). Bottleneck key is the apiKey (see botDelay.c), + * so heavy users throttle themselves instead of starving everyone. */ +char *blatDelayStr = cfgOptionDefault("hubApi.blatDelayFraction", NULL); +double blatDelayFraction = blatDelayStr ? atof(blatDelayStr) : blatDelayFractionDefault; +int extraDelay = hgBotDelayTimeFrac(blatDelayFraction); +if (extraDelay > 0) + sleep1000(extraDelay); + +/* Query-type subcommand comes from the URL path, like /getData/track. + * Apache rewrites /blat/<type>?... onto PATH_INFO=/blat/<type>, so + * words[1] holds the user-supplied subcommand. */ +char *type = words[1]; + +char *genome = cgiOptionalString(argGenome); +char *userSeq = cgiOptionalString(argUserSeq); +char *format = cgiOptionalString(argFormat); +char *hubUrl = cgiOptionalString(argHubUrl); + +if (isEmpty(genome)) + apiErrAbort(err400, err400Msg, + "/blat requires '%s=<assembly>'", argGenome); +if (isEmpty(userSeq)) + apiErrAbort(err400, err400Msg, + "/blat requires '%s=<sequence>' (FASTA or raw)", argUserSeq); + +/* Cap output volume. maxItemsOutput is shared with the rest of hubApi + * and is already initialized from the URL by hubApi.c. */ + +/* If a hubUrl is given, attach the hub so trackHubDatabase() recognizes it. */ +if (isNotEmpty(hubUrl)) + { + (void) errCatchTrackHubOpen(hubUrl); + } + +/* Parse the user's sequence so we can autodetect type when needed. */ +struct dnaSeq *seqList = faSeqListFromMemTextRaw(cloneString(userSeq)); +if (seqList == NULL) + apiErrAbort(err400, err400Msg, "no parseable sequence in '%s'", argUserSeq); + +struct blatType bt; +parseTypeArg(type, seqList, &bt); +filterSequences(seqList, &bt); + +struct blatServerParams *st = findBlatServer(genome, bt.isTx); +if (st == NULL) + apiErrAbort(err400, err400Msg, + "no %s BLAT server configured for genome='%s'", + bt.isTx ? "translated" : "DNA", genome); + +/* Run alignments into a temp pslx, then read it back to drive output. + * Mirrors hgBlat's strategy so we benefit from the same gfOutputPsl path. */ +struct tempName pslTn; +trashDirFile(&pslTn, "apiBlat", "apiBlat", ".pslx"); +FILE *f = mustOpen(pslTn.forCgi, "w"); +struct gfOutput *gvo = gfOutputPsl(0, bt.qIsProt, FALSE, f, FALSE, TRUE); +pslxWriteHead(f, bt.qType, bt.tType); + +struct gfConnection *conn = gfConnect(st->host, st->port, + trackHubDatabaseToGenome(st->db), st->genomeDataDir); +struct hash *tFileCache = gfFileCacheNew(); +int minMatch = 0; /* let gfServer decide; matches hgBlat allResults path */ + +struct dnaSeq *seq; +int singleMax = bt.isTx ? 10000 : 75000; +int totalMax = singleMax * 2.5; +int total = 0; +int seqCount = 0; +int maxSeqCount = 25; +for (seq = seqList; seq != NULL; seq = seq->next) + { + if (++seqCount > maxSeqCount) + break; + if (seq->size <= 0 || seq->size > singleMax) + continue; + total += seq->size; + if (total > totalMax) + break; + if (bt.isTx) + { + if (bt.isTxTx) + { + gfAlignTransTrans(conn, st->nibDir, seq, FALSE, 5, tFileCache, gvo, + !bt.txTxBoth); + if (bt.txTxBoth) + { + reverseComplement(seq->dna, seq->size); + gfAlignTransTrans(conn, st->nibDir, seq, TRUE, 5, tFileCache, gvo, + FALSE); + } + } + else + gfAlignTrans(conn, st->nibDir, seq, 5, tFileCache, gvo); + } + else + { + gfAlignStrand(conn, st->nibDir, seq, FALSE, minMatch, tFileCache, gvo); + reverseComplement(seq->dna, seq->size); + gfAlignStrand(conn, st->nibDir, seq, TRUE, minMatch, tFileCache, gvo); + } + gfOutputQuery(gvo, f); + } +carefulClose(&f); +gfFileCacheFree(&tFileCache); +gfDisconnect(&conn); + +struct lineFile *lf = pslFileOpen(pslTn.forCgi); +struct psl *pslList = NULL, *psl; +while ((psl = pslNext(lf)) != NULL) + slAddHead(&pslList, psl); +lineFileClose(&lf); +slReverse(&pslList); + +if (sameWordOk(format, "text") || sameWordOk(format, "psl")) + writePslOutput(pslList, &bt); +else if (sameWordOk(format, "hgblat")) + writeLegacyJsonOutput(pslList, st->db); +else + writeJsonOutput(pslList, st->db, hubUrl, type, &bt); +}