6af4f6e10518d33db653c55dadab86b3ed4617ec
max
  Wed Jun 3 05:39:35 2026 -0700
hubApi/blat: fix duplicate itemsReturned, missing target strand, weak default penalty

Three QA issues from Gerardo on #36315:

- writeJsonOutput() emitted "itemsReturned" twice -- apiFinishOutput()
already prints it from the global -- so drop the explicit jsonWriteNumber().

- Protein and translated queries came back with single-char strand because
gfOutput only writes the target-strand character when reportTargetStrand
is set; hgBlat sets it for translated queries, the API path didn't. Set
it when bt.isTx so pslLoad sees the full two-char strand.

- blatDelayFractionDefault was 0.3 -- QA's 40-request burst only added
~2.4s. Bumped to 3.0 for ~24s under the same burst; still tunable in
hg.conf via hubApi.blatDelayFraction=.

refs #36315

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

diff --git src/hg/hubApi/blat.c src/hg/hubApi/blat.c
index 907623b2446..4dfe07a55a9 100644
--- src/hg/hubApi/blat.c
+++ src/hg/hubApi/blat.c
@@ -1,454 +1,462 @@
 /* blat - /blat endpoint: run a BLAT against an assembly's gfServer and
  * return PSL hits as JSON (or PSL text).  This is the API-callable twin
  * of hgBlat?output=json; hgBlat's CGI interface and behavior remain unchanged.
  *
  * NOTE: Much of the alignment logic here (server lookup, sequence filtering,
  * gfAlign* calls, temp-file round-trip) is derived from hgBlat.c.  If you
  * fix a bug or change behaviour there, check whether this file needs the
  * same fix.  See also the reciprocal note in hgBlat.c. */
 
 #include "dataApi.h"
 #include "blatServers.h"
 #include "fa.h"
 #include "dnautil.h"
 #include "dnaseq.h"
 #include "psl.h"
 #include "trashDir.h"
 #include "genoFind.h"
 #include "trackHub.h"
 #include "hubConnect.h"
 #include "hdb.h"
 #include "fuzzyFind.h"
 #include "botDelay.h"
 
-/* Default BLAT bottleneck fraction -- 10x the hubApi-wide default of 0.03.
+/* Default BLAT bottleneck fraction -- 100x the hubApi-wide default of 0.03.
+ * QA testing showed 0.3 (~2.4s extra over a 40-request burst) was too gentle
+ * to deter a single-key hammer; 3.0 lands a meaningful ~24s penalty for the
+ * same burst without breaking legitimate IGV-style polling.
  * Overridable in hg.conf with hubApi.blatDelayFraction=<value>. */
-#define blatDelayFractionDefault 0.3
+#define blatDelayFractionDefault 3.0
 
 /* Server lookup is provided by findBlatServer() in hg/lib/blatServers.c,
  * which is also used by hgBlat.c. */
 
 /* Validated query-type flags derived from the user's "type" arg.
  * parseTypeArg() zeroes the struct first, so an untranslated DNA search is
  * the all-FALSE state. */
 struct blatType
     {
     boolean isTx;       /* translated query */
     boolean isTxTx;     /* both query and target translated (dna vs dnax) */
     boolean txTxBoth;   /* dnax query -- search both strands */
     boolean qIsProt;    /* query is protein (type "protein", or "guess" on non-DNA) */
     enum gfType qType;  /* query type: gftDna, gftProt or gftDnaX */
     enum gfType tType;  /* target type: gftDna or gftDnaX */
     };
 
 static void parseTypeArg(char *type, struct dnaSeq *firstSeq, struct blatType *bt)
 /* Translate the URL path subcommand into gfType flags.  Caller passes the
  * lowercase token taken from /blat/<type>.  "guess" means infer from the
  * first sequence (matches hgBlat's "BLAT's guess"). */
 {
 ZeroVar(bt);
 if (isEmpty(type))
     apiErrAbort(err400, err400Msg,
         "/blat requires a query-type subcommand: "
         "/blat/dna, /blat/protein, /blat/transRna, /blat/transDna, or /blat/guess");
 else if (sameWord(type, "dna"))
     ;  /* defaults */
 else if (sameWord(type, "protein"))
     {
     bt->isTx = TRUE;
     bt->qIsProt = TRUE;
     }
 else if (sameWord(type, "transRna"))
     {
     bt->isTx = TRUE;
     bt->isTxTx = TRUE;
     }
 else if (sameWord(type, "transDna"))
     {
     bt->isTx = TRUE;
     bt->isTxTx = TRUE;
     bt->txTxBoth = TRUE;
     }
 else if (sameWord(type, "guess"))
     {
     if (firstSeq != NULL)
         {
         bt->isTx = !seqIsDna(firstSeq);
         bt->qIsProt = bt->isTx;
         }
     }
 else
     apiErrAbort(err400, err400Msg,
         "do not recognize endpoint function: '/blat/%s' "
         "(use dna, protein, transRna, transDna, or guess)", type);
 if (bt->isTx)
     {
     if (bt->isTxTx)
         {
         bt->qType = gftDnaX;
         bt->tType = gftDnaX;
         }
     else
         {
         bt->qType = gftProt;
         bt->tType = gftDnaX;
         }
     }
 else
     {
     bt->qType = gftDna;
     bt->tType = gftDna;
     }
 }
 
 static void filterSequences(struct dnaSeq *seqList, struct blatType *bt)
 /* Apply the same per-seq filtering hgBlat does before submitting to the server. */
 {
 struct dnaSeq *seq;
 if (bt->isTx && !bt->isTxTx)
     {
     for (seq = seqList; seq != NULL; seq = seq->next)
         {
         seq->size = aaFilteredSize(seq->dna);
         aaFilter(seq->dna, seq->dna);
         toUpperN(seq->dna, seq->size);
         }
     }
 else
     {
     for (seq = seqList; seq != NULL; seq = seq->next)
         {
         seq->size = dnaFilteredSize(seq->dna);
         dnaFilter(seq->dna, seq->dna);
         toLowerN(seq->dna, seq->size);
         subChar(seq->dna, 'u', 't');
         }
     }
 if (seqList != NULL && seqList->name[0] == 0)
     {
     freeMem(seqList->name);
     seqList->name = cloneString("YourSeq");
     }
 }
 
 static void writePslOutput(struct psl *pslList, struct blatType *bt)
 /* Plain-text PSL writer, used by apiBlat() for format "text" or "psl".
  * Prints a text/plain content-type header, the PSL header for the
  * query/target types in bt, then at most maxItemsOutput hits via pslTabOut. */
 {
 hPrintDisable();
 puts("Content-Type:text/plain\n");
 pslxWriteHead(stdout, bt->qType, bt->tType);
 
 int written = 0;
 struct psl *hit = pslList;
 while (hit != NULL && written < maxItemsOutput)
     {
     pslTabOut(hit, stdout);
     hit = hit->next;
     ++written;
     }
 }
 
 static void writeLegacyJsonOutput(struct psl *pslList, char *db)
 /* Byte-for-byte the same JSON shape hgBlat?output=json emits: a top-level
  * object with "track":"blat", "genome", a "fields" header array, and "blat"
  * as an array of arrays (one row per PSL).
  * Selected by apiBlat() only when the format argument is "hgblat";
  * jsonOutputArrays does not route here, it is handled inside
  * writeJsonOutput().  Unlike the other two writers, this function applies
  * no maxItemsOutput cap of its own -- the whole list is handed to
  * pslWriteAllJson(). */
 {
 hPrintDisable();
 puts("Content-Type:text/plain\n");
 pslWriteAllJson(pslList, stdout, db, TRUE);
 }
 
 static void writePslAsObject(struct jsonWrite *jw, struct psl *psl)
 /* Write one PSL hit as an anonymous JSON object with named keys.
  *   jw  - JSON writer, positioned inside the enclosing list.
  *   psl - hit to serialize.
  * Scalar fields come first, in PSL column order, followed by the three
  * per-block arrays (blockSizes, qStarts, tStarts), each with
  * psl->blockCount integer entries. */
 {
 int b;
 jsonWriteObjectStart(jw, NULL);
 jsonWriteNumber(jw, "matches", psl->match);
 jsonWriteNumber(jw, "misMatches", psl->misMatch);
 jsonWriteNumber(jw, "repMatches", psl->repMatch);
 jsonWriteNumber(jw, "nCount", psl->nCount);
 jsonWriteNumber(jw, "qNumInsert", psl->qNumInsert);
 jsonWriteNumber(jw, "qBaseInsert", psl->qBaseInsert);
 jsonWriteNumber(jw, "tNumInsert", psl->tNumInsert);
 jsonWriteNumber(jw, "tBaseInsert", psl->tBaseInsert);
 /* Plain string write, the same call used for qName/tName here and for
  * strand in writePslAsArray(); no printf-style formatting is needed. */
 jsonWriteString(jw, "strand", psl->strand);
 jsonWriteString(jw, "qName", psl->qName);
 jsonWriteNumber(jw, "qSize", psl->qSize);
 jsonWriteNumber(jw, "qStart", psl->qStart);
 jsonWriteNumber(jw, "qEnd", psl->qEnd);
 jsonWriteString(jw, "tName", psl->tName);
 jsonWriteNumber(jw, "tSize", psl->tSize);
 jsonWriteNumber(jw, "tStart", psl->tStart);
 jsonWriteNumber(jw, "tEnd", psl->tEnd);
 jsonWriteNumber(jw, "blockCount", psl->blockCount);
 /* Per-block arrays, one integer per alignment block. */
 jsonWriteListStart(jw, "blockSizes");
 for (b = 0; b < psl->blockCount; ++b)
     jsonWriteNumber(jw, NULL, psl->blockSizes[b]);
 jsonWriteListEnd(jw);
 jsonWriteListStart(jw, "qStarts");
 for (b = 0; b < psl->blockCount; ++b)
     jsonWriteNumber(jw, NULL, psl->qStarts[b]);
 jsonWriteListEnd(jw);
 jsonWriteListStart(jw, "tStarts");
 for (b = 0; b < psl->blockCount; ++b)
     jsonWriteNumber(jw, NULL, psl->tStarts[b]);
 jsonWriteListEnd(jw);
 jsonWriteObjectEnd(jw);
 }
 
 static void writePslAsArray(struct jsonWrite *jw, struct psl *psl)
 /* Serialize one PSL hit as an anonymous JSON array (jsonOutputArrays mode).
  * Values are positional, in PSL column order; the last three entries are
  * nested integer arrays with one element per block, matching hubApi
  * conventions. */
 {
 int i;
 jsonWriteListStart(jw, NULL);
 
 /* match counts and insert statistics */
 jsonWriteNumber(jw, NULL, psl->match);
 jsonWriteNumber(jw, NULL, psl->misMatch);
 jsonWriteNumber(jw, NULL, psl->repMatch);
 jsonWriteNumber(jw, NULL, psl->nCount);
 jsonWriteNumber(jw, NULL, psl->qNumInsert);
 jsonWriteNumber(jw, NULL, psl->qBaseInsert);
 jsonWriteNumber(jw, NULL, psl->tNumInsert);
 jsonWriteNumber(jw, NULL, psl->tBaseInsert);
 
 /* strand, then query and target coordinates */
 jsonWriteString(jw, NULL, psl->strand);
 jsonWriteString(jw, NULL, psl->qName);
 jsonWriteNumber(jw, NULL, psl->qSize);
 jsonWriteNumber(jw, NULL, psl->qStart);
 jsonWriteNumber(jw, NULL, psl->qEnd);
 jsonWriteString(jw, NULL, psl->tName);
 jsonWriteNumber(jw, NULL, psl->tSize);
 jsonWriteNumber(jw, NULL, psl->tStart);
 jsonWriteNumber(jw, NULL, psl->tEnd);
 
 /* block count followed by the three per-block lists */
 jsonWriteNumber(jw, NULL, psl->blockCount);
 
 jsonWriteListStart(jw, NULL);
 i = 0;
 while (i < psl->blockCount)
     {
     jsonWriteNumber(jw, NULL, psl->blockSizes[i]);
     ++i;
     }
 jsonWriteListEnd(jw);
 
 jsonWriteListStart(jw, NULL);
 i = 0;
 while (i < psl->blockCount)
     {
     jsonWriteNumber(jw, NULL, psl->qStarts[i]);
     ++i;
     }
 jsonWriteListEnd(jw);
 
 jsonWriteListStart(jw, NULL);
 i = 0;
 while (i < psl->blockCount)
     {
     jsonWriteNumber(jw, NULL, psl->tStarts[i]);
     ++i;
     }
 jsonWriteListEnd(jw);
 
 jsonWriteListEnd(jw);
 }
 
 /* NULL-terminated column names emitted as the "fields" list in
  * jsonOutputArrays mode.  The order matches the order in which
  * writePslAsArray() writes its values, so the two must be kept in sync. */
 static char *pslFieldNames[] = {
     "matches", "misMatches", "repMatches", "nCount",
     "qNumInsert", "qBaseInsert", "tNumInsert", "tBaseInsert",
     "strand", "qName", "qSize", "qStart", "qEnd",
     "tName", "tSize", "tStart", "tEnd",
     "blockCount", "blockSizes", "qStarts", "tStarts",
     NULL
 };
 
 static void writeJsonOutput(struct psl *pslList, char *db, char *hubUrl,
     char *type, struct blatType *bt)
 /* JSON output path -- standard hubApi envelope plus a blat[] array.
  * When jsonOutputArrays is set, each hit is an array and a "fields" key
  * lists the column names (matching getData's jsonOutputArrays behaviour).
  * Otherwise each hit is a named-key object.
  *   pslList - hits to report; at most maxItemsOutput of them are written.
  *   db      - written as "genome".
  *   hubUrl  - written as "hubUrl" only when non-empty.
  *   type    - written as "type" only when non-empty.
  *   bt      - source of the "qType"/"tType" strings. */
 {
 struct jsonWrite *jw = apiStartOutput();
 jsonWriteString(jw, "genome", db);
 if (isNotEmpty(hubUrl))
     jsonWriteString(jw, "hubUrl", hubUrl);
 if (isNotEmpty(type))
     jsonWriteString(jw, "type", type);
 /* qType is one of "protein", "dnax", "dna"; tType is "dnax" or "dna". */
 jsonWriteString(jw, "qType",
     bt->qType == gftProt ? "protein" : (bt->qType == gftDnaX ? "dnax" : "dna"));
 jsonWriteString(jw, "tType",
     bt->tType == gftDnaX ? "dnax" : "dna");
 
 if (jsonOutputArrays)
     {
     /* Column header so array-mode rows can be decoded by position. */
     jsonWriteListStart(jw, "fields");
     char **fp;
     for (fp = pslFieldNames; *fp != NULL; ++fp)
         jsonWriteString(jw, NULL, *fp);
     jsonWriteListEnd(jw);
     }
 
 jsonWriteListStart(jw, "blat");
 struct psl *psl;
 long long count = 0;
 for (psl = pslList; psl != NULL && count < maxItemsOutput; psl = psl->next, ++count)
     {
     if (jsonOutputArrays)
         writePslAsArray(jw, psl);
     else
         writePslAsObject(jw, psl);
     }
 jsonWriteListEnd(jw);
 /* Publish the number of rows written through the itemsReturned global;
  * per the commit note, apiFinishOutput() prints "itemsReturned" from it. */
 itemsReturned = count;
-jsonWriteNumber(jw, "itemsReturned", (long long)count);
 apiFinishOutput(0, NULL, jw);
 }
 
 void apiBlat(char *words[MAX_PATH_INFO])
 /* '/blat' endpoint: run a BLAT alignment of userSeq against the requested
  * assembly's gfServer and return PSL hits as JSON.
  *   words - PATH_INFO tokens; words[1] is the query-type subcommand
  *           (dna, protein, transRna, transDna or guess).
  * Reads the CGI arguments named by argApiKey, argGenome, argUserSeq,
  * argFormat and argHubUrl.  Problems are reported through apiErrAbort:
  * err400 for bad requests, err403 for a missing apiKey, err500 for
  * bottleneck or BLAT server failures. */
 {
 char *extraArgs = verifyLegalArgs(argBlat);
 if (extraArgs)
     apiErrAbort(err400, err400Msg,
         "extraneous arguments found for function /blat '%s'", extraArgs);
 
 /* /blat is gated on an apiKey -- both for attribution and to anchor the
  * bot-bottleneck on something more stable than IP.  Known programmatic
  * clients (IGV, rtracklayer, etc.) identified via bottleneck.except IPs or
  * noCaptchaAgent. user-agent patterns in hg.conf are exempt.
  * For everyone else, an apiKey must be present; validity was already
  * checked in main(). */
 char *apiKey = cgiOptionalString(argApiKey);
 if (isEmpty(apiKey) && !botException() && !botExceptionUserAgent())
     apiErrAbort(err403, err403Msg,
         "/blat requires an '%s' URL parameter. "
         "Generate one under My Data > My Track Hubs > Hub Development: API Key, "
         "then add it to this API call as apiKey=xxxxx. "
         "Contact us if you need assistance.", argApiKey);
 
 /* Apply a per-BLAT bottleneck penalty on top of the global hubApi delay
  * already paid in main().  Bottleneck key is the apiKey (see botDelay.c),
  * so heavy users throttle themselves instead of starving everyone.
  * An hg.conf hubApi.blatDelayFraction value, when present, is parsed with
  * atof and replaces blatDelayFractionDefault.  The bottleneck call runs
  * under errCatch so a failure there becomes an err500 reply. */
 char *blatDelayStr = cfgOptionDefault("hubApi.blatDelayFraction", NULL);
 double blatDelayFraction = blatDelayStr ? atof(blatDelayStr) : blatDelayFractionDefault;
 int extraDelay = 0;
 struct errCatch *bnErrCatch = errCatchNew();
 if (errCatchStart(bnErrCatch))
     extraDelay = hgBotDelayTimeFrac(blatDelayFraction);
 errCatchEnd(bnErrCatch);
 if (bnErrCatch->gotError)
     apiErrAbort(err500, err500Msg, "bottleneck server unavailable: %s",
         bnErrCatch->message->string);
 errCatchFree(&bnErrCatch);
 /* presumably extraDelay is in milliseconds, going by sleep1000 -- confirm */
 if (extraDelay > 0)
     sleep1000(extraDelay);
 
 /* Query-type subcommand comes from the URL path, like /getData/track.
  * Apache rewrites /blat/<type>?... onto PATH_INFO=/blat/<type>, so
  * words[1] holds the user-supplied subcommand. */
 char *type = words[1];
 
 char *genome = cgiOptionalString(argGenome);
 char *userSeq = cgiOptionalString(argUserSeq);
 char *format = cgiOptionalString(argFormat);
 char *hubUrl = cgiOptionalString(argHubUrl);
 
 if (isEmpty(genome))
     apiErrAbort(err400, err400Msg,
         "/blat requires '%s=<assembly>'", argGenome);
 if (isEmpty(userSeq))
     apiErrAbort(err400, err400Msg,
         "/blat requires '%s=<sequence>' (FASTA or raw)", argUserSeq);
 
 /* Output volume is capped by maxItemsOutput, which is shared with the rest
  * of hubApi and is already initialized from the URL by hubApi.c, so nothing
  * needs to be set up here; the cap itself is applied in writePslOutput()
  * and writeJsonOutput(). */
 
 /* If a hubUrl is given, attach the hub so trackHubDatabase() recognizes it.
  * NOTE(review): the return value is discarded, so a hub that fails to open
  * is not reported at this point -- confirm that is intended. */
 if (isNotEmpty(hubUrl))
     {
     (void) errCatchTrackHubOpen(hubUrl);
     }
 
 /* Parse the user's sequence so we can autodetect type when needed.
  * The parser is given its own copy of userSeq. */
 struct dnaSeq *seqList = faSeqListFromMemTextRaw(cloneString(userSeq));
 if (seqList == NULL)
     apiErrAbort(err400, err400Msg, "no parseable sequence in '%s'", argUserSeq);
 
 struct blatType bt;
 parseTypeArg(type, seqList, &bt);
 filterSequences(seqList, &bt);
 
 struct blatServerParams *st = findBlatServer(genome, bt.isTx);
 if (st == NULL)
     apiErrAbort(err400, err400Msg,
         "no %s BLAT server configured for genome='%s'",
         bt.isTx ? "translated" : "DNA", genome);
 
 /* Run alignments into a temp pslx, then read it back to drive output.
  * Mirrors hgBlat's strategy so we benefit from the same gfOutputPsl path. */
 struct tempName pslTn;
 trashDirFile(&pslTn, "apiBlat", "apiBlat", ".pslx");
 
 /* Maximum number of sequences per request: 25 unless overridden by the
  * hgBlat.maxSequenceCount setting in hg.conf. */
 int maxSeqCount = 25;
 char *optionMaxSeqCount = cfgOptionDefault("hgBlat.maxSequenceCount", NULL);
 if (isNotEmpty(optionMaxSeqCount))
     maxSeqCount = atoi(optionMaxSeqCount);
 
 /* NOTE(review): f is assigned inside the errCatchStart block and read again
  * after errCatchEnd.  If errCatch is setjmp/longjmp based, a non-volatile
  * local modified in between has an indeterminate value after the jump --
  * confirm whether f should be declared volatile. */
 FILE *f = NULL;
 struct errCatch *ec = errCatchNew();
 if (errCatchStart(ec))
     {
     f = mustOpen(pslTn.forCgi, "w");
     struct gfOutput *gvo = gfOutputPsl(0, bt.qIsProt, FALSE, f, FALSE, TRUE);
+    /* For translated/protein queries the target strand needs to go into the
+     * PSL strand column too -- otherwise pslLoad sees only the query strand
+     * and downstream consumers compute the wrong coords on minus-strand hits.
+     * hgBlat sets this for the same reason (hgBlat.c near gfAlignTrans*). */
+    if (bt.isTx)
+        gvo->reportTargetStrand = TRUE;
     pslxWriteHead(f, bt.qType, bt.tType);
 
     struct gfConnection *conn = gfConnect(st->host, st->port,
         trackHubDatabaseToGenome(st->db), st->genomeDataDir);
     struct hash *tFileCache = gfFileCacheNew();
     int minMatch = 0;  /* let gfServer decide; matches hgBlat allResults path */
 
     /* Size limits: a single sequence may be at most 10000 long for
      * translated searches and 75000 otherwise; the running total over the
      * whole request may be at most 2.5 times that. */
     struct dnaSeq *seq;
     int singleMax = bt.isTx ? 10000 : 75000;
     int totalMax  = singleMax * 2.5;
     int total = 0;
     int seqCount = 0;
     for (seq = seqList; seq != NULL; seq = seq->next)
         {
         /* Stop once more than maxSeqCount sequences have been seen. */
         if (++seqCount > maxSeqCount)
             break;
         /* Empty or oversized sequences are skipped without any message. */
         if (seq->size <= 0 || seq->size > singleMax)
             continue;
         /* Stop, also without a message, once the total exceeds totalMax. */
         total += seq->size;
         if (total > totalMax)
             break;
         if (bt.isTx)
             {
             if (bt.isTxTx)
                 {
                 gfAlignTransTrans(conn, st->nibDir, seq, FALSE, 5, tFileCache, gvo,
                     !bt.txTxBoth);
                 /* txTxBoth (transDna): reverse-complement the query in
                  * place and search it a second time. */
                 if (bt.txTxBoth)
                     {
                     reverseComplement(seq->dna, seq->size);
                     gfAlignTransTrans(conn, st->nibDir, seq, TRUE, 5, tFileCache, gvo,
                         FALSE);
                     }
                 }
             else
                 gfAlignTrans(conn, st->nibDir, seq, 5, tFileCache, gvo);
             }
         else
             {
             /* Plain DNA: search the query as given, then reverse-complement
              * it in place and search again. */
             gfAlignStrand(conn, st->nibDir, seq, FALSE, minMatch, tFileCache, gvo);
             reverseComplement(seq->dna, seq->size);
             gfAlignStrand(conn, st->nibDir, seq, TRUE, minMatch, tFileCache, gvo);
             }
         gfOutputQuery(gvo, f);
         }
     carefulClose(&f);
     f = NULL;
     gfFileCacheFree(&tFileCache);
     gfDisconnect(&conn);
     }
 errCatchEnd(ec);
 /* On failure: close the temp file if it is still open, delete it, and
  * reply with err500 carrying the caught message. */
 if (ec->gotError)
     {
     if (f != NULL)
         carefulClose(&f);
     remove(pslTn.forCgi);
     apiErrAbort(err500, err500Msg, "BLAT server error: %s", ec->message->string);
     }
 errCatchFree(&ec);
 
 /* Read the hits back from the temp file; slAddHead followed by slReverse
  * leaves pslList in file order.
  * NOTE(review): on this success path the temp file is not removed (only the
  * error path above deletes it) -- confirm cleanup is handled elsewhere. */
 struct lineFile *lf = pslFileOpen(pslTn.forCgi);
 struct psl *pslList = NULL, *psl;
 while ((psl = pslNext(lf)) != NULL)
     slAddHead(&pslList, psl);
 lineFileClose(&lf);
 slReverse(&pslList);
 
 /* Pick the writer from the format argument: "text" or "psl" gives PSL text,
  * "hgblat" gives the hgBlat-compatible JSON, and anything else (including
  * no format at all) gives the standard hubApi JSON envelope. */
 if (sameWordOk(format, "text") || sameWordOk(format, "psl"))
     writePslOutput(pslList, &bt);
 else if (sameWordOk(format, "hgblat"))
     writeLegacyJsonOutput(pslList, st->db);
 else
     writeJsonOutput(pslList, st->db, hubUrl, type, &bt);
 }