f9a89b0e1ce3c937b4fbb879736c1619c35c271f
lrnassar
  Tue Apr 21 12:11:02 2026 -0700
QA fixes for PromoterAI track. refs #37278

Description page: replaced the wrong reference (Gao et al. 2023, the PrimateAI-3D
paper) with the actual PromoterAI citation (Jaganathan et al. Science 2025, PMID
40440429), corrected the score-direction wording (negative = under-expression,
positive = over-expression, not "tolerated vs disruptive"), fixed the Data Access
source link (Illumina BaseSpace, not the GitHub repo), and corrected the mouseover
blurb to match mouseOverFunction noAverage behavior.

Converter and AS: the overlap bigBed now carries the real per-transcript strand
from the source TSV (was hardcoded '+'), with a new strands column in the AS, and
the name field concatenates unique gene symbols so bidirectional-promoter items
read as "HES4,ISG15" etc. BED score is now |PromoterAI|*1000 so scoreFilter is
meaningful. Rewrote the converter to stream (sorted input), which drops peak
memory from ~40 GB to a few MB.

trackDb: added filterLabel/filterLimits on scoreDiff (the filter was unusable
without labels), scoreFilter + scoreLabel, alwaysZero and autoScale off on the
bigWig subtracks, color 200,0,0 / altColor 0,0,200 so signed bigWig bars draw
red (over-expression) above zero and blue (under-expression) below, matching
the overlap track itemRgb. Added maxWindowToDraw and maxItems on the overlap
subtrack.

Makedoc updated to describe the streaming pipeline, the new strands column,
and the rebuild workflow.

diff --git src/hg/makeDb/scripts/promoterAiToBigWig.py src/hg/makeDb/scripts/promoterAiToBigWig.py
index 8b05e659916..4dbb445d02c 100644
--- src/hg/makeDb/scripts/promoterAiToBigWig.py
+++ src/hg/makeDb/scripts/promoterAiToBigWig.py
@@ -1,94 +1,112 @@
 #!/usr/bin/env python3
 """
 Convert promoterAI_tss500.tsv.gz to 4 bedGraph files (one per alt allele)
 and a BED file for overlapping positions where transcripts disagree on score.
 
 For bigWig: picks the score with the largest absolute value per (chr, pos, alt).
 For overlap bigBed: outputs all transcript-level scores where they differ.
 
+Input is assumed sorted by (chrom, pos) (and rows for the same (chrom, pos) are
+therefore consecutive). Processing is streaming: memory use is proportional to
+the number of transcripts at a single position, not the whole file.
+
 Input positions are 1-based, output is 0-based.
+Input strand is encoded as "1" (plus) or "-1" (minus).
 """
 
 import gzip
 import sys
-import os
 from collections import defaultdict
 
 INFILE = "promoterAI_tss500.tsv.gz"
 
+def strandStr(s):
+    return "+" if s == "1" else "-" if s == "-1" else "."
+
+def flushPosition(chrom, pos, byAlt, bgFiles, overlapOut, counters):
+    chromStart = pos - 1
+    chromEnd = pos
+    for alt, entries in byAlt.items():
+        bestScore = max(entries, key=lambda x: abs(x[0]))[0]
+        bgFiles[alt].write(f"{chrom}\t{chromStart}\t{chromEnd}\t{bestScore:.4f}\n")
+        counters["bg"] += 1
+
+        uniqueScores = set(e[0] for e in entries)
+        if len(uniqueScores) > 1:
+            # Unique gene names in order of first appearance
+            genes = []
+            for e in entries:
+                if e[1] not in genes:
+                    genes.append(e[1])
+            geneStr = ",".join(genes)
+
+            txList = ",".join(e[2] for e in entries)
+            scoreList = ",".join(f"{e[0]:.4f}" for e in entries)
+            strandList = ",".join(e[3] for e in entries)
+
+            strandSet = set(e[3] for e in entries)
+            bedStrand = strandSet.pop() if len(strandSet) == 1 else "."
+
+            minScore = min(e[0] for e in entries)
+            maxScore = max(e[0] for e in entries)
+            scoreDiff = maxScore - minScore
+            bedScore = int(round(abs(bestScore) * 1000))
+            bedScore = max(0, min(1000, bedScore))
+            rgb = "200,0,0" if bestScore > 0 else "0,0,200"
+            mouseOver = (f"{alt} {geneStr} score range {minScore:.4f} to "
+                         f"{maxScore:.4f} (diff {scoreDiff:.4f})")
+            overlapOut.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{geneStr}\t{bedScore}\t"
+                             f"{bedStrand}\t{chromStart}\t{chromEnd}\t{rgb}\t"
+                             f"{alt}\t{txList}\t{scoreList}\t{strandList}\t"
+                             f"{scoreDiff:.4f}\t{mouseOver}\n")
+            counters["overlap"] += 1
+
 def main():
-    print("Pass 1: Reading input and collecting scores per (chr, pos, alt)...", file=sys.stderr)
+    print("Streaming input, writing bedGraph and overlap BED files...", file=sys.stderr)
+
+    bgFiles = {alt: open(f"promoterAi_{alt}.bedGraph", "w") for alt in "ACGT"}
+    overlapOut = open("promoterAi_overlaps.bed", "w")
+    counters = {"bg": 0, "overlap": 0}
 
-    # For each (chr, pos, alt), store list of (score, gene, transcript_id)
-    variants = defaultdict(list)
+    prevChrom = None
+    prevPos = None
+    byAlt = defaultdict(list)
     count = 0
+
     with gzip.open(INFILE, "rt") as fh:
         fh.readline()  # skip header
         for line in fh:
             fields = line.rstrip("\n").split("\t")
             chrom = fields[0]
-            pos = int(fields[1])   # 1-based
+            pos = int(fields[1])
             alt = fields[3]
             gene = fields[4]
             txId = fields[6]
+            strand = strandStr(fields[7])
             score = float(fields[9])
 
-            key = (chrom, pos, alt)
-            variants[key].append((score, gene, txId))
+            if chrom != prevChrom or pos != prevPos:
+                if prevChrom is not None:
+                    flushPosition(prevChrom, prevPos, byAlt, bgFiles, overlapOut, counters)
+                byAlt = defaultdict(list)
+                prevChrom, prevPos = chrom, pos
+
+            byAlt[alt].append((score, gene, txId, strand))
             count += 1
             if count % 50000000 == 0:
                 print(f"  {count} rows read...", file=sys.stderr)
 
-    print(f"  {count} total rows, {len(variants)} unique (chr, pos, alt) combos.", file=sys.stderr)
-
-    # Pass 2: Write bedGraph files and overlap BED
-    print("Pass 2: Writing bedGraph and overlap BED files...", file=sys.stderr)
-
-    bgFiles = {}
-    for alt in "ACGT":
-        bgFiles[alt] = open(f"promoterAi_{alt}.bedGraph", "w")
-
-    overlapOut = open("promoterAi_overlaps.bed", "w")
-    overlapCount = 0
-    bgCount = 0
-
-    for (chrom, pos, alt), entries in sorted(variants.items()):
-        chromStart = pos - 1  # convert to 0-based
-        chromEnd = pos
-
-        # Pick score with largest absolute value for bigWig
-        bestScore = max(entries, key=lambda x: abs(x[0]))[0]
-        bgFiles[alt].write(f"{chrom}\t{chromStart}\t{chromEnd}\t{bestScore:.4f}\n")
-        bgCount += 1
-
-        # Check if scores differ across transcripts
-        uniqueScores = set(e[0] for e in entries)
-        if len(uniqueScores) > 1:
-            # Merge into one row: comma-separated transcripts and scores
-            gene = entries[0][1]  # use first gene name
-            txList = ",".join(e[2] for e in entries)
-            scoreList = ",".join(f"{e[0]:.4f}" for e in entries)
-            minScore = min(e[0] for e in entries)
-            maxScore = max(e[0] for e in entries)
-            # bed score from best (max abs) score
-            bedScore = int(round((bestScore + 1) / 2 * 1000))
-            bedScore = max(0, min(1000, bedScore))
-            # Color: red if best score positive, blue if negative
-            rgb = "200,0,0" if bestScore > 0 else "0,0,200"
-            scoreDiff = maxScore - minScore
-            mouseOver = f"{alt} {gene} score range {minScore:.4f} to {maxScore:.4f} (diff {scoreDiff:.4f})"
-            overlapOut.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{gene}\t{bedScore}\t"
-                             f"+\t{chromStart}\t{chromEnd}\t{rgb}\t"
-                             f"{alt}\t{txList}\t{scoreList}\t{scoreDiff:.4f}\t{mouseOver}\n")
-            overlapCount += 1
+    if prevChrom is not None:
+        flushPosition(prevChrom, prevPos, byAlt, bgFiles, overlapOut, counters)
 
     for f in bgFiles.values():
         f.close()
     overlapOut.close()
 
-    print(f"  {bgCount} positions written to 4 bedGraph files.", file=sys.stderr)
-    print(f"  {overlapCount} overlap entries written to promoterAi_overlaps.bed", file=sys.stderr)
-    print("Done. Now sort bedGraphs and convert to bigWig, sort BED and convert to bigBed.", file=sys.stderr)
+    print(f"  {count} total rows read.", file=sys.stderr)
+    print(f"  {counters['bg']} positions written to 4 bedGraph files.", file=sys.stderr)
+    print(f"  {counters['overlap']} overlap entries written to promoterAi_overlaps.bed",
+          file=sys.stderr)
 
 if __name__ == "__main__":
     main()