4e986673a37800eacf1d36309b0cd38564a4bb1f
max
  Wed Mar 25 07:55:59 2026 -0700
PromoterAI track scripts, docs, and makeDoc; remove unused primateAiToBigBed.py, refs #37278

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/promoterAiToBigWig.py src/hg/makeDb/scripts/promoterAiToBigWig.py
new file mode 100644
index 00000000000..8b05e659916
--- /dev/null
+++ src/hg/makeDb/scripts/promoterAiToBigWig.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+"""
+Convert promoterAI_tss500.tsv.gz to 4 bedGraph files (one per alt allele)
+and a BED file for overlapping positions where transcripts disagree on score.
+
+For bigWig: picks the score with the largest absolute value per (chr, pos, alt).
+For overlap bigBed: outputs all transcript-level scores where they differ.
+
+Input positions are 1-based, output is 0-based.
+"""
+
+import gzip
+import sys
+import os
+from collections import defaultdict
+
+INFILE = "promoterAI_tss500.tsv.gz"
+
+def main():
+    print("Pass 1: Reading input and collecting scores per (chr, pos, alt)...", file=sys.stderr)
+
+    # For each (chr, pos, alt), store list of (score, gene, transcript_id)
+    variants = defaultdict(list)
+    count = 0
+    with gzip.open(INFILE, "rt") as fh:
+        fh.readline()  # skip header
+        for line in fh:
+            fields = line.rstrip("\n").split("\t")
+            chrom = fields[0]
+            pos = int(fields[1])   # 1-based
+            alt = fields[3]
+            gene = fields[4]
+            txId = fields[6]
+            score = float(fields[9])
+
+            key = (chrom, pos, alt)
+            variants[key].append((score, gene, txId))
+            count += 1
+            if count % 50000000 == 0:
+                print(f"  {count} rows read...", file=sys.stderr)
+
+    print(f"  {count} total rows, {len(variants)} unique (chr, pos, alt) combos.", file=sys.stderr)
+
+    # Pass 2: Write bedGraph files and overlap BED
+    print("Pass 2: Writing bedGraph and overlap BED files...", file=sys.stderr)
+
+    bgFiles = {}
+    for alt in "ACGT":
+        bgFiles[alt] = open(f"promoterAi_{alt}.bedGraph", "w")
+
+    overlapOut = open("promoterAi_overlaps.bed", "w")
+    overlapCount = 0
+    bgCount = 0
+
+    for (chrom, pos, alt), entries in sorted(variants.items()):
+        chromStart = pos - 1  # convert to 0-based
+        chromEnd = pos
+
+        # Pick score with largest absolute value for bigWig
+        bestScore = max(entries, key=lambda x: abs(x[0]))[0]
+        bgFiles[alt].write(f"{chrom}\t{chromStart}\t{chromEnd}\t{bestScore:.4f}\n")
+        bgCount += 1
+
+        # Check if scores differ across transcripts
+        uniqueScores = set(e[0] for e in entries)
+        if len(uniqueScores) > 1:
+            # Merge into one row: comma-separated transcripts and scores
+            gene = entries[0][1]  # use first gene name
+            txList = ",".join(e[2] for e in entries)
+            scoreList = ",".join(f"{e[0]:.4f}" for e in entries)
+            minScore = min(e[0] for e in entries)
+            maxScore = max(e[0] for e in entries)
+            # bed score from best (max abs) score
+            bedScore = int(round((bestScore + 1) / 2 * 1000))
+            bedScore = max(0, min(1000, bedScore))
+            # Color: red if best score positive, blue if negative
+            rgb = "200,0,0" if bestScore > 0 else "0,0,200"
+            scoreDiff = maxScore - minScore
+            mouseOver = f"{alt} {gene} score range {minScore:.4f} to {maxScore:.4f} (diff {scoreDiff:.4f})"
+            overlapOut.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{gene}\t{bedScore}\t"
+                             f"+\t{chromStart}\t{chromEnd}\t{rgb}\t"
+                             f"{alt}\t{txList}\t{scoreList}\t{scoreDiff:.4f}\t{mouseOver}\n")
+            overlapCount += 1
+
+    for f in bgFiles.values():
+        f.close()
+    overlapOut.close()
+
+    print(f"  {bgCount} positions written to 4 bedGraph files.", file=sys.stderr)
+    print(f"  {overlapCount} overlap entries written to promoterAi_overlaps.bed", file=sys.stderr)
+    print("Done. Now sort bedGraphs and convert to bigWig, sort BED and convert to bigBed.", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()