f9a89b0e1ce3c937b4fbb879736c1619c35c271f lrnassar Tue Apr 21 12:11:02 2026 -0700 QA fixes for PromoterAI track. refs #37278 Description page: replaced the wrong reference (Gao et al. 2023, the PrimateAI-3D paper) with the actual PromoterAI citation (Jaganathan et al. Science 2025, PMID 40440429), corrected the score-direction wording (negative = under-expression, positive = over-expression, not "tolerated vs disruptive"), fixed the Data Access source link (Illumina BaseSpace, not the GitHub repo), and corrected the mouseover blurb to match mouseOverFunction noAverage behavior. Converter and AS: the overlap bigBed now carries the real per-transcript strand from the source TSV (was hardcoded '+'), with a new strands column in the AS, and the name field concatenates unique gene symbols so bidirectional-promoter items read as "HES4,ISG15" etc. BED score is now |PromoterAI|*1000 so scoreFilter is meaningful. Rewrote the converter to stream (sorted input), which drops peak memory from ~40 GB to a few MB. trackDb: added filterLabel/filterLimits on scoreDiff (the filter was unusable without labels), scoreFilter + scoreLabel, alwaysZero and autoScale off on the bigWig subtracks, color 200,0,0 / altColor 0,0,200 so signed bigWig bars draw red (over-expression) above zero and blue (under-expression) below, matching the overlap track itemRgb. Added maxWindowToDraw and maxItems on the overlap subtrack. Makedoc updated to describe the streaming pipeline, the new strands column, and the rebuild workflow. diff --git src/hg/makeDb/scripts/promoterAiToBigWig.py src/hg/makeDb/scripts/promoterAiToBigWig.py index 8b05e659916..4dbb445d02c 100644 --- src/hg/makeDb/scripts/promoterAiToBigWig.py +++ src/hg/makeDb/scripts/promoterAiToBigWig.py @@ -1,94 +1,112 @@ #!/usr/bin/env python3 """ Convert promoterAI_tss500.tsv.gz to 4 bedGraph files (one per alt allele) and a BED file for overlapping positions where transcripts disagree on score. For bigWig: picks the score with the largest absolute value per (chr, pos, alt). For overlap bigBed: outputs all transcript-level scores where they differ. +Input is assumed sorted by (chrom, pos) (and rows for the same (chrom, pos) are +therefore consecutive). Processing is streaming: memory use is proportional to +the number of transcripts at a single position, not the whole file. + Input positions are 1-based, output is 0-based. +Input strand is encoded as "1" (plus) or "-1" (minus). """ import gzip import sys -import os from collections import defaultdict INFILE = "promoterAI_tss500.tsv.gz" +def strandStr(s): + return "+" if s == "1" else "-" if s == "-1" else "." + +def flushPosition(chrom, pos, byAlt, bgFiles, overlapOut, counters): + chromStart = pos - 1 + chromEnd = pos + for alt, entries in byAlt.items(): + bestScore = max(entries, key=lambda x: abs(x[0]))[0] + bgFiles[alt].write(f"{chrom}\t{chromStart}\t{chromEnd}\t{bestScore:.4f}\n") + counters["bg"] += 1 + + uniqueScores = set(e[0] for e in entries) + if len(uniqueScores) > 1: + # Unique gene names in order of first appearance + genes = [] + for e in entries: + if e[1] not in genes: + genes.append(e[1]) + geneStr = ",".join(genes) + + txList = ",".join(e[2] for e in entries) + scoreList = ",".join(f"{e[0]:.4f}" for e in entries) + strandList = ",".join(e[3] for e in entries) + + strandSet = set(e[3] for e in entries) + bedStrand = strandSet.pop() if len(strandSet) == 1 else "." + + minScore = min(e[0] for e in entries) + maxScore = max(e[0] for e in entries) + scoreDiff = maxScore - minScore + bedScore = int(round(abs(bestScore) * 1000)) + bedScore = max(0, min(1000, bedScore)) + rgb = "200,0,0" if bestScore > 0 else "0,0,200" + mouseOver = (f"{alt} {geneStr} score range {minScore:.4f} to " + f"{maxScore:.4f} (diff {scoreDiff:.4f})") + overlapOut.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{geneStr}\t{bedScore}\t" + f"{bedStrand}\t{chromStart}\t{chromEnd}\t{rgb}\t" + f"{alt}\t{txList}\t{scoreList}\t{strandList}\t" + f"{scoreDiff:.4f}\t{mouseOver}\n") + counters["overlap"] += 1 + def main(): - print("Pass 1: Reading input and collecting scores per (chr, pos, alt)...", file=sys.stderr) + print("Streaming input, writing bedGraph and overlap BED files...", file=sys.stderr) + + bgFiles = {alt: open(f"promoterAi_{alt}.bedGraph", "w") for alt in "ACGT"} + overlapOut = open("promoterAi_overlaps.bed", "w") + counters = {"bg": 0, "overlap": 0} - # For each (chr, pos, alt), store list of (score, gene, transcript_id) - variants = defaultdict(list) + prevChrom = None + prevPos = None + byAlt = defaultdict(list) count = 0 + with gzip.open(INFILE, "rt") as fh: fh.readline() # skip header for line in fh: fields = line.rstrip("\n").split("\t") chrom = fields[0] - pos = int(fields[1]) # 1-based + pos = int(fields[1]) alt = fields[3] gene = fields[4] txId = fields[6] + strand = strandStr(fields[7]) score = float(fields[9]) - key = (chrom, pos, alt) - variants[key].append((score, gene, txId)) + if chrom != prevChrom or pos != prevPos: + if prevChrom is not None: + flushPosition(prevChrom, prevPos, byAlt, bgFiles, overlapOut, counters) + byAlt = defaultdict(list) + prevChrom, prevPos = chrom, pos + + byAlt[alt].append((score, gene, txId, strand)) count += 1 if count % 50000000 == 0: print(f" {count} rows read...", file=sys.stderr) - print(f" {count} total rows, {len(variants)} unique (chr, pos, alt) combos.", file=sys.stderr) - - # Pass 2: Write bedGraph files and overlap BED - print("Pass 2: Writing bedGraph and overlap BED files...", file=sys.stderr) - - bgFiles = {} - for alt in "ACGT": - bgFiles[alt] = open(f"promoterAi_{alt}.bedGraph", "w") - - overlapOut = open("promoterAi_overlaps.bed", "w") - overlapCount = 0 - bgCount = 0 - - for (chrom, pos, alt), entries in sorted(variants.items()): - chromStart = pos - 1 # convert to 0-based - chromEnd = pos - - # Pick score with largest absolute value for bigWig - bestScore = max(entries, key=lambda x: abs(x[0]))[0] - bgFiles[alt].write(f"{chrom}\t{chromStart}\t{chromEnd}\t{bestScore:.4f}\n") - bgCount += 1 - - # Check if scores differ across transcripts - uniqueScores = set(e[0] for e in entries) - if len(uniqueScores) > 1: - # Merge into one row: comma-separated transcripts and scores - gene = entries[0][1] # use first gene name - txList = ",".join(e[2] for e in entries) - scoreList = ",".join(f"{e[0]:.4f}" for e in entries) - minScore = min(e[0] for e in entries) - maxScore = max(e[0] for e in entries) - # bed score from best (max abs) score - bedScore = int(round((bestScore + 1) / 2 * 1000)) - bedScore = max(0, min(1000, bedScore)) - # Color: red if best score positive, blue if negative - rgb = "200,0,0" if bestScore > 0 else "0,0,200" - scoreDiff = maxScore - minScore - mouseOver = f"{alt} {gene} score range {minScore:.4f} to {maxScore:.4f} (diff {scoreDiff:.4f})" - overlapOut.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{gene}\t{bedScore}\t" - f"+\t{chromStart}\t{chromEnd}\t{rgb}\t" - f"{alt}\t{txList}\t{scoreList}\t{scoreDiff:.4f}\t{mouseOver}\n") - overlapCount += 1 + if prevChrom is not None: + flushPosition(prevChrom, prevPos, byAlt, bgFiles, overlapOut, counters) for f in bgFiles.values(): f.close() overlapOut.close() - print(f" {bgCount} positions written to 4 bedGraph files.", file=sys.stderr) - print(f" {overlapCount} overlap entries written to promoterAi_overlaps.bed", file=sys.stderr) - print("Done. Now sort bedGraphs and convert to bigWig, sort BED and convert to bigBed.", file=sys.stderr) + print(f" {count} total rows read.", file=sys.stderr) + print(f" {counters['bg']} positions written to 4 bedGraph files.", file=sys.stderr) + print(f" {counters['overlap']} overlap entries written to promoterAi_overlaps.bed", + file=sys.stderr) if __name__ == "__main__": main()