#!/usr/bin/env python3
"""
Convert a PrimateAI-3D score table (gzipped, tab-separated, one header line)
to a BED file for bigBed conversion.

usage: primateAiToBigBed.py <input.txt.gz> <output.bed>

Input positions are 1-based, output BED is 0-based.
Colors: red for pathogenic, blue for benign.
Strand is always written as '.': the source file has no strand column.
Rows normally have 12 fields; some rows in the hg19 source are missing the
refseq column and have 11 fields, in which case refSeq is written empty.
"""

import gzip
import sys
import os  # kept from the original script; not used by the code below

# Fewest tab-separated fields a data row may have (the row without refseq).
MIN_FIELDS = 11
# Field count of a row that does carry the refseq column.
FULL_FIELDS = 12
# How often (in variants) a progress message is printed to stderr.
PROGRESS_INTERVAL = 10000000

RGB_PATHOGENIC = "200,0,0"
RGB_OTHER = "0,0,200"


def _bedLineFromFields(fields):
    """Return one BED line (with trailing newline) for a split input row.

    fields: list of the row's tab-separated column values.

    The line has 9 standard BED fields followed by 8 extra fields:
    ref, alt, gene, refSeq, scorePai, percentile, prediction, mouseOver.

    Raises ValueError if the row has fewer than MIN_FIELDS columns or if the
    position, score or percentile column is not numeric.
    """
    if len(fields) < MIN_FIELDS:
        raise ValueError(
            f"expected at least {MIN_FIELDS} tab-separated fields, "
            f"got {len(fields)}")

    chrom = fields[0]
    pos = int(fields[1])  # 1-based
    ref = fields[2]
    alt = fields[3]
    gene = fields[4]
    # fields[5] is not used by this script
    refAa = fields[6]
    altAa = fields[7]
    scorePai = float(fields[8])
    percentile = float(fields[9])

    # hg19 source has some rows missing the refseq column
    # (11 fields instead of 12); prediction is then the 11th field.
    if len(fields) >= FULL_FIELDS:
        refSeq = fields[10]
        prediction = fields[11]
    else:
        refSeq = ""
        prediction = fields[10]

    chromStart = pos - 1  # convert to 0-based
    chromEnd = pos
    name = f"{refAa}>{altAa}"
    score = round(percentile * 1000)
    rgb = RGB_PATHOGENIC if prediction == "pathogenic" else RGB_OTHER
    mouseOver = (f"{ref}>{alt} {name} score={scorePai:.3f} "
                 f"pct={percentile:.3f} ({prediction})")

    columns = [
        chrom, str(chromStart), str(chromEnd), name, str(score),
        ".",  # the source file has no strand column
        str(chromStart), str(chromEnd), rgb,
        ref, alt, gene, refSeq, f"{scorePai:.3f}", f"{percentile:.3f}",
        prediction, mouseOver,
    ]
    return "\t".join(columns) + "\n"


def main():
    """Read the input table named in sys.argv[1], write BED to sys.argv[2].

    Exits with status 1 and a usage message unless exactly two arguments are
    given. The first input line is treated as a header and skipped; blank
    lines are skipped. Progress and a final count go to stderr.

    Raises ValueError, prefixed with the input path and line number, when a
    data row cannot be converted.
    """
    if len(sys.argv) != 3:
        sys.stderr.write("usage: primateAiToBigBed.py <input.txt.gz> <output.bed>\n")
        sys.exit(1)
    inPath, outPath = sys.argv[1], sys.argv[2]

    print(f"Reading {inPath}...", file=sys.stderr)
    count = 0
    with gzip.open(inPath, "rt") as fh, open(outPath, "w") as out:
        next(fh, None)  # skip header (tolerates a completely empty file)
        # Data starts on line 2 because line 1 is the header.
        for lineNum, line in enumerate(fh, start=2):
            line = line.rstrip("\n")
            if not line:
                # e.g. a trailing blank line; nothing to convert
                continue
            try:
                bedLine = _bedLineFromFields(line.split("\t"))
            except ValueError as err:
                # Add file/line context so a bad row can be located.
                raise ValueError(f"{inPath} line {lineNum}: {err}") from err
            out.write(bedLine)
            count += 1
            if count % PROGRESS_INTERVAL == 0:
                print(f" {count} variants processed...", file=sys.stderr)

    print(f" {count} variants written to {outPath}", file=sys.stderr)
    print("Done. Now sort and run bedToBigBed.", file=sys.stderr)


if __name__ == "__main__":
    main()