de2ccf6d827865f11d3c8edd9ceeb1b6394a7380 lrnassar Tue Apr 21 18:22:59 2026 -0700 PrimateAI-3D: label items by nucleotide change, add aaChange field and HTML mouseover. Variant analysts typically work at the nucleotide level, and the current item label (amino acid change) collapses distinguishable variants: ~17% of items share their (chrom, pos, AA-change) tuple with another item because of codon degeneracy (e.g. the three substitutions C>A, C>G, and C>T at the same position can all appear as "M>I"). Labeling by nucleotide change makes every item uniquely distinguishable (0.0% collisions on hg38, 0.1% on hg19 from overlapping transcripts). - primateAi.as: field 4 (name) is now "Nucleotide change (e.g. T>C)"; new field aaChange (placed before ref/alt) holds the amino acid change. - primateAiToBigBed.py: write name = "{ref}>{alt}", new aaChange column, and an HTML mouseover with terse labels (Var/AA/Score/Perc/Pred) and a colored prediction string. - primateAi.ra: add labelFields name,aaChange and defaultLabelFields name so users can toggle the on-feature label between nt change (default) and AA change. - primateAi.html: expand Display Conventions with the label-convention rationale and a legend for each mouseover field. refs #37274 diff --git src/hg/makeDb/scripts/primateai/primateAiToBigBed.py src/hg/makeDb/scripts/primateai/primateAiToBigBed.py index 7e7e15d3229..bac429bff64 100644 --- src/hg/makeDb/scripts/primateai/primateAiToBigBed.py +++ src/hg/makeDb/scripts/primateai/primateAiToBigBed.py @@ -1,61 +1,68 @@ #!/usr/bin/env python3 """ Convert PrimateAI-3D.hg38.txt.gz to a BED file for bigBed conversion. Input positions are 1-based, output BED is 0-based. Colors: red for pathogenic, blue for benign. 
""" import gzip import sys import os def main(): if len(sys.argv) != 3: sys.stderr.write("usage: primateAiToBigBed.py \n") sys.exit(1) inPath, outPath = sys.argv[1], sys.argv[2] print(f"Reading {inPath}...", file=sys.stderr) count = 0 with gzip.open(inPath, "rt") as fh, open(outPath, "w") as out: header = fh.readline() # skip header for line in fh: fields = line.rstrip("\n").split("\t") chrom = fields[0] pos = int(fields[1]) # 1-based ref = fields[2] alt = fields[3] gene = fields[4] refAa = fields[6] altAa = fields[7] scorePai = float(fields[8]) percentile = float(fields[9]) # hg19 source has some rows missing the refseq column (11 fields instead of 12) if len(fields) >= 12: refSeq = fields[10] prediction = fields[11] else: refSeq = "" prediction = fields[10] chromStart = pos - 1 # convert to 0-based chromEnd = pos - name = f"{refAa}>{altAa}" + name = f"{ref}>{alt}" + aaChange = f"{refAa}>{altAa}" score = int(round(percentile * 1000)) rgb = "200,0,0" if prediction == "pathogenic" else "0,0,200" + color = "#c80000" if prediction == "pathogenic" else "#0000c8" - mouseOver = f"{ref}>{alt} {name} score={scorePai:.3f} pct={percentile:.3f} ({prediction})" + mouseOver = (f"Var: {ref}>{alt}
" + f"AA: {aaChange}
" + f"Score: {scorePai:.3f}
" + f"Perc: {percentile:.3f}
" + f"Pred: {prediction}") out.write(f"{chrom}\t{chromStart}\t{chromEnd}\t{name}\t{score}\t" f".\t{chromStart}\t{chromEnd}\t{rgb}\t" - f"{ref}\t{alt}\t{gene}\t{refSeq}\t{scorePai:.3f}\t{percentile:.3f}\t" + f"{aaChange}\t{ref}\t{alt}\t{gene}\t{refSeq}\t" + f"{scorePai:.3f}\t{percentile:.3f}\t" f"{prediction}\t{mouseOver}\n") count += 1 if count % 10000000 == 0: print(f" {count} variants processed...", file=sys.stderr) print(f" {count} variants written to {outPath}", file=sys.stderr) print("Done. Now sort and run bedToBigBed.", file=sys.stderr) if __name__ == "__main__": main()