99db113e49bd1dd42a3040eaed1e61188a117ecf max Wed May 13 05:44:18 2026 -0700 mucins: new locus-specific mucin VNTR/non-VNTR exon track on hs1 (alpha) Adds a new 'Locus-specific' container in the genes group with two bigGenePred subtracks (mucinsVntr, mucinsNonVntr) derived from Plender et al. 2024 supplementary table 2. The .ra and HTML files live under trackDb/human/ so future locus-specific tracks can share them; hs1 picks them up via include ../mucins.ra alpha. refs #37540 diff --git src/hg/makeDb/scripts/mucins/mucinsToBigGenePred.py src/hg/makeDb/scripts/mucins/mucinsToBigGenePred.py new file mode 100755 index 00000000000..dd9a7bf49fe --- /dev/null +++ src/hg/makeDb/scripts/mucins/mucinsToBigGenePred.py @@ -0,0 +1,106 @@ +#!/usr/bin/env python3 +"""Convert the locus-specific mucin exon table to two bigGenePred bed files, +one for VNTR exons and one for non-VNTR exons. + +Input columns (tab-separated, CRLF line endings, one header row): + gene transcript exon chr start stop size strand VNTR_status + +Each exon becomes a single-block bigGenePred entry. Coordinates in the +source file are 0-based half-open. +""" +import argparse +import sys +from pathlib import Path + +VNTR_COLOR = "213,94,0" # Okabe-Ito vermillion +NON_VNTR_COLOR = "0,114,178" # Okabe-Ito blue + + +def convert(in_path: Path, out_vntr: Path, out_non_vntr: Path, + chrom_sizes: Path) -> None: + sizes = {} + with open(chrom_sizes) as fh: + for line in fh: + parts = line.rstrip().split("\t") + if len(parts) >= 2: + sizes[parts[0]] = int(parts[1]) + + n_vntr = 0 + n_non_vntr = 0 + skipped = 0 + with open(in_path, newline="") as fh, \ + open(out_vntr, "w") as fv, \ + open(out_non_vntr, "w") as fn: + header = fh.readline() # discard + for raw in fh: + row = raw.rstrip("\r\n").split("\t") + if len(row) < 9: + skipped += 1 + continue + gene, transcript, exon, chrom, start, stop, size, strand, status = row[:9] + start_i = int(start) + stop_i = int(stop) + chrom_size = sizes.get(chrom) + if chrom_size is None or stop_i > chrom_size: + sys.stderr.write( + f"WARN: {gene} {transcript} {exon} {chrom}:{start_i}-{stop_i} " + f"exceeds chrom size {chrom_size}; skipping\n") + skipped += 1 + continue + if stop_i - start_i != int(size): + sys.stderr.write( + f"WARN: size mismatch {gene} {exon}: " + f"{stop_i-start_i} vs {size}\n") + + is_vntr = status == "VNTR_exon" + color = VNTR_COLOR if is_vntr else NON_VNTR_COLOR + length = stop_i - start_i + name = f"{gene}_{exon}" + fields = [ + chrom, + str(start_i), + str(stop_i), + name, + "0", # score + strand, + str(start_i), # thickStart + str(stop_i), # thickEnd + color, + "1", # blockCount + f"{length},", # blockSizes + "0,", # chromStarts + gene, # name2 + "none", # cdsStartStat + "none", # cdsEndStat + "-1,", # exonFrames + status, # type + transcript, # geneName + gene, # geneName2 + "mucin", # geneType + ] + out_line = "\t".join(fields) + "\n" + if is_vntr: + fv.write(out_line) + n_vntr += 1 + else: + fn.write(out_line) + n_non_vntr += 1 + + sys.stderr.write( + f"Wrote {n_vntr} VNTR exons and {n_non_vntr} non-VNTR exons " + f"({skipped} skipped)\n") + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("input") + ap.add_argument("--out-vntr", required=True) + ap.add_argument("--out-non-vntr", required=True) + ap.add_argument("--chrom-sizes", required=True) + args = ap.parse_args() + convert(Path(args.input), Path(args.out_vntr), + Path(args.out_non_vntr), Path(args.chrom_sizes)) + + +if __name__ == "__main__": + main()