#!/usr/bin/env python3
"""Convert euL1db ReferenceL1HS.txt to a hg19 BED9+ for the meiEul1dbRef track.

These are L1-HS copies already present in the human reference genome —
not insertion polymorphisms.
"""

# Provenance (commit that introduced this script):
#
#   4f8f8773bec66a9e993e9897e0b032c6e97dead8 max Fri May 15 10:12:29 2026 -0700
#
#   mei: add HMEID, SweGen, and euL1db subtracks
#
#   Three new MEI catalogues under the existing mei superTrack:
#
#   meiHmeid (hg38)
#     36,699 MELT MEIs from HMEID v1.1 (NyuWa+1KGP, 5,675 individuals,
#     Niu et al. 2022, PMID 35212372). Site-level VCF; per-cohort and
#     per-1KGP super-population AC/AN/AF; SVTYPE Alu/L1/SVA/HERVK.
#
#   meiSwegen (hg38 lifted)
#     18,090 MELT MEIs from the SweGen 1,000-sample Swedish cohort
#     (Ameur 2017, PMID 28832569; Gardner 2017, PMID 28855259). Built on
#     hg19, liftOver to hg38 (10 unmapped). tableBrowser off per SweGen
#     distribution terms.
#
#   meiEul1db (hg19+hg38)
#     8,988 curated L1-HS insertion polymorphisms (MRIPs) from euL1db
#     v1.00 (Mir 2015, PMID 25352549), aggregating 142,495 sample-level
#     SRIPs across 32 published studies. Coloured by lineage
#     (germline/somatic/mixed). Built on hg19, liftOver to hg38
#     (3 unmapped). Helman2014 used numeric chrom names (23=X, 24=Y)
#     which are renamed during the build.
#
#   meiEul1dbRef (hg19+hg38)
#     1,540 reference-genome L1-HS copies catalogued by euL1db (companion
#     to meiEul1db).
#
#   Single shared mei.ra (in human/) uses $D substitution so each stanza
#   serves both assemblies where applicable.
#
#   refs #37524
#
#   Added as src/hg/makeDb/scripts/mei/meiEul1dbRefToBed.py
#   (new file mode 100755, index 00000000000..7646daecb11).

import argparse
import os
import sys
from collections import defaultdict

# Okabe-Ito colors by L1HS subgroup
COLORS = {
    "L1HS-Ta": "0,114,178",       # blue
    "L1HS-PreTa": "230,159,0",    # orange
    "L1HS-undef": "153,153,153",  # grey
}

# Colour used for any subgroup that is not a key of COLORS.
DEFAULT_RGB = "153,153,153"

# Number of leading columns consumed from each ReferenceL1HS.txt row.
N_REQUIRED_COLS = 9


def open_tab(path):
    """Yield the tab-split fields of each data line in *path*.

    Whitespace-only lines and lines starting with ``#`` are skipped. Only the
    trailing newline is stripped, so empty trailing columns are preserved.
    """
    with open(path) as fh:
        for line in fh:
            if not line.strip() or line.startswith("#"):
                continue
            yield line.rstrip("\n").split("\t")


def load_chrom_sizes(path):
    """Read a tab-separated chrom.sizes file into a ``{chrom: size}`` dict.

    Only the first two columns of each line are used. Whitespace-only lines
    are ignored (a trailing empty line previously caused an unpack error).

    Raises:
        ValueError: if a non-blank line has fewer than two tab-separated
            columns or its size column is not an integer.
    """
    sizes = {}
    with open(path) as fh:
        for lineno, line in enumerate(fh, start=1):
            if not line.strip():
                continue
            fields = line.rstrip().split("\t")
            if len(fields) < 2:
                raise ValueError(
                    f"{path}:{lineno}: expected '<chrom>\\t<size>', got {line!r}"
                )
            try:
                sizes[fields[0]] = int(fields[1])
            except ValueError as err:
                raise ValueError(
                    f"{path}:{lineno}: size is not an integer: {fields[1]!r}"
                ) from err
    return sizes


def _normalize_chrom(name):
    """Return *name* with a ``chr`` prefix, adding it only when missing."""
    return name if name.startswith("chr") else "chr" + name


def _ref_span(ref_start, ref_stop):
    """Parse the two reference-coordinate columns as integers.

    ``"."`` and the empty string mean "missing" and map to 0. If either value
    is present but not an integer, both are reported as 0.
    """
    try:
        rs = int(ref_start) if ref_start not in (".", "") else 0
        re_ = int(ref_stop) if ref_stop not in (".", "") else 0
    except ValueError:
        return 0, 0
    return rs, re_


def main(argv=None):
    """Convert ``ReferenceL1HS.txt`` in ``--src`` to a BED9+6 file.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``, as before.

    Each input row supplies nine columns: chrom, 1-based start, stop, family,
    strand, ref start, ref stop, integrity, subgroup. Rows are skipped, and
    counted in the stderr summary, when they have too few columns, name a
    chromosome absent from the chrom.sizes file, have non-integer start/stop,
    or end beyond the chromosome.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--src", default="/hive/data/genomes/hg38/bed/mei/eul1db",
                    help="Directory containing euL1db .txt tables")
    ap.add_argument("--chrom-sizes", default="/hive/data/genomes/hg19/chrom.sizes")
    ap.add_argument("-o", "--out", required=True, help="Output BED9+ file")
    args = ap.parse_args(argv)

    sizes = load_chrom_sizes(args.chrom_sizes)

    n_in = 0
    n_out = 0
    n_skip_short = 0
    n_skip_chrom = 0
    n_skip_coord = 0
    n_skip_range = 0
    dropped_chroms = defaultdict(int)
    with open(args.out, "w") as out:
        for cols in open_tab(os.path.join(args.src, "ReferenceL1HS.txt")):
            n_in += 1
            if len(cols) < N_REQUIRED_COLS:
                n_skip_short += 1
                continue
            (chrom, start_s, stop_s, family, strand, ref_start, ref_stop,
             integrity, sub_group) = cols[:N_REQUIRED_COLS]

            chrom = _normalize_chrom(chrom)
            if chrom not in sizes:
                dropped_chroms[chrom] += 1
                n_skip_chrom += 1
                continue

            try:
                start_1 = int(start_s)
                stop_1 = int(stop_s)
            except ValueError:
                n_skip_coord += 1
                continue

            # Input start is 1-based; BED start is 0-based, end is exclusive.
            bed_start = max(0, start_1 - 1)
            bed_end = stop_1
            if bed_end <= bed_start:
                # Keep degenerate/inverted intervals as a 1-bp feature.
                bed_end = bed_start + 1
            if bed_end > sizes[chrom]:
                n_skip_range += 1
                continue

            if strand not in ("+", "-"):
                strand = "."
            rgb = COLORS.get(sub_group, DEFAULT_RGB)
            rs, re_ = _ref_span(ref_start, ref_stop)
            elem_len = bed_end - bed_start

            out.write("\t".join([
                chrom,
                str(bed_start),
                str(bed_end),
                sub_group if sub_group else "L1HS",   # name
                "0",                                  # score
                strand,
                str(bed_start),                       # thickStart
                str(bed_end),                         # thickEnd
                rgb,
                family if family else "L1HS",
                sub_group if sub_group else "",
                integrity if integrity else "",
                str(rs),
                str(re_),
                str(elem_len),
            ]) + "\n")
            n_out += 1

    print(f"Reference L1HS read:    {n_in}", file=sys.stderr)
    print(f"Reference L1HS written: {n_out}", file=sys.stderr)
    print(f"Skipped (fewer than {N_REQUIRED_COLS} columns): {n_skip_short}",
          file=sys.stderr)
    print(f"Skipped (chrom not in hg19 chrom.sizes): {n_skip_chrom}", file=sys.stderr)
    if dropped_chroms:
        for c, n in sorted(dropped_chroms.items()):
            print(f"    {c}: {n}", file=sys.stderr)
    print(f"Skipped (non-integer start/stop): {n_skip_coord}", file=sys.stderr)
    print(f"Skipped (end beyond chrom): {n_skip_range}", file=sys.stderr)


if __name__ == "__main__":
    main()