4f8f8773bec66a9e993e9897e0b032c6e97dead8 max Fri May 15 10:12:29 2026 -0700 mei: add HMEID, SweGen, and euL1db subtracks Three new MEI catalogues under the existing mei superTrack: meiHmeid (hg38) 36,699 MELT MEIs from HMEID v1.1 (NyuWa+1KGP, 5,675 individuals, Niu et al. 2022, PMID 35212372). Site-level VCF; per-cohort and per-1KGP super- population AC/AN/AF; SVTYPE Alu/L1/SVA/HERVK. meiSwegen (hg38 lifted) 18,090 MELT MEIs from the SweGen 1,000-sample Swedish cohort (Ameur 2017, PMID 28832569; Gardner 2017, PMID 28855259). Built on hg19, liftOver to hg38 (10 unmapped). tableBrowser off per SweGen distribution terms. meiEul1db (hg19+hg38) 8,988 curated L1-HS insertion polymorphisms (MRIPs) from euL1db v1.00 (Mir 2015, PMID 25352549), aggregating 142,495 sample-level SRIPs across 32 published studies. Coloured by lineage (germline/somatic/mixed). Built on hg19, liftOver to hg38 (3 unmapped). Helman2014 used numeric chrom names (23=X, 24=Y) which are renamed during the build. meiEul1dbRef (hg19+hg38) 1,540 reference-genome L1-HS copies catalogued by euL1db (companion to meiEul1db). Single shared mei.ra (in human/) uses $D substitution so each stanza serves both assemblies where applicable. refs #37524 diff --git src/hg/makeDb/scripts/mei/meiHmeidVcfToBed.py src/hg/makeDb/scripts/mei/meiHmeidVcfToBed.py new file mode 100644 index 00000000000..2680531f2e4 --- /dev/null +++ src/hg/makeDb/scripts/mei/meiHmeidVcfToBed.py @@ -0,0 +1,188 @@ +#!/usr/bin/env python3 +"""Convert the HMEID v1.1 site-level MEI VCF to a bed9+ file. + +Input: MEI.GRCh38.HMEIDv1.1.vcf.gz from the HMEID database +(http://bigdata.ibp.ac.cn/HMEID/download/), which contains 36,699 +non-reference Mobile Element Insertions called by MELT v2.1.5 on +5,675 short-read whole genomes: 2,998 Chinese samples from the NyuWa +cohort (~26.2x coverage) and 2,677 samples from the 1000 Genomes +Project (~7.4x coverage). + +The VCF is site-level (no FORMAT / per-sample columns); INFO carries +per-cohort and per-1KGP-super-population AC/AN/AF as well as the MELT +fields SVTYPE (ALU/LINE1/SVA/HERVK), SVLEN, TSD and ASSESS. + +Insertions are drawn as a 1 bp anchor block, matching the lrSv / HGSVC3 +/ DeepMEI MEI tracks: + chromStart = POS - 1 (1-based VCF anchor -> 0-based) + chromEnd = chromStart + 1 +""" + +import gzip +import sys + + +COLOR_BY_CLASS = { + "Alu": "0,114,178", # blue (Okabe-Ito) + "L1": "213,94,0", # vermillion + "SVA": "0,158,115", # bluish green + "HERVK": "204,121,167", # reddish purple +} + +SVTYPE_TO_CLASS = { + "ALU": "Alu", + "LINE1": "L1", + "SVA": "SVA", + "HERVK": "HERVK", +} + + +def parseInfo(infoField): + fields = {} + for tok in infoField.split(";"): + if not tok: + continue + if "=" in tok: + k, v = tok.split("=", 1) + fields[k] = v + else: + fields[tok] = True + return fields + + +def getInt(info, key, default=0): + v = info.get(key) + if v is None or v == "" or v == ".": + return default + return int(v) + + +def getFloat(info, key, default=0.0): + v = info.get(key) + if v is None or v == "" or v == ".": + return default + return float(v) + + +def main(): + if len(sys.argv) != 4: + sys.exit("usage: meiHmeidVcfToBed.py input.vcf.gz chrom.sizes output.bed") + + inFn, sizesFn, outFn = sys.argv[1:] + + chromSizes = {} + with open(sizesFn) as f: + for line in f: + chrom, size = line.rstrip("\n").split("\t") + chromSizes[chrom] = int(size) + + nIn = 0 + nOut = 0 + nSkippedChrom = 0 + nSkippedBoundary = 0 + nSkippedSvType = 0 + classCounts = {} + + opener = gzip.open if inFn.endswith(".gz") else open + out = open(outFn, "w") + + with opener(inFn, "rt") as f: + for line in f: + if line.startswith("#"): + continue + + nIn += 1 + row = line.rstrip("\n").split("\t") + chrom = row[0] + pos = int(row[1]) + info = parseInfo(row[7]) + + svType = info.get("SVTYPE", "") + cls = SVTYPE_TO_CLASS.get(svType) + if cls is None: + nSkippedSvType += 1 + sys.stderr.write(f"WARN: skipping unknown SVTYPE {svType} at {chrom}:{pos}\n") + continue + + chromStart = pos - 1 + chromEnd = chromStart + 1 + + if chrom not in chromSizes: + nSkippedChrom += 1 + sys.stderr.write(f"WARN: skipping {chrom}:{pos}, chrom not in chrom.sizes\n") + continue + if chromEnd > chromSizes[chrom]: + nSkippedBoundary += 1 + sys.stderr.write( + f"WARN: skipping {chrom}:{pos}, end {chromEnd} > chromSize {chromSizes[chrom]}\n") + continue + + altAC = getInt(info, "AC") + AN = getInt(info, "AN") + altAF = getFloat(info, "AF") + + svLen = getInt(info, "SVLEN", -1) + tsd = info.get("TSD", ".") + if tsd in ("null", "", "NULL"): + tsd = "." + assess = getInt(info, "ASSESS") + + score = max(0, min(1000, int(round(altAF * 1000)))) + color = COLOR_BY_CLASS[cls] + name = f"{cls}-{altAC}" + classCounts[cls] = classCounts.get(cls, 0) + 1 + + fields = [ + chrom, + str(chromStart), + str(chromEnd), + name, + str(score), + ".", + str(chromStart), + str(chromEnd), + color, + cls, + str(svLen), + tsd, + str(assess), + str(altAC), + str(AN), + f"{altAF:.6f}", + str(getInt(info, "Nyuwa_AC")), + str(getInt(info, "Nyuwa_AN")), + f"{getFloat(info, 'Nyuwa_AF'):.6f}", + str(getInt(info, "1KGP_AC")), + str(getInt(info, "1KGP_AN")), + f"{getFloat(info, '1KGP_AF'):.6f}", + str(getInt(info, "AFR_AC")), + str(getInt(info, "AFR_AN")), + f"{getFloat(info, 'AFR_AF'):.6f}", + str(getInt(info, "AMR_AC")), + str(getInt(info, "AMR_AN")), + f"{getFloat(info, 'AMR_AF'):.6f}", + str(getInt(info, "EAS_AC")), + str(getInt(info, "EAS_AN")), + f"{getFloat(info, 'EAS_AF'):.6f}", + str(getInt(info, "EUR_AC")), + str(getInt(info, "EUR_AN")), + f"{getFloat(info, 'EUR_AF'):.6f}", + str(getInt(info, "SAS_AC")), + str(getInt(info, "SAS_AN")), + f"{getFloat(info, 'SAS_AF'):.6f}", + ] + out.write("\t".join(fields) + "\n") + nOut += 1 + + out.close() + + classStr = ", ".join(f"{k} {v}" for k, v in sorted(classCounts.items())) + sys.stderr.write( + f"Read {nIn} records, wrote {nOut}, " + f"skipped {nSkippedSvType} (unknown SVTYPE) + {nSkippedChrom} (chrom missing) + " + f"{nSkippedBoundary} (off chrom end)\n" + f"Class distribution: {classStr}.\n") + + +if __name__ == "__main__": + main()