src/hg/makeDb/scripts/varFreqs/hrcToVcf.py aa61ebc800429515f9ced7e28f669c6042219f43

aa61ebc800429515f9ced7e28f669c6042219f43
max
  Wed Mar 18 09:09:13 2026 -0700
varFreqs supertrack: add GREGoR track, update all HTML docs, move scripts to varFreqs/, refs #36642

Add GREGoR R04 WGS track to varFreqs superTrack. Update Data Access and
Methods sections for all 20+ subtrack HTML files with consistent formatting,
sequencing methods from source papers, and links to makeDoc and Github scripts.
Move all varFreqs conversion scripts into scripts/varFreqs/ subdirectory and
update makeDoc paths accordingly.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/varFreqs/hrcToVcf.py src/hg/makeDb/scripts/varFreqs/hrcToVcf.py
new file mode 100755
index 00000000000..0fb4fe3c98d
--- /dev/null
+++ src/hg/makeDb/scripts/varFreqs/hrcToVcf.py
@@ -0,0 +1,126 @@
+#!/usr/bin/env python3
+"""
+Convert HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz to VCF format,
+lifting coordinates from hg19 (GRCh37) to hg38 using UCSC liftOver.
+"""
+
+import gzip
+import subprocess
+import sys
+import tempfile
+import os
+
+INFILE = "HRC.r1-1.GRCh37.wgs.mac5.sites.tab.gz"
+OUTFILE = "hrc.vcf"
+CHAIN = "/gbdb/hg19/liftOver/hg19ToHg38.over.chain.gz"
+CHROMSIZES = "/hive/data/genomes/hg38/chrom.sizes"
+LIFTOVER = "liftOver"
+MIN_MATCH = "0.95"
+
+def main():
+    inPath = INFILE
+    outPath = OUTFILE
+
+    # Step 1: Read input, write BED for liftOver, and store original rows
+    print("Reading input and creating BED for liftOver...", file=sys.stderr)
+    rows = []  # list of (chrom, pos, id, ref, alt, ac, an, af, ac_ex, an_ex, af_ex, aa)
+    bedTmp = tempfile.NamedTemporaryFile(mode="w", suffix=".bed", delete=False)
+    with gzip.open(inPath, "rt") as fh:
+        for line in fh:
+            if line.startswith("#"):
+                continue
+            fields = line.rstrip("\n").split("\t")
+            chrom, pos = fields[0], int(fields[1])
+            chromStr = "chr" + chrom
+            # BED is 0-based half-open; VCF POS is 1-based
+            bedTmp.write(f"{chromStr}\t{pos - 1}\t{pos}\t{len(rows)}\n")
+            rows.append(fields)
+    bedTmp.close()
+    print(f"  {len(rows)} variants read.", file=sys.stderr)
+
+    # Step 2: Run liftOver
+    liftedBed = bedTmp.name + ".lifted"
+    unmappedBed = bedTmp.name + ".unmapped"
+    print("Running liftOver...", file=sys.stderr)
+    subprocess.check_call([
+        LIFTOVER, "-minMatch=" + MIN_MATCH,
+        bedTmp.name, CHAIN, liftedBed, unmappedBed
+    ])
+
+    # Count unmapped
+    unmappedCount = 0
+    with open(unmappedBed) as fh:
+        for line in fh:
+            if not line.startswith("#"):
+                unmappedCount += 1
+    print(f"  {unmappedCount} variants unmapped.", file=sys.stderr)
+
+    # Step 3: Read lifted coordinates. Key = original row index -> (chrom, pos)
+    lifted = {}
+    with open(liftedBed) as fh:
+        for line in fh:
+            fields = line.rstrip("\n").split("\t")
+            chrom = fields[0]
+            pos = int(fields[2])  # 1-based end = our VCF POS
+            idx = int(fields[3])
+            lifted[idx] = (chrom, pos)
+    print(f"  {len(lifted)} variants lifted to hg38.", file=sys.stderr)
+
+    # Step 4: Write VCF using only the "excluding 1000 Genomes" counts,
+    # since the varFreqs supertrack already includes 1000 Genomes data separately.
+    # Skip variants with AN_EXCLUDING_1000G == 0 (only seen in 1000G samples).
+    print("Writing VCF...", file=sys.stderr)
+    skipped1kg = 0
+    with open(outPath, "w") as out:
+        out.write("##fileformat=VCFv4.1\n")
+        out.write("##source=HRC.r1-1\n")
+        out.write("##liftOver=hg19ToHg38\n")
+        with open(CHROMSIZES) as cs:
+            for cline in cs:
+                chrom_name, chrom_len = cline.rstrip().split("\t")
+                out.write(f"##contig=<ID={chrom_name},length={chrom_len}>\n")
+        out.write('##INFO=<ID=AC,Number=A,Type=Integer,Description="Allele count (excluding 1000 Genomes samples)">\n')
+        out.write('##INFO=<ID=AN,Number=1,Type=Integer,Description="Total allele number (excluding 1000 Genomes samples)">\n')
+        out.write('##INFO=<ID=AF,Number=A,Type=Float,Description="Allele frequency (excluding 1000 Genomes samples)">\n')
+        out.write('##INFO=<ID=AA,Number=1,Type=String,Description="Ancestral allele">\n')
+        out.write("#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n")
+
+        written = 0
+        for idx in range(len(rows)):
+            if idx not in lifted:
+                continue
+            chrom, pos = lifted[idx]
+            fields = rows[idx]
+            ac = fields[8]   # AC_EXCLUDING_1000G
+            an = fields[9]   # AN_EXCLUDING_1000G
+            af = fields[10]  # AF_EXCLUDING_1000G
+
+            # Skip variants only present in 1000 Genomes samples
+            if an == "0" or (ac == "0" and af == "0"):
+                skipped1kg += 1
+                continue
+
+            rsId = fields[2] if fields[2] != "." else "."
+            ref = fields[3]
+            alt = fields[4]
+            aa = fields[11] if len(fields) > 11 else "."
+
+            info = f"AC={ac};AN={an};AF={af}"
+            if aa and aa != ".":
+                info += f";AA={aa}"
+
+            out.write(f"{chrom}\t{pos}\t{rsId}\t{ref}\t{alt}\t.\tPASS\t{info}\n")
+            written += 1
+
+    print(f"  {skipped1kg} variants skipped (only in 1000 Genomes).", file=sys.stderr)
+    print(f"  {written} variants written to {outPath}", file=sys.stderr)
+
+    # Cleanup temp files
+    os.unlink(bedTmp.name)
+    os.unlink(liftedBed)
+    os.unlink(unmappedBed)
+
+    print("Done.", file=sys.stderr)
+
+if __name__ == "__main__":
+    main()