1525618a814bcf6eb64c4147b99c520509a55d60 max Tue May 19 07:37:29 2026 -0700 Refresh mpraVarDB to 2026-05-19 snapshot; fix hg19 pos in name field. Upstream landed the Mouri/Tewhey pvalue correction (5,092 rows with pvalue > 1 -> 0) and replaced 47,156 placeholder fdr=1.0 values with NaN. Rebuild from the new CSV; remove the temporary "pending upstream fix" note in mpraVarDb.html. Also closes the pre-existing "hg19 pos in non-rs name field" issue: mpravardbToBed.py gained a post-liftOver step that rewrites the chr:pos prefix inside non-rs names with the hg38 coordinates. 47,160 names were rewritten in this build (e.g. chr1:1403972:C>CG at hg38 chr1:1468591 now reads chr1:1468592:C>CG). itemCount preserved at 239,028. refs #37359 Co-Authored-By: Claude Opus 4.7 (1M context) diff --git src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py index 984367f26cf..f8fb6f9aac9 100644 --- src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py +++ src/hg/makeDb/scripts/mpravardb/mpravardbToBed.py @@ -193,49 +193,76 @@ elif genome == "hg38": f38.write(line) hg38_count += 1 print(f"Wrote {hg19_count} hg19 rows to {hg19_bed}") print(f"Wrote {hg38_count} hg38 rows to {hg38_bed}") def run(cmd): """Run a shell command, exit on failure.""" print(f" Running: {cmd}") result = subprocess.run(cmd, shell=True) if result.returncode != 0: print(f"ERROR: command failed with exit code {result.returncode}", file=sys.stderr) sys.exit(1) +# Non-rs names are built in csv_to_bed() as "chr::>" +# from the raw CSV pos, which for hg19 rows is the hg19 position. liftOver +# updates the BED coordinates but leaves the name string alone, so the name +# carries a stale hg19 pos. Rewrite the chrom:pos prefix from the row's +# (already hg38-coordinate) col 1 + col 2 so the name agrees with the row's +# coordinates. +_NAME_PREFIX_RE = re.compile(r"^chr[^:]+:\d+:") + +def fix_lifted_names(lifted_bed): + fixed = 0 + tmp = lifted_bed + ".tmp" + with open(lifted_bed) as fin, open(tmp, "w") as fout: + for line in fin: + cols = line.rstrip("\n").split("\t") + chrom, start, name = cols[0], int(cols[1]), cols[3] + if _NAME_PREFIX_RE.match(name): + new_name = _NAME_PREFIX_RE.sub(f"{chrom}:{start+1}:", name) + if new_name != name: + cols[3] = new_name + fixed += 1 + fout.write("\t".join(cols) + "\n") + os.replace(tmp, lifted_bed) + print(f" Rewrote hg19 -> hg38 pos in {fixed} non-rs name fields") + def main(): hg19_bed = "mpravardb.hg19.bed" hg38_bed = "mpravardb.hg38.bed" lifted_bed = "mpravardb.hg19lifted.bed" unmapped_bed = "mpravardb.unmapped.bed" merged_bed = "mpravardb.bed" output_bb = "mpravardb.bb" print("Step 1: Converting CSV to BED format...") csv_to_bed(INPUT_CSV, hg19_bed, hg38_bed) print("\nStep 2: Lifting hg19 coordinates to hg38...") run(f"liftOver -bedPlus=9 -tab {hg19_bed} {LIFTOVER_CHAIN} {lifted_bed} {unmapped_bed}") # Count unmapped unmapped = sum(1 for line in open(unmapped_bed) if not line.startswith("#")) lifted = sum(1 for _ in open(lifted_bed)) print(f" Lifted: {lifted}, Unmapped: {unmapped}") + # Rewrite chr:pos inside non-rs name fields with the lifted hg38 pos. + fix_lifted_names(lifted_bed) + print("\nStep 3: Merging hg38 native + lifted hg19...") run(f"cat {hg38_bed} {lifted_bed} > {merged_bed}.tmp") run(f"bedSort {merged_bed}.tmp {merged_bed}") os.remove(f"{merged_bed}.tmp") total = sum(1 for _ in open(merged_bed)) print(f" Total merged rows: {total}") print("\nStep 4: Converting to bigBed...") run(f"bedToBigBed -type=bed9+ -tab -as={AS_FILE} {merged_bed} {CHROM_SIZES} {output_bb}") print(f"\nDone. Output: {output_bb}") print(f"File size: {os.path.getsize(output_bb) / 1024 / 1024:.1f} MB") if __name__ == "__main__":