# Provenance: commit 395b531075423d60f7e9abe9edd0810b1a6fc0fc (max, Tue Jun 17 08:24:04 2025 -0700)
#   "finishing up mutscore and mcap tracks, refs #35806, #35922"
#   file: src/hg/makeDb/scripts/mcap/mcapToBw.py
"""Convert the M-CAP v1.4 score table into one bigWig file per alternate base.

The input is a tab-separated table with (at least) the columns
``grch37_chrom``, ``pos``, ``alt`` and ``mcapv1.4``.  For each alternate
nucleotide (A, C, G, T) the matching rows are written as a bedGraph file
and converted with the external ``bedGraphToBigWig`` program, which must
be on ``PATH``.
"""
import os
import subprocess

import pandas as pd

# Inputs
tsv_file = "mcap_v1_4.txt.gz"
chrom_sizes_file = "hg19.chrom.sizes"  # You must download this file from UCSC if not available

# Output
output_dir = "bw"

# Alternate nucleotides that each get their own bedGraph/bigWig pair.
ALT_BASES = ("A", "C", "G", "T")

# Name of the score column in the input table.
SCORE_COLUMN = "mcapv1.4"


def load_scores(path) -> pd.DataFrame:
    """Read the tab-separated score table and clean up its column names.

    ``comment=None`` is deliberate: the column names may carry a leading
    '#', and treating '#' as a comment character would discard that header
    line.  The '#' is removed from the column names after loading instead.

    Args:
        path: File path (may be gzip-compressed) or file-like object
            accepted by ``pandas.read_csv``.

    Returns:
        The table with whitespace and leading '#' stripped from column names.
    """
    df = pd.read_csv(path, sep="\t", comment=None)
    df.columns = df.columns.str.strip().str.lstrip("#")
    return df


def add_bed_coordinates(df: pd.DataFrame) -> pd.DataFrame:
    """Add ``chrom``, ``start`` and ``end`` columns for bedGraph output.

    The frame is modified in place (and returned for convenience) to avoid
    copying a potentially large table.

    Args:
        df: Table with ``grch37_chrom`` and ``pos`` columns.

    Returns:
        The same frame with the three new columns added.
    """
    # Chromosome values can be a mix of numbers and strings; compare as text.
    df["grch37_chrom"] = df["grch37_chrom"].astype(str)
    # Add the "chr" prefix only where it is missing.
    df["chrom"] = df["grch37_chrom"].apply(
        lambda name: name if name.startswith("chr") else f"chr{name}"
    )
    # bedGraph format: zero-based start, one-based end.
    df["start"] = df["pos"] - 1
    df["end"] = df["pos"]
    return df


def bedgraph_rows(df: pd.DataFrame, alt_base: str) -> pd.DataFrame:
    """Return the bedGraph columns for one alternate base, sorted for conversion.

    bedGraphToBigWig requires its input sorted by chromosome name and then
    by start position, so the rows are sorted here rather than relying on
    the order of the input table.

    Args:
        df: Table prepared by ``add_bed_coordinates`` that also has ``alt``
            and score columns.
        alt_base: Alternate nucleotide to select.

    Returns:
        A frame with columns chrom, start, end and the score, possibly empty.
    """
    columns = ["chrom", "start", "end", SCORE_COLUMN]
    rows = df.loc[df["alt"] == alt_base, columns]
    return rows.sort_values(["chrom", "start"])


def bedgraph_to_bigwig(bedgraph_file: str, chrom_sizes: str, bigwig_file: str) -> None:
    """Run the external bedGraphToBigWig converter.

    Raises:
        subprocess.CalledProcessError: If the converter exits with a
            non-zero status.
        FileNotFoundError: If the converter executable cannot be found.
    """
    subprocess.run(
        ["bedGraphToBigWig", bedgraph_file, chrom_sizes, bigwig_file],
        check=True,
    )


def main(tsv_path=tsv_file, chrom_sizes_path=chrom_sizes_file, out_dir=output_dir) -> None:
    """Write one bedGraph and one bigWig per alternate base into ``out_dir``.

    Args:
        tsv_path: Input score table.
        chrom_sizes_path: Chromosome sizes file passed to bedGraphToBigWig.
        out_dir: Output directory; created if it does not exist.
    """
    os.makedirs(out_dir, exist_ok=True)
    df = add_bed_coordinates(load_scores(tsv_path))

    for alt_base in ALT_BASES:
        rows = bedgraph_rows(df, alt_base)
        if rows.empty:
            # No variants with this alternate base: nothing to write.
            continue

        bedgraph_file = os.path.join(out_dir, f"{alt_base}.bedGraph")
        bigwig_file = os.path.join(out_dir, f"{alt_base}.bw")

        rows.to_csv(bedgraph_file, sep="\t", header=False, index=False)
        bedgraph_to_bigwig(bedgraph_file, chrom_sizes_path, bigwig_file)

        print(f"Created: {bigwig_file}")


if __name__ == "__main__":
    main()