#!/usr/bin/env python3
"""
This script processes HGMD data for hg38/hg19 and year.
It performs the following:

1. Converts HGMD TSV files to BED format with variant classifications
2. Creates BigBed files
3. Creates symlinks and database links for the BigBed files
4. Extracts transcript IDs from HGMD data
5. Filters curated gene predictions from ncbiRefSeq
6. Loads gene predictions into the database
"""

import argparse
import subprocess
import sys
import os
import glob

def run_command(cmd, description):
    """
    Run a shell command and handle errors

    Args:
        cmd (str): Shell command to execute
        description (str): Human-readable description of the command

    Returns:
        bool: True if command succeeded, False otherwise
    """
    result = subprocess.run(
        cmd,
        shell=True,
        executable='/bin/bash',
        capture_output=True,
        text=True
    )

    if result.returncode != 0:
        print("Error: {} failed".format(description), file=sys.stderr)
        print("stderr: {}".format(result.stderr), file=sys.stderr)
        return False

    return True

def check_input_file_exists(year, db):
    """
    Check if the HGMD input file exists

    Args:
        year (str): Year of HGMD data file
        db (str): Database/genome build (hg38 or hg19)

    Returns:
        bool: True if file exists, False otherwise
    """
    input_file = "/hive/data/outside/hgmd/{}.4-hgmd-public_{}.tsv".format(year, db)
    if not os.path.isfile(input_file):
        print("Error: Input file {} does not exist".format(input_file), file=sys.stderr)
        return False
    return True

def find_latest_ncbirefseq_dir(db, year):
    """
    Find the latest ncbiRefSeq directory for the given year

    Searches for directories matching the pattern:
    - hg38: /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.{year}-*
    - hg19: /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.{year}-*

    If no directories match the given year, tries previous years (up to 5 years back)

    Args:
        db (str): Database/genome build (hg38 or hg19)
        year (str): Year to search for

    Returns:
        str: Path to the latest matching directory, or None if not found
    """
    # Determine ncbiRefSeq version based on database
    # hg19 uses p13, hg38 uses p14
    version = "p13" if db == "hg19" else "p14"

    base_path = "/hive/data/genomes/{}/bed".format(db)

    # Try the provided year first, then fall back to previous years
    current_year = int(year)
    max_attempts = 5  # Try up to 5 years back

    for attempt in range(max_attempts):
        pattern = os.path.join(base_path, "ncbiRefSeq.{}.{}-*".format(version, current_year))
        matching_dirs = glob.glob(pattern)

        if matching_dirs:
            # Found matching directories; lexicographic sort works because the
            # suffix is an ISO date (YYYY-MM-DD), so the last entry is newest
            latest_dir = sorted(matching_dirs)[-1]
            if current_year != int(year):
                print("Note: Using {} directory (year {} not found)".format(
                    os.path.basename(latest_dir), year))
            return latest_dir

        # No match, try previous year
        current_year -= 1

    # No directories found after trying multiple years
    print("Error: No directories matching pattern ncbiRefSeq.{}.* found (tried years {} to {})".format(
        version, year, int(year) - max_attempts + 1), file=sys.stderr)
    return None

def process_database(year, db, working_dir):
    """
    Process HGMD data for a single database

    This function performs the following steps:
    1. Reads HGMD TSV file and creates BED file with variant annotations
    2. Converts BED to BigBed format
    3. Creates symlink in /gbdb/{db}/bbi/
    4. Runs hgBbiDbLink to register the BigBed in the database

    Args:
        year (str): Year of HGMD data file
        db (str): Database/genome build (hg38 or hg19)
        working_dir (str): Working directory path

    Returns:
        bool: True if all steps succeeded, False otherwise
    """

    # ========================================================================
    # Step 1: Process TSV file and create BED file
    # ========================================================================
    # Read HGMD TSV, filter comments, classify variants, and sort by position
    # Variant types: I=insertion, D=deletion, M=substitution, X=indel, R=regulatory, S=splicing
    bed_file = "hgmd.bed"
    process_cmd = """cat /hive/data/outside/hgmd/{}.4-hgmd-public_{}.tsv | \
grep -v \\# | \
tawk '{{if ($5=="I") {{start=$4-1; end=$4+1; col="100,100,100"}} else if ($5=="D") {{start=$4-1; end=$4; col="170,170,170"}} else {{start=$4-1; end=$4; col="0,0,0"}}; print "chr"$3,start,end,$2":"$1,0,".",start,end,col,$2,$1,$5}}' | \
sed -e 's/M$/substitution/' | \
sed -e 's/I$/insertion (between the two basepairs, sequence not provided by HGMD)/' | \
sed -e 's/D$/deletion (endpoint not provided by HGMD)/' | \
sed -e 's/X$/insertion-deletion (endpoint not provided by HGMD)/' | \
sed -e 's/R$/regulatory variant/' | \
sed -e 's/S$/splicing variant/' | \
sort -k1,1 -k2,2n > {}""".format(year, db, bed_file)

    if not run_command(process_cmd, "Processing TSV and creating BED file for {}".format(db)):
        return False

    # ========================================================================
    # Step 2: Convert BED to BigBed
    # ========================================================================
    bb_file = "hgmd.bb"
    bigbed_cmd = """bedToBigBed {} /hive/data/genomes/{}/chrom.sizes {} \
-type=bed9+ \
-as=/hive/data/genomes/{}/bed/hgmd/hgmd.as \
-tab""".format(bed_file, db, bb_file, db)

    if not run_command(bigbed_cmd, "Converting BED to BigBed for {}".format(db)):
        return False

    # ========================================================================
    # Step 3: Create symlink
    # ========================================================================
    # Create symlink from /gbdb/{db}/bbi/hgmd.bb to the actual BigBed file
    symlink_dir = "/gbdb/{}/bbi".format(db)
    symlink_path = os.path.join(symlink_dir, "hgmd.bb")
    bb_path = os.path.join(working_dir, bb_file)

    # Create symlink directory if it doesn't exist
    os.makedirs(symlink_dir, exist_ok=True)

    # Remove existing symlink if it exists (for updates); islink() also
    # catches broken symlinks, which exists() would miss
    if os.path.islink(symlink_path) or os.path.exists(symlink_path):
        os.remove(symlink_path)

    # Create new symlink
    symlink_cmd = "ln -s {} {}".format(bb_path, symlink_path)
    if not run_command(symlink_cmd, "Creating symlink for {}".format(db)):
        return False

    # ========================================================================
    # Step 4: Run hgBbiDbLink
    # ========================================================================
    hgbbidblink_cmd = "hgBbiDbLink {} hgmd {}".format(db, symlink_path)
    if not run_command(hgbbidblink_cmd, "Running hgBbiDbLink for {}".format(db)):
        return False

    # Output summary
    bed_path = os.path.join(working_dir, bed_file)
    print("✓ {} BigBed completed successfully!".format(db))
    print("Output files: {}, {}".format(bed_path, bb_path))
    print("Symlink created: {}".format(symlink_path))
    print("hgBbiDbLink run: hgBbiDbLink {} hgmd {}".format(db, symlink_path))

    return True

def process_transcripts(year, db, ncbirefseq_source_dir, output_dir):
    """
    Extract HGMD transcripts and filter gene predictions

    NOTE: Always uses hg38 HGMD file for transcript extraction because
    only the hg38 file contains transcript IDs (column 7). The hg19 file
    only has 6 columns without transcript information.

    This function performs the following steps:
    1. Creates output directory matching the ncbiRefSeq date
    2. Extracts unique transcript IDs from HGMD hg38 data
    3. Filters gene predictions to only include HGMD transcripts
    4. Loads filtered gene predictions into the database

    Args:
        year (str): Year of HGMD data file
        db (str): Database/genome build (hg38 or hg19)
        ncbirefseq_source_dir (str): Path to source ncbiRefSeq directory
        output_dir (str): Base output directory path

    Returns:
        bool: True if all steps succeeded, False otherwise
    """

    # ========================================================================
    # Extract date and create output directory
    # ========================================================================
    # Extract the date suffix and version from the ncbirefseq source directory
    # Example: /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.2025-08-13
    #          -> version="p14", date_suffix="2025-08-13"
    dir_basename = os.path.basename(ncbirefseq_source_dir)

    # Determine version (p13 or p14)
    if dir_basename.startswith("ncbiRefSeq.p13."):
        version = "p13"
        date_suffix = dir_basename.replace("ncbiRefSeq.p13.", "")
    else:
        version = "p14"
        date_suffix = dir_basename.replace("ncbiRefSeq.p14.", "")

    # Create output directory with same version and date pattern
    # Example: /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/
    transcript_output_dir = os.path.join(output_dir, "ncbiRefSeq.{}.{}".format(version, date_suffix))
    os.makedirs(transcript_output_dir, exist_ok=True)

    # Change to output directory for file creation
    os.chdir(transcript_output_dir)

    # ========================================================================
    # Step 1: Extract transcript IDs from HGMD
    # ========================================================================
    # IMPORTANT: Always use hg38 file because it has transcript IDs in column 7
    # The hg19 file only has 6 columns without transcript information
    #
    # hg38 file format (7 columns):
    # CM1720767	A1CF	10	50814012	M	GRCh38	NM_001198819.2
    #
    # hg19 file format (6 columns):
    # CM1720767	A1CF	10	52573772	M	GRCh37
    #
    # Extract column 7 (transcript IDs), remove version numbers (.1, .2, etc),
    # get unique IDs, and add back the dot for matching.
    #
    # BUGFIX: comment lines and rows with an empty column 7 must be dropped
    # (grep -v "^#" and awk 'NF'). Previously an empty field survived
    # "sort -u" and the final awk emitted a lone "." pattern, which with
    # "grep -F -f" matches EVERY line of the gene-pred file, defeating the
    # filter entirely.
    transcript_file = "hgmdTranscripts.txt"
    extract_cmd = """grep -v "^#" /hive/data/outside/hgmd/{}.4-hgmd-public_hg38.tsv | \
cut -f7 | \
cut -d. -f1 | \
awk 'NF' | \
sort -u | \
awk '{{print $1"."}}' > {}""".format(year, transcript_file)

    if not run_command(extract_cmd, "Extracting HGMD transcripts for {}".format(db)):
        return False

    # ========================================================================
    # Step 2: Filter gene predictions
    # ========================================================================
    # Extract only the gene predictions for transcripts found in HGMD
    # This creates a subset of ncbiRefSeq focused on disease-related genes
    # (grep -F is the non-deprecated spelling of fgrep)
    output_file = "hgmd.curated.gp"
    gp_path = "/hive/data/genomes/{}/bed/ncbiRefSeq.{}.{}/process/{}.curated.gp.gz".format(
        db, version, date_suffix, db)

    filter_cmd = """zcat {} | grep -F -f {} - > {}""".format(
        gp_path, transcript_file, output_file)

    if not run_command(filter_cmd, "Filtering gene predictions for {}".format(db)):
        return False

    # ========================================================================
    # Step 3: Load gene predictions into database
    # ========================================================================
    # Load the filtered gene predictions into a database table
    # Table name: ncbiRefSeqHgmd
    # Format: genePredExt (extended gene prediction format)
    hgloadgenepred_cmd = "hgLoadGenePred -genePredExt {} ncbiRefSeqHgmd {}".format(db, output_file)
    if not run_command(hgloadgenepred_cmd, "Loading gene predictions for {}".format(db)):
        return False

    # Output summary
    transcript_path = os.path.join(transcript_output_dir, transcript_file)
    gp_output_path = os.path.join(transcript_output_dir, output_file)
    print("✓ {} transcript processing completed!".format(db))
    print("Output files: {}, {}".format(transcript_path, gp_output_path))
    print("hgLoadGenePred run: hgLoadGenePred -genePredExt {} ncbiRefSeqHgmd {}".format(db, output_file))

    return True

def main():
    """
    Main function to orchestrate HGMD data processing

    Processing flow:
    1. Parse command-line arguments
    2. Validate input files exist
    3. Process BigBed files for each database
    4. Find latest ncbiRefSeq directories
    5. Process transcript data for each database
    """

    # ========================================================================
    # Parse command-line arguments
    # ========================================================================
    parser = argparse.ArgumentParser(
        description='Process HGMD data and convert to BigBed format'
    )
    parser.add_argument(
        '--year',
        required=True,
        help='Year for HGMD data file (e.g., 2024)'
    )
    parser.add_argument(
        '--db',
        required=True,
        nargs='+',
        choices=['hg38', 'hg19'],
        help='Database/genome build (hg38 and/or hg19, can specify multiple)'
    )

    args = parser.parse_args()
    year = args.year
    databases = args.db

    # ========================================================================
    # Validate input files
    # ========================================================================
    # Check that HGMD input files exist before starting any processing
    for db in databases:
        if not check_input_file_exists(year, db):
            sys.exit(1)

    # ========================================================================
    # Process BigBed files for each database
    # ========================================================================
    # Create BED files, convert to BigBed, create symlinks, and register in database
    success_count = 0
    for db in databases:
        # Create and change to database-specific working directory
        # Directory structure: /hive/data/genomes/{db}/bed/hgmd/
        working_dir = "/hive/data/genomes/{}/bed/hgmd".format(db)
        os.makedirs(working_dir, exist_ok=True)

        try:
            os.chdir(working_dir)
        except FileNotFoundError:
            print("Error: Directory {} not found".format(working_dir), file=sys.stderr)
            sys.exit(1)

        if process_database(year, db, working_dir):
            success_count += 1
        else:
            print("\n✗ Failed to process BigBed for {}".format(db), file=sys.stderr)

    # Exit if any BigBed processing failed
    if success_count < len(databases):
        print("\nSome BigBed processing failed. Exiting.", file=sys.stderr)
        sys.exit(1)

    # ========================================================================
    # Process transcripts for each database
    # ========================================================================
    # Extract HGMD transcripts and filter gene predictions from ncbiRefSeq
    transcript_success_count = 0
    for db in databases:
        # Find the latest ncbiRefSeq directory for this database
        # hg38: /hive/data/genomes/hg38/bed/ncbiRefSeq.p14.{year}-{date}/
        # hg19: /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.{year}-{date}/
        # Falls back to previous years if specified year not found
        ncbirefseq_source_dir = find_latest_ncbirefseq_dir(db, year)
        if not ncbirefseq_source_dir:
            print("\n✗ Failed to find ncbiRefSeq directory for {}".format(db), file=sys.stderr)
            continue

        # Output to hgmd directory
        # Files will be created in: /hive/data/genomes/{db}/bed/hgmd/ncbiRefSeq.p{13|14}.{date}/
        output_dir = "/hive/data/genomes/{}/bed/hgmd".format(db)

        if process_transcripts(year, db, ncbirefseq_source_dir, output_dir):
            transcript_success_count += 1
        else:
            print("\n✗ Failed to process transcripts for {}".format(db), file=sys.stderr)

    # Exit if any transcript processing failed
    if transcript_success_count < len(databases):
        sys.exit(1)

if __name__ == '__main__':
    main()

# Sample output:
#
# python3 process_hgmd.py --year 2025 --db hg38
# ✓ hg38 BigBed completed successfully!
# Output files: /hive/data/genomes/hg38/bed/hgmd/hgmd.bed, /hive/data/genomes/hg38/bed/hgmd/hgmd.bb
# Symlink created: /gbdb/hg38/bbi/hgmd.bb
# hgBbiDbLink run: hgBbiDbLink hg38 hgmd /gbdb/hg38/bbi/hgmd.bb
# ✓ hg38 transcript processing completed!
# Output files: /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/hgmdTranscripts.txt, /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/hgmd.curated.gp
# hgLoadGenePred run: hgLoadGenePred -genePredExt hg38 ncbiRefSeqHgmd hgmd.curated.gp
#
# python3 process_hgmd.py --year 2025 --db hg19
# Note: Using ncbiRefSeq.p13.2024-12-15 directory (year 2025 not found)
# ✓ hg19 BigBed completed successfully!
# Output files: /hive/data/genomes/hg19/bed/hgmd/hgmd.bed, /hive/data/genomes/hg19/bed/hgmd/hgmd.bb
# Symlink created: /gbdb/hg19/bbi/hgmd.bb
# hgBbiDbLink run: hgBbiDbLink hg19 hgmd /gbdb/hg19/bbi/hgmd.bb
# ✓ hg19 transcript processing completed!
# Output files: /hive/data/genomes/hg19/bed/hgmd/ncbiRefSeq.p13.2024-12-15/hgmdTranscripts.txt, /hive/data/genomes/hg19/bed/hgmd/ncbiRefSeq.p13.2024-12-15/hgmd.curated.gp
# hgLoadGenePred run: hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp