src/hg/makeDb/doc/hg19.txt af06f485943b58470dccd7c832b97042f0c8ddb8

af06f485943b58470dccd7c832b97042f0c8ddb8
gperez2
  Sun Jan 18 21:44:31 2026 -0800
Adding process_hgmd.py script, updating makedoc, and updating the hgmd track, refs #36779

diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt
index acba2869c66..15a4f9ac5e2 100644
--- src/hg/makeDb/doc/hg19.txt
+++ src/hg/makeDb/doc/hg19.txt
@@ -32218,34 +32218,73 @@
 exit
 # next restrict RefSeq down to HGMD subset
 
 # addition of HGMD-restricted subset, Max, Jan 29 2019, updated Dec 10 2019, again Aug 2023, May 2024, Dec 2024
 cd /hive/data/genomes/hg19/bed/ncbiRefSeq.p13.2024-09-18/
 # change in 2019: ignore the version numbers, otherwise only 1815 transcripts left, big update by HGMD in 2019
 # adding "." so NM_123 doesn't match NM_123123
 cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | cut -f7 | cut -d. -f1 | sort -u | awk '{print $1"."}' > hgmdTranscripts.txt
 zcat process/hg19.curated.gp.gz | fgrep -f hgmdTranscripts.txt - > hgmd.curated.gp
 hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp
 $ wc -l hgmd.curated.gp
 7965 hgmd.curated.gp in 2019
 8971 hgmd.curated.gp in 2020
 10451 hgmd.curated.gp in 2021
 14693 hgmd.curated.gp in 2024
+15209 hgmd.curated.gp in 2025
 
 # now continue the process at ../hg38/hgmd.txt
 # or ideally make a shell script for all this...
 
+# HGMD 2025.4 release files from Elias.Hage@qiagen.com (updated 01/17/26 Gerardo)
+# Made a script using claude.ai that automates HGMD data processing for hg38 and hg19.
+
+# Location: ~/kent/src/hg/makeDb/scripts/hgmd/process_hgmd.py
+
+# What the script does:
+# 1. Creates BED files from HGMD TSV data with variant classifications
+# 2. Converts BED to BigBed format
+# 3. Creates symlinks in /gbdb/{db}/bbi/
+# 4. Registers BigBed files with hgBbiDbLink
+# 5. Extracts transcript IDs from hg38 HGMD file (column 7)
+# 6. Filters ncbiRefSeq gene predictions to HGMD transcripts only
+# 7. Loads filtered gene predictions into ncbiRefSeqHgmd table
+#
+# Key features:
+# - Always uses hg38 file for transcript extraction (hg19 file lacks column 7)
+# - Auto-detects ncbiRefSeq version: p13 for hg19, p14 for hg38
+# - Falls back to a previous year's ncbiRefSeq directory if the specified year's directory is not found
+
+# Line count (wc -l) of the hg19 BED file produced by the script:
+331959 /hive/data/genomes/hg19/bed/hgmd/hgmd.bed
+
+# Line count (wc -l) of the HGMD-filtered gene prediction file:
+15209 /hive/data/genomes/hg19/bed/hgmd/ncbiRefSeq.p13.2024-09-18/hgmd.curated.gp
+
+# Usage:
+python3 ~/kent/src/hg/makeDb/scripts/hgmd/process_hgmd.py --year 2025 --db hg19
+
+# Sample output:
+#    hg19 BigBed completed successfully!
+# Output files: /hive/data/genomes/hg19/bed/hgmd/hgmd.bed, /hive/data/genomes/hg19/bed/hgmd/hgmd.bb
+# Symlink created: /gbdb/hg19/bbi/hgmd.bb
+# hgBbiDbLink run: hgBbiDbLink hg19 hgmd /gbdb/hg19/bbi/hgmd.bb
+# Note: Using ncbiRefSeq.p13.2024-09-18 directory (year 2025 not found)
+#    hg19 transcript processing completed!
+# Output files: /hive/data/genomes/hg19/bed/hgmd/ncbiRefSeq.p13.2024-09-18/hgmdTranscripts.txt, /hive/data/genomes/hg19/bed/hgmd/ncbiRefSeq.p13.2024-09-18/hgmd.curated.gp
+# hgLoadGenePred run: hgLoadGenePred -genePredExt hg19 ncbiRefSeqHgmd hgmd.curated.gp
+
 #############################################################################
 # LASTZ human/hg19 vs. pig/susScr11 - (DONE - 2018-04-02 - Hiram)
     mkdir /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02
     cd /hive/data/genomes/hg19/bed/lastzSusScr11.2018-04-02
 
     printf '# human vs pig
 BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz
 BLASTZ_O=400
 BLASTZ_E=30
 BLASTZ_M=254
 # default BLASTZ_Q score matrix:
 #       A     C     G     T
 # A    91  -114   -31  -123
 # C  -114   100  -125   -31
 # G   -31  -125   100  -114