src/hg/makeDb/doc/hg38/hgmd.txt af06f485943b58470dccd7c832b97042f0c8ddb8

af06f485943b58470dccd7c832b97042f0c8ddb8
gperez2
  Sun Jan 18 21:44:31 2026 -0800
Adding process_hgmd.py script, updating makedoc, and updating the hgmd track, refs #36779

diff --git src/hg/makeDb/doc/hg38/hgmd.txt src/hg/makeDb/doc/hg38/hgmd.txt
index 9e35fc6d209..4f110d0df28 100644
--- src/hg/makeDb/doc/hg38/hgmd.txt
+++ src/hg/makeDb/doc/hg38/hgmd.txt
@@ -1,20 +1,57 @@
 # got hgmd 2017 from Frank Schacherer Frank.Schacherer@qiagen.com and Rupert Yip Rupert.Yip@qiagen.com
 # update 2019 Max
 # update 2020 Max
 # update 2021 Max
 # update 2023 Max
 # update 2024 Max
 cd /hive/data/genomes/hg38/bed/hgmd
 year=2024
 cat /hive/data/outside/hgmd/$year.4-hgmd-public_hg38.tsv | grep -v \# | tawk '{if ($5=="I") {start=$4-1; end=$4+1; col="100,100,100"} else if ($5=="D") {start=$4-1; end=$4; col="170,170,170"} else {start=$4-1; end=$4; col="0,0,0"}; print "chr"$3,start,end,$2":"$1,0,".",start,end,col,$2,$1,$5}' | sed -e 's/M$/substitution/' | sed -e 's/I$/insertion (between the two basepairs, sequence not provided by HGMD)/' | sed -e 's/D$/deletion (endpoint not provided by HGMD)/' | sed -e 's/X$/insertion-deletion (endpoint not provided by HGMD)/' | sed -e 's/R$/regulatory variant/' | sed -e 's/S$/splicing variant/' | sort -k1,1 -k2,2n > hgmd.bed
 # wc -l says:
 # for 2021: 210260 hgmd.bed
 # 2023: 261922 hgmd.bed
 # 2024: 301458 hgmd.bed
 bedToBigBed hgmd.bed /hive/data/genomes/hg38/chrom.sizes hgmd.bb -type=bed9+ -as=hgmd.as -tab 
 ln -s /hive/data/genomes/hg38/bed/hgmd/hgmd.bb /gbdb/hg38/bbi/hgmd.bb
 hgBbiDbLink hg38 hgmd /gbdb/hg38/bbi/hgmd.bb
 # also update: the hg19/hg38 RefSeq HGMD subtracks
 # also update: hgmd on hg19, hgBeacon, the hgmd file for GBIB, see hg19.txt
 # for this, jump now to hg38/ncbiRefSeq.txt and hg19.txt
 
+# HGMD 2025.4 release files from Elias.Hage@qiagen.com (updated 01/17/26 Gerardo)
+# Wrote a script (with help from claude.ai) that automates HGMD data processing for hg38 and hg19.
+
+# Location: ~/kent/src/hg/makeDb/scripts/hgmd/process_hgmd.py
+
+# What the script does:
+# 1. Creates BED files from HGMD TSV data with variant classifications
+# 2. Converts BED to BigBed format
+# 3. Creates symlinks in /gbdb/{db}/bbi/
+# 4. Registers BigBed files with hgBbiDbLink
+# 5. Extracts transcript IDs from hg38 HGMD file (column 7)
+# 6. Filters ncbiRefSeq gene predictions to HGMD transcripts only
+# 7. Loads filtered gene predictions into ncbiRefSeqHgmd table
+#
+# Key features:
+# - Always uses hg38 file for transcript extraction (hg19 file lacks column 7)
+# - Auto-detects ncbiRefSeq version: p13 for hg19, p14 for hg38
+# - Falls back to previous years if the specified year's ncbiRefSeq is not found
+
+# wc -l:
+# 332094 /hive/data/genomes/hg38/bed/hgmd/hgmd.bed
+
+# wc -l:
+# 15691 /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/hgmd.curated.gp
+
+# Usage:
+python3 ~/kent/src/hg/makeDb/scripts/hgmd/process_hgmd.py --year 2025 --db hg38
+
+# Sample output:
+#   hg38 BigBed completed successfully!
+# Output files: /hive/data/genomes/hg38/bed/hgmd/hgmd.bed, /hive/data/genomes/hg38/bed/hgmd/hgmd.bb
+# Symlink created: /gbdb/hg38/bbi/hgmd.bb
+# hgBbiDbLink run: hgBbiDbLink hg38 hgmd /gbdb/hg38/bbi/hgmd.bb
+#   hg38 transcript processing completed!
+# Output files: /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/hgmdTranscripts.txt, /hive/data/genomes/hg38/bed/hgmd/ncbiRefSeq.p14.2025-08-13/hgmd.curated.gp
+# hgLoadGenePred run: hgLoadGenePred -genePredExt hg38 ncbiRefSeqHgmd hgmd.curated.gp
+#############################################################################