ed3cb243b9151cae7d1859712148fa1d1e1a2196
lrnassar
  Mon Apr 13 17:34:17 2026 -0700
Adding makedoc and build scripts for InSiGHT VCEP track hub. refs #36582

diff --git src/hg/makeDb/doc/InSiGHT.txt src/hg/makeDb/doc/InSiGHT.txt
new file mode 100644
index 00000000000..5e826d5b3bf
--- /dev/null
+++ src/hg/makeDb/doc/InSiGHT.txt
@@ -0,0 +1,156 @@
+#RM#36582
+
+# InSiGHT VCEP Track Hub
+# International Society for Gastrointestinal Hereditary Tumours (InSiGHT)
+# Variant Curation Expert Panel (VCEP)
+# Lynch syndrome mismatch repair genes: MLH1, MSH2, MSH6, PMS2
+# CSpec v2.0.0
+# Assemblies: hg38 and hg19
+
+# Working directory for all track data, plus the per-track subdirectories that the build steps below cd into
+mkdir -p /hive/users/lrnassar/insightHub/{clinDomains,pvs1,afFrequencies,hciPriors,functionalAssays,lovdVars}
+
+# Build scripts are located here:
+#   ~/kent/src/hg/makeDb/scripts/insight/
+
+# Quick link for github:
+# https://github.com/ucscGenomeBrowser/kent/tree/master/src/hg/makeDb/scripts/insight
+
+# Hub structure:
+#   /hive/users/lrnassar/insightHub/
+#     hub.txt, genomes.txt
+#     hg38/trackDb.txt, hg19/trackDb.txt
+#     insight.html (shared description page)
+#     clinDomains/   - Clinical Domains track data
+#     pvs1/          - PVS1 Regions track data
+#     afFrequencies/ - Allele Frequencies track data
+#     hciPriors/     - HCI Priors track data
+#     functionalAssays/ - Functional Assays track data
+#     lovdVars/      - InSiGHT Curated Variants track data (fetched from ClinVar; see Track 6)
+
+# Canonical transcripts used across all tracks:
+#   MLH1: NM_000249.4 (chr3, + strand)
+#   MSH2: NM_000251.3 (chr2, + strand)
+#   MSH6: NM_000179.3 (chr2, + strand)
+#   PMS2: NM_000535.7 (chr7, - strand)
+
+##############################################################################
+# Track 1: Clinical Domains (PM1)
+##############################################################################
+
+# Clinically relevant protein domains for the 4 MMR genes.
+# Domain definitions are hardcoded in the script from the InSiGHT VCEP specs.
+# Generates bigBed 9+4 files for hg38 and hg19.
+
+cd /hive/users/lrnassar/insightHub/clinDomains
+python3 ~/kent/src/hg/makeDb/scripts/insight/insightClinDomains.py
+
+# Output: InSiGHTclinDomainsHg38.bb, InSiGHTclinDomainsHg19.bb
+
+##############################################################################
+# Track 2: PVS1 Regions
+##############################################################################
+
+# PVS1 decision tree regions based on NMD predictions and critical functional
+# regions. Gene-specific codon boundaries from the InSiGHT VCEP specs:
+#   MLH1: NMD <=684, CritRegion 685-753, FuncUnknown 754-756, n.a. >756
+#   MSH2: NMD <=861, CritRegion 862-891, FuncUnknown 892-934, n.a. >934
+#   MSH6: NMD <=1317, CritRegion 1318-1341, FuncUnknown 1342-1360, n.a. >1360
+#   PMS2: NMD <=798, n.a. >798
+# Generates bigBed 9+3 files for hg38 and hg19.
+
+cd /hive/users/lrnassar/insightHub/pvs1
+python3 ~/kent/src/hg/makeDb/scripts/insight/insightPVS1.py
+
+# Output: InSiGHTPVS1Hg38.bb, InSiGHTPVS1Hg19.bb
+
+##############################################################################
+# Track 3: Allele Frequencies (BA1/BS1/PM2)
+##############################################################################
+
+# ACMG allele frequency classifications from gnomAD v4.1 exomes.
+# Gene-specific thresholds from the InSiGHT VCEP specs.
+# Requires access to gnomAD v4.1 bigBed files in /gbdb/hg38/gnomAD/v4.1/exomes/
+# Generates bigBed 9+3 files for hg38 and hg19 (hg19 via liftOver).
+
+cd /hive/users/lrnassar/insightHub/afFrequencies
+python3 ~/kent/src/hg/makeDb/scripts/insight/insightAFfrequencies.py
+
+# Output: InSiGHTAFHg38.bb, InSiGHTAFHg19.bb
+
+##############################################################################
+# Track 4: HCI Priors (PP3/BP4)
+##############################################################################
+
+# HCI prior probability predictions for missense variants.
+# Source data: LOVD database exports (tab-delimited files downloaded manually
+# from the LOVD shared database for each gene's priors table).
+# Requires LOVD priors files in the hciPriors/ directory:
+#   LOVD_MLH1_priors_*.txt
+#   LOVD_MSH2_priors_*.txt
+#   LOVD_MSH6_priors_*.txt
+#   LOVD_PMS2_priors_*.txt
+# Thresholds: PP3_moderate >0.81, PP3_supporting 0.68-0.81, BP4_supporting <0.11
+# Generates bigBed 9+5 files for hg38 and hg19.
+
+cd /hive/users/lrnassar/insightHub/hciPriors
+python3 ~/kent/src/hg/makeDb/scripts/insight/insightHCIPriors.py
+
+# Output: InSiGHTHCIPriorsHg38.bb, InSiGHTHCIPriorsHg19.bb
+
+##############################################################################
+# Track 5: Functional Assays (PS3/BS3)
+##############################################################################
+
+# Functional assay evidence from 4 publications:
+#   Drost et al. 2018 (PMID:30504929) - 74 MLH1/MSH2 variants, CIMRA assay
+#   Drost et al. 2020 (PMID:31965077) - 87 MSH6 variants, CIMRA assay
+#   Jia et al. 2021 (PMID:33357406)  - 16,749 MSH2 variants, deep mutational scan
+#   Rath et al. 2022 (PMID:36054288) - 26 MLH1 variants, cell-based assay
+#
+# Requires supplementary data files in the functionalAssays/ directory:
+#   drost2020_supplement.docx (Drost 2020 S1/S3/S5 tables)
+#   mmc2.xlsx (Jia 2021 TableS4/S5)
+#   (Drost 2018 and Rath 2022 data are hardcoded from their supplements)
+#
+# Also requires openpyxl: pip install openpyxl
+# Generates bigBed 9+7 files for hg38 and hg19.
+
+cd /hive/users/lrnassar/insightHub/functionalAssays
+python3 ~/kent/src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
+
+# Output: insightFunctionalAssaysHg38.bb, insightFunctionalAssaysHg19.bb
+
+##############################################################################
+# Track 6: InSiGHT Curated Variants (from ClinVar)
+##############################################################################
+
+# InSiGHT VCEP expert panel classifications fetched from ClinVar API.
+# Queries ClinVar for variants submitted by InSiGHT on MLH1, MSH2, MSH6, PMS2.
+# No local data files needed -- fetches directly from NCBI E-utilities.
+# This is the track that should be periodically rebuilt (ClinVar updates monthly).
+# Generates bigBed 9+7 files for hg38 and hg19.
+
+cd /hive/users/lrnassar/insightHub/lovdVars
+python3 ~/kent/src/hg/makeDb/scripts/insight/buildInsightClinVar.py
+
+# Output: insightClinVarHg38.bb, insightClinVarHg19.bb
+
+##############################################################################
+# Hub deployment
+##############################################################################
+
+# The hub is served from:
+#   https://hgwdev-lrnassar.gi.ucsc.edu/~lrnassar/track_hubs/insightHub/hub.txt
+#
+# The public_html symlink points to the working directory:
+#   /cluster/home/lrnassar/public_html/track_hubs/insightHub -> /hive/users/lrnassar/insightHub
+#
+# To rebuild all tracks from scratch (each step cds by absolute path, so one failed script cannot derail the remaining steps):
+cd /hive/users/lrnassar/insightHub/clinDomains && python3 ~/kent/src/hg/makeDb/scripts/insight/insightClinDomains.py
+cd /hive/users/lrnassar/insightHub/pvs1 && python3 ~/kent/src/hg/makeDb/scripts/insight/insightPVS1.py
+cd /hive/users/lrnassar/insightHub/afFrequencies && python3 ~/kent/src/hg/makeDb/scripts/insight/insightAFfrequencies.py
+cd /hive/users/lrnassar/insightHub/hciPriors && python3 ~/kent/src/hg/makeDb/scripts/insight/insightHCIPriors.py
+cd /hive/users/lrnassar/insightHub/functionalAssays && python3 ~/kent/src/hg/makeDb/scripts/insight/insightFunctionalAssays.py
+cd /hive/users/lrnassar/insightHub/lovdVars && python3 ~/kent/src/hg/makeDb/scripts/insight/buildInsightClinVar.py
+cd /hive/users/lrnassar/insightHub