src/hg/makeDb/doc/hs1/lrSv.txt 8a5a466f5e13a020954014cdefc81400072db516

8a5a466f5e13a020954014cdefc81400072db516
max
  Tue Apr 21 08:29:55 2026 -0700
lrSv: add hprc2 hs1 subtrack using T2T-CHM13 wave VCF, refs #36258

The HPRC release-2 pangenome publishes wave-decomposed VCFs against
both GRCh38 and T2T-CHM13. We already had the GRCh38 version as the
hprc2Sv subtrack on hg38; this adds the parallel T2T-CHM13 build as
/gbdb/hs1/lrSv/hprc2.bb. The existing trackDb stanza (bigDataUrl
/gbdb/$D/lrSv/hprc2.bb) picks it up on hs1 without changes.

1,451,269 SV rows kept (937,425 INS, 360,960 DEL, 147,898 COMPLEX,
4,986 INV) using the existing lrSvHprc2VcfToBed.py converter.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/doc/hs1/lrSv.txt src/hg/makeDb/doc/hs1/lrSv.txt
index 2ac7a1808ab..7bce21288d1 100644
--- src/hg/makeDb/doc/hs1/lrSv.txt
+++ src/hg/makeDb/doc/hs1/lrSv.txt
@@ -1,30 +1,51 @@
 # 2026-04-21 Claude max
 
 # Long-read SVs on hs1 (T2T-CHM13). HGSVC3 released a parallel set of SV
 # annotation tables native to T2T-CHM13, which we convert with the same
 # pipeline as the hg38 HGSVC3 subtrack. The full process (converter,
 # autoSql, bigBed build, trackDb setup, summary table, references) is
 # documented in ~/kent/src/hg/makeDb/doc/hg38/lrSv.txt; this file only
 # lists the hs1-specific shell steps.
 
 mkdir -p /hive/data/genomes/hs1/bed/lrSv/hgsvc3
 cd /hive/data/genomes/hs1/bed/lrSv/hgsvc3
 
 wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3/release/Variant_Calls/1.0/T2T-CHM13/annotation_table/variants_T2T-CHM13_sv_insdel_HGSVC2024v1.0.tsv.gz
 wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3/release/Variant_Calls/1.0/T2T-CHM13/annotation_table/variants_T2T-CHM13_sv_inv_HGSVC2024v1.0.tsv.gz
 
 # 188,224 DEL+INS + 276 INV = 188,500 SVs, natively on T2T-CHM13. The
 # converter is the same one used for the hg38 track (shared .as + .py).
 python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py \
     variants_T2T-CHM13_sv_insdel_HGSVC2024v1.0.tsv.gz \
     variants_T2T-CHM13_sv_inv_HGSVC2024v1.0.tsv.gz \
     hgsvc3.bed
 bedSort hgsvc3.bed hgsvc3.sorted.bed
 bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHgsvc3.as \
     -tab hgsvc3.sorted.bed /hive/data/genomes/hs1/chrom.sizes hgsvc3.bb
 
 # Symlink under /gbdb/hs1/lrSv with the same filename as the hg38 track,
 # so the trackDb bigDataUrl (/gbdb/$D/lrSv/hgsvc3.bb) resolves on both
 # assemblies.
 mkdir -p /gbdb/hs1/lrSv
 ln -sf /hive/data/genomes/hs1/bed/lrSv/hgsvc3/hgsvc3.bb /gbdb/hs1/lrSv/hgsvc3.bb
+
+##########
+# 2026-04-21 Claude max
+
+# HPRC release-2 pangenome SVs on T2T-CHM13. HPRC releases one
+# wave-decomposed VCF per reference path; the GRCh38 one is already the
+# hprc2Sv subtrack on hg38. The hs1 track is built from the parallel
+# T2T-CHM13 wave VCF with the same converter (lrSvHprc2VcfToBed.py).
+
+mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2
+cd /hive/data/genomes/hs1/bed/lrSv/hprc2
+
+aria2c -x10 https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/release2/minigraph-cactus/hprc-v2.0-mc-chm13.wave.vcf.gz
+
+python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py \
+    hprc-v2.0-mc-chm13.wave.vcf.gz hprc2.bed
+bedSort hprc2.bed hprc2.sorted.bed
+bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2.as \
+    -tab hprc2.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2.bb
+
+ln -sf /hive/data/genomes/hs1/bed/lrSv/hprc2/hprc2.bb /gbdb/hs1/lrSv/hprc2.bb