ef61e73fc416622d8557ec2439df2344a1cc80c3 max Tue Jun 9 15:10:01 2026 -0700 lrSv: replace HPRC v2.0 pangenome SV track with v2.1 (hprc2v21Sv) Drop the v2.0 wave-decomposed hprc2Sv track and add hprc2v21Sv built from the HPRC v2.1 minigraph-cactus raw vg deconstruct VCFs (gref95.ro), on both hg38 (GRCh38 path, 596,063 SVs) and hs1 (T2T-CHM13 path, 608,435 SVs). The v2.1 files lack per-allele TYPE/LEN, so the new converter classifies INS/DEL by parsimony-trimming REF/ALT and the net length change. The v2.0 build recipe, converter and schema are kept but commented out in the makeDocs in case wave-decomposed VCFs are released again, refs #36258 diff --git src/hg/makeDb/doc/hs1/lrSv.txt src/hg/makeDb/doc/hs1/lrSv.txt index 7bce21288d1..20c594d0a48 100644 --- src/hg/makeDb/doc/hs1/lrSv.txt +++ src/hg/makeDb/doc/hs1/lrSv.txt @@ -19,33 +19,63 @@ variants_T2T-CHM13_sv_insdel_HGSVC2024v1.0.tsv.gz \ variants_T2T-CHM13_sv_inv_HGSVC2024v1.0.tsv.gz \ hgsvc3.bed bedSort hgsvc3.bed hgsvc3.sorted.bed bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHgsvc3.as \ -tab hgsvc3.sorted.bed /hive/data/genomes/hs1/chrom.sizes hgsvc3.bb # Symlink under /gbdb/hs1/lrSv with the same filename as the hg38 track, # so the trackDb bigDataUrl (/gbdb/$D/lrSv/hgsvc3.bb) resolves on both # assemblies. mkdir -p /gbdb/hs1/lrSv ln -sf /hive/data/genomes/hs1/bed/lrSv/hgsvc3/hgsvc3.bb /gbdb/hs1/lrSv/hgsvc3.bb ########## # 2026-04-21 Claude max +# +# OUTDATED, DO NOT RUN. The HPRC v2.0 wave track (hprc2Sv) was removed from +# trackDb on 2026-06-08 when the hg38 track moved to the v2.1 file +# (hprc2v21Sv, doc/hg38/lrSv.txt). The CHM13 v2.1 build is the next section +# below; this v2.0 recipe is kept, commented out, in case HPRC releases +# wave-decomposed VCFs again (the v2.0 converter handles their TYPE/LEN/INV +# fields, which the v2.1 raw deconstruct files do not have). +# +# # HPRC release-2 pangenome SVs on T2T-CHM13. HPRC releases one VCF per +# # reference path; we already have the GRCh38 version as the hprc2Sv +# # subtrack. The hs1 track is built from the parallel T2T-CHM13 wave VCF +# # with the same converter. +# +# mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2 +# cd /hive/data/genomes/hs1/bed/lrSv/hprc2 +# +# aria2c -x10 https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/release2/minigraph-cactus/hprc-v2.0-mc-chm13.wave.vcf.gz +# +# python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py \ +# hprc-v2.0-mc-chm13.wave.vcf.gz hprc2.bed +# bedSort hprc2.bed hprc2.sorted.bed +# bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2.as \ +# -tab hprc2.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2.bb +# +# ln -sf /hive/data/genomes/hs1/bed/lrSv/hprc2/hprc2.bb /gbdb/hs1/lrSv/hprc2.bb -# HPRC release-2 pangenome SVs on T2T-CHM13. HPRC releases one VCF per -# reference path; we already have the GRCh38 version as the hprc2Sv -# subtrack. The hs1 track is built from the parallel T2T-CHM13 wave VCF -# with the same converter. +########## +# 2026-06-09 Claude max -mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2 -cd /hive/data/genomes/hs1/bed/lrSv/hprc2 +# hprc2v21Sv on T2T-CHM13: the CHM13 path of the HPRC v2.1 minigraph-cactus +# graph (233 samples). Same raw vg deconstruct file type as the hg38 v2.1 +# build (no per-allele TYPE/LEN, graph traversals and nested snarls present), +# converted with the same parsimony-trimming converter. CHM13 contig names +# (chr1..chrY) already match the hs1 assembly, so no renaming is needed. -aria2c -x10 https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/release2/minigraph-cactus/hprc-v2.0-mc-chm13.wave.vcf.gz +mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2v21 +cd /hive/data/genomes/hs1/bed/lrSv/hprc2v21 -python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py \ - hprc-v2.0-mc-chm13.wave.vcf.gz hprc2.bed -bedSort hprc2.bed hprc2.sorted.bed -bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2.as \ - -tab hprc2.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2.bb +# VCF provided by Glenn Hickey (HPRC graph team): +wget https://public.gi.ucsc.edu/~ghickey/debug/hprc-v2.1-mc-chm13.gref95.ro.vcf.gz -ln -sf /hive/data/genomes/hs1/bed/lrSv/hprc2/hprc2.bb /gbdb/hs1/lrSv/hprc2.bb +python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py \ + hprc-v2.1-mc-chm13.gref95.ro.vcf.gz hprc2v21.bed +# kept 608435 SV-sized alleles: 363310 INS, 245125 DEL, 0 CPX +# (75809 at nested snarl levels LV>0) +bedSort hprc2v21.bed hprc2v21.sorted.bed +bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2Ro.as \ + -tab hprc2v21.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2v21.bb