#!/usr/bin/env bash
# Build cpc1 lrSv subtrack for hs1 (native) and hg38 (lifted).
# Run from /hive/data/genomes/hg38/bed/lrSv/cpc1/
#
# Provenance
#   commit 526213b2893134217a300ff913e11b4e98d67991
#   author max, Mon Apr 20 08:50:10 2026 -0700, refs #36258
#   file   src/hg/makeDb/scripts/lrSv/lrSvCpc1Build.sh (new file, mode 100755)
#
#   lrSv: add cpc1Sv and aprSv pangenome SV subtracks (hg38, hs1)
#
#   cpc1Sv: 97,205 SVs from the CPC + HPRC Phase 1 pangenome (Gao et al 2023,
#   Nature; PMID 37316654) built on T2T-CHM13v2, with 53 Chinese and 47 HPRC
#   samples. Each graph snarl site is shown as one item with alt alleles
#   classified by length delta (INS/DEL/CPX, 50 bp threshold) and collapsed.
#
#   aprSv: 103,077 SVs from the Arabic Pangenome Reference (Nassir et al.
#   2025, Nat Commun; PMID 40707445) built on T2T-CHM13v2 from 53
#   UAE-resident Arab individuals. Same multi-allele classification as
#   cpc1Sv, with alt alleles iterated within each multi-allelic row.
#
#   Both tracks load natively on hs1 and are lifted to hg38 with
#   hs1ToHg38.over.chain.gz.
#
# Inputs (relative paths are resolved against the current directory):
#   $VCF, $AS, $CONVERTER, $HS1_SIZES, $HG38_SIZES, $CHAIN
# Outputs (written to the current directory):
#   cpc1.hs1.bed  cpc1.hs1.sorted.bed  cpc1.hs1.bb
#   cpc1.hg38.bed cpc1.hg38.sorted.bed cpc1.hg38.bb cpc1.hs1.unmapped.bed
# Environment:
#   LRSV_SCRIPTS  optional; directory holding lrSvCpc1.as and
#                 lrSvCpc1VcfToBed.py. Defaults to the original checkout path.

set -euo pipefail

# The default is the path the script was committed with; the override lets
# other checkouts run the build without editing the file.
readonly SCRIPTS="${LRSV_SCRIPTS:-/cluster/home/max/kent/src/hg/makeDb/scripts/lrSv}"
readonly AS="$SCRIPTS/lrSvCpc1.as"
readonly CONVERTER="$SCRIPTS/lrSvCpc1VcfToBed.py"
readonly VCF=CPC.HPRC.Phase1.processed.SVs.normed.vcf.gz
readonly HS1_SIZES=/hive/data/genomes/hs1/chrom.sizes
readonly HG38_SIZES=/hive/data/genomes/hg38/chrom.sizes
readonly CHAIN=/gbdb/hs1/liftOver/hs1ToHg38.over.chain.gz

# Print an error to stderr and abort.
die() {
  printf 'error: %s\n' "$*" >&2
  exit 1
}

# Print a progress message prefixed with the current wall-clock time.
log() {
  echo "[$(date +%T)] $*"
}

# Verify every tool and input up front, so a missing chain file or
# chrom.sizes is reported before the VCF conversion runs rather than after.
preflight() {
  local tool path
  for tool in zcat python3 bedSort bedToBigBed liftOver; do
    command -v "$tool" >/dev/null || die "required tool not found in PATH: $tool"
  done
  for path in "$VCF" "$CONVERTER" "$AS" "$HS1_SIZES" "$HG38_SIZES" "$CHAIN"; do
    [[ -r "$path" ]] || die "required input not readable: $path"
  done
}

# Sort cpc1.<db>.bed and pack it into cpc1.<db>.bb.
# Arguments: $1 - assembly name used in file names (hs1 or hg38)
#            $2 - chrom.sizes file for that assembly
sort_and_pack() {
  local db=$1
  local sizes=$2

  log "sorting $db bed..."
  bedSort "cpc1.$db.bed" "cpc1.$db.sorted.bed"

  log "building $db bigBed..."
  bedToBigBed -type=bed9+ -tab -as="$AS" \
    "cpc1.$db.sorted.bed" "$sizes" "cpc1.$db.bb"
}

main() {
  preflight

  # --- Build hs1 native bed ---
  log "converting VCF to hs1 bed..."
  # The converter takes an input path, so the decompressed VCF is streamed in
  # through /dev/stdin; pipefail makes a zcat failure abort the script.
  zcat "$VCF" \
    | python3 "$CONVERTER" /dev/stdin cpc1.hs1.bed "$HS1_SIZES"

  sort_and_pack hs1 "$HS1_SIZES"

  # --- liftOver to hg38 ---
  log "lifting to hg38..."
  liftOver -tab -bedPlus=9 cpc1.hs1.bed "$CHAIN" \
    cpc1.hg38.bed cpc1.hs1.unmapped.bed

  echo " hg38 mapped: $(wc -l < cpc1.hg38.bed)"
  # Count unmapped records, skipping '#' lines. grep -c exits 1 when the count
  # is 0 (everything lifted), so "|| true" is intentional here.
  echo " hg38 unmapped: $(grep -cv '^#' cpc1.hs1.unmapped.bed || true)"

  sort_and_pack hg38 "$HG38_SIZES"

  ls -lh cpc1.hs1.bb cpc1.hg38.bb
  log "done."
}

main "$@"