src/hg/makeDb/doc/hs1/lrSv.txt 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published. Dropping it also removes its >5 Mb breakend
artifacts from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/doc/hs1/lrSv.txt src/hg/makeDb/doc/hs1/lrSv.txt
index 20c594d0a48..d2269d7ebcb 100644
--- src/hg/makeDb/doc/hs1/lrSv.txt
+++ src/hg/makeDb/doc/hs1/lrSv.txt
@@ -1,81 +1,93 @@
 # 2026-04-21 Claude max
 
 # Long-read SVs on hs1 (T2T-CHM13). HGSVC3 released a parallel set of SV
 # annotation tables native to T2T-CHM13, which we convert with the same
 # pipeline as the hg38 HGSVC3 subtrack. The full process (converter,
 # autoSql, bigBed build, trackDb setup, summary table, references) is
 # documented in ~/kent/src/hg/makeDb/doc/hg38/lrSv.txt; this file only
 # lists the hs1-specific shell steps.
 
 mkdir -p /hive/data/genomes/hs1/bed/lrSv/hgsvc3
 cd /hive/data/genomes/hs1/bed/lrSv/hgsvc3
 
 wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3/release/Variant_Calls/1.0/T2T-CHM13/annotation_table/variants_T2T-CHM13_sv_insdel_HGSVC2024v1.0.tsv.gz
 wget https://ftp.1000genomes.ebi.ac.uk/vol1/ftp/data_collections/HGSVC3/release/Variant_Calls/1.0/T2T-CHM13/annotation_table/variants_T2T-CHM13_sv_inv_HGSVC2024v1.0.tsv.gz
 
 # 188,224 DEL+INS + 276 INV = 188,500 SVs, natively on T2T-CHM13. The
 # converter is the same one used for the hg38 track (shared .as + .py).
 python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHgsvc3TsvToBed.py \
     variants_T2T-CHM13_sv_insdel_HGSVC2024v1.0.tsv.gz \
     variants_T2T-CHM13_sv_inv_HGSVC2024v1.0.tsv.gz \
     hgsvc3.bed
 bedSort hgsvc3.bed hgsvc3.sorted.bed
 bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHgsvc3.as \
     -tab hgsvc3.sorted.bed /hive/data/genomes/hs1/chrom.sizes hgsvc3.bb
 
 # Symlink under /gbdb/hs1/lrSv with the same filename as the hg38 track,
 # so the trackDb bigDataUrl (/gbdb/$D/lrSv/hgsvc3.bb) resolves on both
 # assemblies.
 mkdir -p /gbdb/hs1/lrSv
 ln -sf /hive/data/genomes/hs1/bed/lrSv/hgsvc3/hgsvc3.bb /gbdb/hs1/lrSv/hgsvc3.bb
 
 ##########
 # 2026-04-21 Claude max
 #
 # OUTDATED, DO NOT RUN. The HPRC v2.0 wave track (hprc2Sv) was removed from
 # trackDb on 2026-06-08 when the hg38 track moved to the v2.1 file
 # (hprc2v21Sv, doc/hg38/lrSv.txt). The CHM13 v2.1 build is the next section
 # below; this v2.0 recipe is kept, commented out, in case HPRC releases
 # wave-decomposed VCFs again (the v2.0 converter handles their TYPE/LEN/INV
 # fields, which the v2.1 raw deconstruct files do not have).
 #
 # # HPRC release-2 pangenome SVs on T2T-CHM13. HPRC releases one VCF per
 # # reference path; we already have the GRCh38 version as the hprc2Sv
 # # subtrack. The hs1 track is built from the parallel T2T-CHM13 wave VCF
 # # with the same converter.
 #
 # mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2
 # cd /hive/data/genomes/hs1/bed/lrSv/hprc2
 #
 # aria2c -x10 https://s3-us-west-2.amazonaws.com/human-pangenomics/pangenomes/freeze/release2/minigraph-cactus/hprc-v2.0-mc-chm13.wave.vcf.gz
 #
 # python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2VcfToBed.py \
 #     hprc-v2.0-mc-chm13.wave.vcf.gz hprc2.bed
 # bedSort hprc2.bed hprc2.sorted.bed
 # bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2.as \
 #     -tab hprc2.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2.bb
 #
 # ln -sf /hive/data/genomes/hs1/bed/lrSv/hprc2/hprc2.bb /gbdb/hs1/lrSv/hprc2.bb
 
 ##########
 # 2026-06-09 Claude max
 
 # hprc2v21Sv on T2T-CHM13: the CHM13 path of the HPRC v2.1 minigraph-cactus
 # graph (233 samples). Same raw vg deconstruct file type as the hg38 v2.1
 # build (no per-allele TYPE/LEN, graph traversals and nested snarls present),
 # converted with the same parsimony-trimming converter. CHM13 contig names
 # (chr1..chrY) already match the hs1 assembly, so no renaming is needed.
 
 mkdir -p /hive/data/genomes/hs1/bed/lrSv/hprc2v21
 cd /hive/data/genomes/hs1/bed/lrSv/hprc2v21
 
 # VCF provided by Glenn Hickey (HPRC graph team):
 wget https://public.gi.ucsc.edu/~ghickey/debug/hprc-v2.1-mc-chm13.gref95.ro.vcf.gz
 
 python3 ~/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2RoVcfToBed.py \
     hprc-v2.1-mc-chm13.gref95.ro.vcf.gz hprc2v21.bed
 # kept 608435 SV-sized alleles: 363310 INS, 245125 DEL, 0 CPX
 # (75809 at nested snarl levels LV>0)
 bedSort hprc2v21.bed hprc2v21.sorted.bed
 bedToBigBed -type=bed9+ -as=$HOME/kent/src/hg/makeDb/scripts/lrSv/lrSvHprc2Ro.as \
     -tab hprc2v21.sorted.bed /hive/data/genomes/hs1/chrom.sizes hprc2v21.bb
+
+##########
+# 2026-06-13 Claude max
+#
+# QA fixes (refs #36258), hs1 side. See doc/hg38/lrSv.txt for the full writeup.
+# The converters now drop byte-identical duplicate rows, so re-running the
+# hs1 builds above gives a new count (the 608435 logged above is pre-dedup):
+#   hprc2v21 hs1  608,435 -> 541,176  (67,259 duplicates dropped)
+# colorsDb hs1 and 1kgOnt hs1 were rebuilt from source for the shared color
+# palette (CPX purple, colorsDb DEL 200,0,0); apr hs1 and cpc1 hs1 only needed
+# the CPX color remapped, so their served bigBeds were recolored in place the
+# same way as the hg38 files (see doc/hg38/lrSv.txt). There is no hs1 lrSvAll.