src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py 9eb4e0937782954c19d664e7d384d210bffb3b25

9eb4e0937782954c19d664e7d384d210bffb3b25
max
  Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on
dev/alpha until published, which also removes its >5 Mb breakend artifacts
from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge
generator): the bigBeds were built without a name index, so by-name search
never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by
every converter and the merge. CPX is purple everywhere (was orange in
1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like
the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50
placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing
their allele counts, which had inflated the per-database and total AC.

refs #36258

diff --git src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py
index 7859c2cedb7..cb2cfe65186 100644
--- src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py
+++ src/hg/makeDb/scripts/lrSv/lrSv1kgOntVcfToBed.py
@@ -1,246 +1,239 @@
 #!/usr/bin/env python3
 """Convert 1KG ONT SVAN VCF to BED9+ for bigBed, adding allele counts from phased VCF.
 
 Usage:
     lrSv1kgOntVcfToBed.py svan.vcf.gz output.bed chrom.sizes [phased.vcf.gz]
 
 The optional phased VCF provides AC, AN, and AF per variant (matched by ID).
 SVs without a match get AC=-1 (written to AC column), alleleNumber=-1, alleleFreq=-1.
 The canonical `name` column omits the :AC suffix when AC is -1.
 """
 
 import gzip
 import os
 import sys
 
 sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
-from lrSvCommon import svName, normalizeSvType
-
-# Colors by SV class (R,G,B)
-SV_COLORS = {
-    "INS": "0,0,200",      # blue
-    "DEL": "200,0,0",      # red
-    "CPX": "230,140,0",    # orange
-}
+from lrSvCommon import svName, normalizeSvType, svColor
 
 def parseInfo(infoStr):
     """Parse INFO field into a dict."""
     d = {}
     for item in infoStr.split(";"):
         if "=" in item:
             k, v = item.split("=", 1)
             d[k] = v
         else:
             d[item] = True
     return d
 
 def getSvClass(varId):
     """Extract raw SV class from the variant ID. Returns raw string;
     will be normalized by normalizeSvType (COMPLEX -> CPX)."""
     if "-INS-" in varId:
         return "INS"
     elif "-DEL-" in varId:
         return "DEL"
     elif "-COMPLEX-" in varId:
         return "COMPLEX"
     return "OTHER"
 
 def loadPhasedAC(phasedFile):
     """Load AC, AN, AF from phased VCF, keyed by variant ID."""
     acMap = {}
     opener = gzip.open if phasedFile.endswith(".gz") else open
     with opener(phasedFile, "rt") as f:
         for line in f:
             if line.startswith("#"):
                 continue
             fields = line.split("\t", 9)
             varId = fields[2]
             info = parseInfo(fields[7])
             ac = int(info.get("AC", "0"))
             an = int(info.get("AN", "0"))
             af = float(info.get("AF", "0"))
             acMap[varId] = (ac, an, af)
     return acMap
 
 def main():
     if len(sys.argv) < 4:
         print(__doc__, file=sys.stderr)
         sys.exit(1)
 
     inFile = sys.argv[1]
     outFile = sys.argv[2]
     chromSizesFile = sys.argv[3]
     phasedFile = sys.argv[4] if len(sys.argv) > 4 else None
     opener = gzip.open if inFile.endswith(".gz") else open
 
     # Load chrom sizes for clipping
     chromSizes = {}
     with open(chromSizesFile) as f:
         for line in f:
             c, s = line.strip().split("\t")
             chromSizes[c] = int(s)
 
     # Load allele counts from phased VCF
     acMap = {}
     if phasedFile:
         print(f"Loading allele counts from {phasedFile}...", file=sys.stderr)
         acMap = loadPhasedAC(phasedFile)
         print(f"Loaded {len(acMap)} variants from phased VCF", file=sys.stderr)
 
     skipped = 0
     matched = 0
     unmatched = 0
     with opener(inFile, "rt") as fIn, open(outFile, "w") as fOut:
         for line in fIn:
             if line.startswith("#"):
                 continue
 
             fields = line.rstrip("\n").split("\t")
             chrom = fields[0]
             pos = int(fields[1])
             varId = fields[2]
             ref = fields[3]
             info = parseInfo(fields[7])
 
             svTypeRaw = getSvClass(varId)
             svType = normalizeSvType(svTypeRaw)
 
             # BED coordinates: 0-based half-open
             chromStart = pos - 1
             chromEnd = chromStart + len(ref)
 
             # Source lengths: INS_LEN for insertion size, DEL_LEN for deletion size
             insLenSrc = int(info.get("INS_LEN", "0"))
             delLenSrc = int(info.get("DEL_LEN", "0"))
 
             # For INS, the item spans only the anchor base; expand by 1 for visibility
             if svType == "INS" and chromEnd <= chromStart + 1:
                 chromEnd = chromStart + 1
 
             # svLen = span on reference
             svLen = chromEnd - chromStart
             # insLen = length of inserted sequence (INS only); 0 otherwise
             if svType == "INS":
                 insLen = insLenSrc
             elif svType == "DEL":
                 # DEL_LEN matches reference span
                 insLen = 0
             else:
                 insLen = 0
 
             # Insertion/deletion type
             insType = info.get("ITYPE_N", info.get("DTYPE_N", ""))
 
             # Transposon family
             family = info.get("FAM_N", "")
 
             # Percent resolved
             try:
                 percResolved = float(info.get("PERC_RESOLVED", "0"))
             except ValueError:
                 percResolved = 0.0
 
             # TSD length
             try:
                 tsdLen = int(info.get("TSD_LEN", "0"))
             except ValueError:
                 tsdLen = 0
 
             # Poly-A length
             try:
                 polyaLen = int(info.get("POLYA_LEN", "0"))
             except ValueError:
                 polyaLen = 0
 
             # Conformation
             conformation = info.get("CONFORMATION", "")
 
             # Retrotransposon length
             try:
                 rtLen = int(info.get("RT_LEN", "0"))
             except ValueError:
                 rtLen = 0
 
             # VNTR motif count
             try:
                 nbMotifs = int(info.get("NB_MOTIFS", "0"))
             except ValueError:
                 nbMotifs = 0
 
             # Source gene (for processed pseudogenes)
             srcGene = info.get("SRC_GENE", "")
 
             # Number of exons retrotransposed
             try:
                 nbExons = int(info.get("NB_EXONS", "0"))
             except ValueError:
                 nbExons = 0
 
             # Non-canonical MEI flag
             notCanonical = "Yes" if info.get("NOT_CANONICAL") else ""
 
             # Strand
             strand = info.get("STRAND", ".")
             if strand not in ("+", "-"):
                 strand = "."
 
-            color = SV_COLORS.get(svType, "100,100,100")
+            color = svColor(svType)
 
             # Allele counts from phased VCF
             if varId in acMap:
                 ac, an, af = acMap[varId]
                 matched += 1
                 acForName = ac
             else:
                 ac, an, af = -1, -1, -1.0
                 unmatched += 1
                 acForName = None  # drop :AC suffix for SVAN-only rows
 
             # Clip to chrom sizes; skip records on unknown chroms
             if chrom not in chromSizes:
                 skipped += 1
                 continue
             if chromEnd > chromSizes[chrom]:
                 skipped += 1
                 continue
 
             featLen = insLen if svType in ("INS", "MEI") else svLen
             name = svName(svType, featLen, acForName)
 
             row = [
                 chrom,
                 str(chromStart),
                 str(chromEnd),
                 name,
                 "0",
                 strand,
                 str(chromStart),   # thickStart
                 str(chromEnd),     # thickEnd
                 color,
                 svType,
                 str(svLen),
                 str(insLen),
                 str(ac),
                 insType,
                 family,
                 f"{percResolved:.2f}",
                 str(tsdLen),
                 str(polyaLen),
                 conformation,
                 str(rtLen),
                 str(nbMotifs),
                 srcGene,
                 str(nbExons),
                 notCanonical,
                 str(an),
                 f"{af:.4f}" if af >= 0 else "-1.0000",
             ]
             fOut.write("\t".join(row) + "\n")
 
     print(f"Skipped {skipped} records outside chrom sizes", file=sys.stderr)
     if acMap:
         print(f"Allele counts: {matched} matched, {unmatched} unmatched "
               f"({100*matched/(matched+unmatched):.1f}% matched)", file=sys.stderr)
 
 if __name__ == "__main__":
     main()