9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700
lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup

- Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track.
- Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked.
- Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors.
- deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver).
- AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows.
- gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows.
- lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC.
refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py index 550f491c33e..e0c36c99526 100644 --- src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py @@ -7,86 +7,95 @@ Cancer genes, ACMG genes, OMIM CDS, Disease CDS, Cancer CDS, ACMG CDS, Regulatory element, SegDUP, Tandem repeats, Other LR datasets, Detected in AoU SR, eQTLs, GWAS, SV-trait associations, SR validation, LR assembly-supported, Locityper validation Usage: lrSvAou1kCsvToBed.py input.csv.gz output.bed """ import csv import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType +from lrSvCommon import svName, normalizeSvType, svColor # AoU Phase-I (2025) SV panel: 1,027 samples => 2,054 alleles (diploid) # No per-variant AC in the site-level release; approximate AC as # round(max_popAF * N_alleles) for naming purposes only. AOU_N_ALLELES = 2054 -SV_COLORS = { - "DEL": "200,0,0", - "INS": "0,0,200", -} + +def encodeNonAscii(s): + """Replace non-ASCII characters with numeric HTML entities so detail + pages render them correctly instead of as mojibake. The source CSV has + gene/trait text with accented names, Greek letters, curly quotes and + en-dashes (e.g. 
ö, β, ', –).""" + return "".join(c if ord(c) < 128 else f"&#{ord(c)};" for c in s) + def na(val): - """Return empty string for NA values.""" + """Return empty string for NA values, else the value with any non-ASCII + characters numeric-entity encoded.""" if val == "NA" or val == "No" or val == "": return "" - return val + return encodeNonAscii(val) def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inFile, outFile = sys.argv[1], sys.argv[2] + seen = set() + nIn = 0 + nDup = 0 with gzip.open(inFile, "rt") as fIn, open(outFile, "w") as fOut: reader = csv.reader(fIn) header = next(reader) for row in reader: + nIn += 1 coord = row[0] # chr1:10627 svTypeRaw = row[2] svType = normalizeSvType(svTypeRaw) svLenSrc = int(row[3]) # Parse coordinate (1-based position) chrom, posStr = coord.split(":") pos = int(posStr) # BED is 0-based half-open chromStart = pos - 1 if svType == "DEL": chromEnd = chromStart + svLenSrc else: # INS: place at insertion site, 1 bp wide chromEnd = chromStart + 1 svLen = chromEnd - chromStart # insLen: the source "SV length" represents the INS payload for INS # and 0 for DEL (where it equals reference span) if svType in ("INS", "MEI"): insLen = svLenSrc else: insLen = 0 - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # Parse population AFs (column 5): "0.001,0.002,0.003,0.004,0.005" afStr = row[5] afParts = afStr.split(",") try: afAfr = float(afParts[0]) afAmr = float(afParts[1]) afEas = float(afParts[2]) afEur = float(afParts[3]) afSas = float(afParts[4]) except (ValueError, IndexError): afAfr = afAmr = afEas = afEur = afSas = 0.0 fst = na(row[6]) @@ -138,19 +147,27 @@ f"{afSas:.6f}", fst, omimGenes, diseaseGenes, cancerGenes, acmgGenes, regElement, segDup, tandemRepeat, otherLr, detectedSr, eqtls, gwas, traitAssoc, ] - fOut.write("\t".join(bedRow) + "\n") + line_out = "\t".join(bedRow) + if line_out in seen: + nDup += 1 + continue + seen.add(line_out) + fOut.write(line_out + "\n") + + print(f"AoU 
1K: {nIn:,} input records, {nDup:,} duplicate rows dropped, " + f"{nIn - nDup:,} written", file=sys.stderr) if __name__ == "__main__": main()