9eb4e0937782954c19d664e7d384d210bffb3b25 max Sat Jun 13 16:01:42 2026 -0700 lrSv: QA fixes from Lou's review - dedup, shared color palette, deCODE/AoU cleanup - Drop kwanhoSv (KimPD) from the lrSvAll merge in databases.tsv; it stays on dev/alpha until published, which also removes its >5 Mb breakend artifacts from the merged track. - Remove searchIndex from colorsDbSv, lrSv1kLin and lrSvAll (and the merge generator): the bigBeds were built without a name index, so by-name search never worked. - Single shared per-SV-type color palette in lrSvCommon.py (svColor), used by every converter and the merge. CPX is purple everywhere (was orange in 1kgOnt/apr/cpc1, colliding with INV's orange), colorsDb DEL is 200,0,0 like the rest, and TRA/INSDEL get their own colors. - deCODE: drop byte-identical duplicate rows and blank the fake AC=50 placeholder (AC is now a string field, omitted from the name and mouseOver). - AoU: numeric-entity-encode non-ASCII gene/trait text and drop duplicate rows. - gustafson, chirmade101, hprc2v21: drop byte-identical duplicate rows. - lrSvMergeAll.py: skip byte-identical duplicate source rows instead of summing their allele counts, which had inflated the per-database and total AC. refs #36258 diff --git src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py index 550f491c33e..e0c36c99526 100644 --- src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py +++ src/hg/makeDb/scripts/lrSv/lrSvAou1kCsvToBed.py @@ -1,156 +1,173 @@ #!/usr/bin/env python3 """Convert AoU 1K long-read SV CSV to BED9+ for bigBed. 
Input is a gzipped CSV (media-2.gz) with columns: SV_coordinate, SV_ID, SV type, SV length, Mean GenotypePosterior nonref, AF(AFR,AMR,EAS,EUR,SAS), Fst(AFR vs Non-AFR), OMIM genes, Disease genes, Cancer genes, ACMG genes, OMIM CDS, Disease CDS, Cancer CDS, ACMG CDS, Regulatory element, SegDUP, Tandem repeats, Other LR datasets, Detected in AoU SR, eQTLs, GWAS, SV-trait associations, SR validation, LR assembly-supported, Locityper validation Usage: lrSvAou1kCsvToBed.py input.csv.gz output.bed """ import csv import gzip import os import sys sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) -from lrSvCommon import svName, normalizeSvType +from lrSvCommon import svName, normalizeSvType, svColor # AoU Phase-I (2025) SV panel: 1,027 samples => 2,054 alleles (diploid) # No per-variant AC in the site-level release; approximate AC as # round(max_popAF * N_alleles) for naming purposes only. AOU_N_ALLELES = 2054 -SV_COLORS = { - "DEL": "200,0,0", - "INS": "0,0,200", -} + +def encodeNonAscii(s): + """Replace non-ASCII characters with numeric HTML entities so detail + pages render them correctly instead of as mojibake. The source CSV has + gene/trait text with accented names, Greek letters, curly quotes and + en-dashes (e.g. 
ö, β, ’, –).""" + return "".join(c if ord(c) < 128 else f"&#{ord(c)};" for c in s) + def na(val): - """Return empty string for NA values.""" + """Return empty string for NA values, else the value with any non-ASCII + characters numeric-entity encoded.""" if val == "NA" or val == "No" or val == "": return "" - return val + return encodeNonAscii(val) def main(): if len(sys.argv) != 3: print(__doc__, file=sys.stderr) sys.exit(1) inFile, outFile = sys.argv[1], sys.argv[2] + seen = set() + nIn = 0 + nDup = 0 with gzip.open(inFile, "rt") as fIn, open(outFile, "w") as fOut: reader = csv.reader(fIn) header = next(reader) for row in reader: + nIn += 1 coord = row[0] # chr1:10627 svTypeRaw = row[2] svType = normalizeSvType(svTypeRaw) svLenSrc = int(row[3]) # Parse coordinate (1-based position) chrom, posStr = coord.split(":") pos = int(posStr) # BED is 0-based half-open chromStart = pos - 1 if svType == "DEL": chromEnd = chromStart + svLenSrc else: # INS: place at insertion site, 1 bp wide chromEnd = chromStart + 1 svLen = chromEnd - chromStart # insLen: the source "SV length" represents the INS payload for INS # and 0 for DEL (where it equals reference span) if svType in ("INS", "MEI"): insLen = svLenSrc else: insLen = 0 - color = SV_COLORS.get(svType, "100,100,100") + color = svColor(svType) # Parse population AFs (column 5): "0.001,0.002,0.003,0.004,0.005" afStr = row[5] afParts = afStr.split(",") try: afAfr = float(afParts[0]) afAmr = float(afParts[1]) afEas = float(afParts[2]) afEur = float(afParts[3]) afSas = float(afParts[4]) except (ValueError, IndexError): afAfr = afAmr = afEas = afEur = afSas = 0.0 fst = na(row[6]) # Gene intersections (use gene-level, skip CDS-level which is subset) omimGenes = na(row[7]) diseaseGenes = na(row[8]) cancerGenes = na(row[9]) acmgGenes = na(row[10]) regElement = na(row[15]) segDup = na(row[16]) tandemRepeat = na(row[17]) otherLr = na(row[18]) detectedSr = na(row[19]) eqtls = na(row[20]) gwas = na(row[21]) traitAssoc = na(row[22]) # 
Use max population AF as score (0-1000) maxAf = max(afAfr, afAmr, afEas, afEur, afSas) score = min(int(round(maxAf * 1000)), 1000) # AC: AoU site-level data doesn't publish AC; approximate with # round(maxAf * 2054) so the name has something informative. ac = int(round(maxAf * AOU_N_ALLELES)) featLen = insLen if svType in ("INS", "MEI") else svLen name = svName(svType, featLen, ac) bedRow = [ chrom, str(chromStart), str(chromEnd), name, str(score), ".", str(chromStart), str(chromEnd), color, svType, str(svLen), str(insLen), str(ac), f"{afAfr:.6f}", f"{afAmr:.6f}", f"{afEas:.6f}", f"{afEur:.6f}", f"{afSas:.6f}", fst, omimGenes, diseaseGenes, cancerGenes, acmgGenes, regElement, segDup, tandemRepeat, otherLr, detectedSr, eqtls, gwas, traitAssoc, ] - fOut.write("\t".join(bedRow) + "\n") + line_out = "\t".join(bedRow) + if line_out in seen: + nDup += 1 + continue + seen.add(line_out) + fOut.write(line_out + "\n") + + print(f"AoU 1K: {nIn:,} input records, {nDup:,} duplicate rows dropped, " + f"{nIn - nDup:,} written", file=sys.stderr) if __name__ == "__main__": main()