cc9815dcc544d9bfaa660557feefa1ab4364ecb6 max Thu Jun 19 05:45:56 2025 -0700 add triangles to clinvar tracks, add clinvar lollies to archive, add a validation to the clinvar sub lollies script, refs #35750 diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index e0f02fee390..e3daaebbbc6 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -6,36 +6,37 @@ from os.path import join, basename, dirname, isfile, abspath, isdir from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "MONDO" : "https://monarchinitiative.org/disease/%s", "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file -# if they add a new one, this script must fail and tdb must be updated. +# if Clinvar ever adds a new value, this script must fail and tdb must be updated. possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiator codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""]) # these consequences are highlighted using a triangle decorator. Similar to Decipher and Gnomad browsers truncConseqs = set(["nonsense", "frameshift variant", "splice acceptor variant", "splice donor variant"]) +# make sure that if the second list is edited, it is never out of sync with the first list assert(len(set(truncConseqs - possMolConseqs))==0) # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed. Output goes into the current dir Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/ """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") @@ -759,51 +760,42 @@ minusAccs = [] # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen, aggrGermClass, aggrSomClassImpact, aggrSomOnc = row # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity - # changed in July 2018 when the "varId" was merged into this table. It used to be - # in allToVar - #if varId=="": - #logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) - #vcvId = "" - #else: - #vcvId = "VCV%09d.1" % int(varId) - if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na": noAssCount += 1 continue if assembly=="GRCh37": hgvs = hg19Hgvs elif assembly=="GRCh38": hgvs = hg38Hgvs else: print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row))) assert(False) - #hgvsTable = ";".join(hgvs[alleleId]) hgvsTable = json.dumps(hgvs[alleleId]) molConseq = allToVcf.get(int(alleleId)) isTrunc = False if molConseq is None: noMcCount += 1 molConseq = "" else: if molConseq in truncConseqs: isTrunc = True if chromAcc=="NT_187513.1": chrom = "chrUn_KI270742v1" elif chromAcc=="NT_167222.1": chrom = "chrUn_gl000228" @@ -970,83 +962,88 @@ if len(vcfDesc)>=255: refAllVcf = shortenSeq(refAllVcf) altAllVcf = shortenSeq(altAllVcf) vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf somImpactDesc = "" if somClinImpact!="" and somClinImpact!="-": somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact) oncogenDesc = "" if oncogen!="" and oncogen!="-": oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen) mouseOverIdx = 33 # if you ever change the field order below, make sure that you adapt this number! - row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ + row = [chrom, start, end, vcvId, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver, vcfDesc, somImpactDesc, oncogenDesc, # the new classes aggrGermClass, aggrSomClassImpact, aggrSomOnc, # these fields are the for the filters, not for the display pathoCode, originCode, allTypeCode, str(varLen), str(starCount), # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field - varId, dbVarSsvAcc, vcvId, vcvIdVersion + varId, dbVarSsvAcc, vcvId, vcvIdVersion, + # the name was moved to the back of the field list in 2025, as we needed a unique field to link to decorator + shortName ] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed decorOfh = hg19DecorBed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38": ofh = hg38Bed decorOfh = hg38DecorBed if isCnv: ofh = hg38BedCnv else: noAssCount +=1 ofh.write("\t".join(row)) ofh.write("\n") if isTrunc: # truncating mutations get a special symbol written into another BED file, for decorators decorRow = list(row[:12]) - rgbColor = row[10] + rgbColor = row[8] + #decorRow[8] = "0,0,0" # line color + decorRow[8] = "255,255,255" # line color mouseOverText = row[mouseOverIdx] decorRef = row[0]+":"+row[1]+"-"+row[2]+":"+row[3] decorRow[3] = "" # delete mouseover decorRow.append(decorRef) decorRow.append("glyph") - decorRow.append(rgbColor) + #decorRow.append(rgbColor) + decorRow.append(rgbColor) # fill color decorRow.append("Triangle") decorRow.append(mouseOverText) decorOfh.write("\t".join(decorRow)) decorOfh.write("\n") hg19Bed.close() hg38Bed.close() hg19DecorBed.close() hg38DecorBed.close() hg19BedCnv.close() hg38BedCnv.close()