src/hg/utils/otto/clinvar/clinVarToBed cc9815dcc544d9bfaa660557feefa1ab4364ecb6

cc9815dcc544d9bfaa660557feefa1ab4364ecb6
max
  Thu Jun 19 05:45:56 2025 -0700
add triangles to clinvar tracks, add clinvar lollies to archive, add a validation to the clinvar sub lollies script, refs #35750

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index e0f02fee390..e3daaebbbc6 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -6,36 +6,37 @@
 from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "MONDO" : "https://monarchinitiative.org/disease/%s",
     "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
-# if they add a new one, this script must fail and tdb must be updated.
+# if Clinvar ever adds a new value, this script must fail and tdb must be updated.
 possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiator codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""])
 
 # these consequences are highlighted using a triangle decorator. Similar to Decipher and Gnomad browsers
 truncConseqs = set(["nonsense", "frameshift variant", "splice acceptor variant", "splice donor variant"])
 
+# make sure that if the second list is edited, it is never out of sync with the first list
 assert(len(set(truncConseqs - possMolConseqs))==0)
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.
 
 Output goes into the current dir
 
 Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """) 
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
 parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb")
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
@@ -759,51 +760,42 @@
 minusAccs = []
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId, posVcf, refAllVcf, altAllVcf, somClinImpact, somClinLastEval, revStatusClinImpact, oncogen, oncogenLastEval, revStatusOncogen, aggrGermClass, aggrSomClassImpact, aggrSomOnc = row
    # SomaticClinicalImpact|SomaticClinicalImpactLastEvaluated|ReviewStatusClinicalImpact|Oncogenicity|OncogenicityLastEvaluated|ReviewStatusOncogenicity
 
-    # changed in July 2018 when the "varId" was merged into this table. It used to be 
-    # in allToVar
-    #if varId=="":
-        #logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
-        #vcvId = ""
-    #else:
-        #vcvId = "VCV%09d.1" % int(varId)
-
     if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
         noAssCount += 1
         continue
 
     if assembly=="GRCh37":
         hgvs = hg19Hgvs
     elif assembly=="GRCh38":
         hgvs = hg38Hgvs
     else:
         print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row)))
         assert(False)
 
-    #hgvsTable =  ";".join(hgvs[alleleId])
     hgvsTable =  json.dumps(hgvs[alleleId])
 
     molConseq = allToVcf.get(int(alleleId))
     isTrunc = False
     if molConseq is None:
         noMcCount += 1
         molConseq = ""
     else:
         if molConseq in truncConseqs:
             isTrunc = True
 
     if chromAcc=="NT_187513.1":
         chrom = "chrUn_KI270742v1"
     elif chromAcc=="NT_167222.1":
         chrom = "chrUn_gl000228"
@@ -970,83 +962,88 @@
     if len(vcfDesc)>=255:
 	    refAllVcf = shortenSeq(refAllVcf)
 	    altAllVcf = shortenSeq(altAllVcf)
 	    vcfDesc = chrom+":"+posVcf+":"+refAllVcf+">"+altAllVcf
 
     somImpactDesc = ""
     if somClinImpact!="" and somClinImpact!="-":
 	    somImpactDesc = "%s (%s, %s)" % (somClinImpact, somClinLastEval, revStatusClinImpact)
 
     oncogenDesc = ""
     if oncogen!="" and oncogen!="-":
 	    oncogenDesc = "%s (%s, %s)" % (oncogen, oncogenLastEval, revStatusOncogen)
 
     mouseOverIdx = 33 # if you ever change the field order below, make sure that you adapt this number!
 
-    row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
+    row = [chrom, start, end, vcvId, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr, molConseq,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
         hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
 	    vcfDesc,
 	    somImpactDesc,
 	    oncogenDesc,
         # the new classes
         aggrGermClass, aggrSomClassImpact, aggrSomOnc,
         # these fields are the for the filters, not for the display
         pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
         # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
-        varId, dbVarSsvAcc, vcvId, vcvIdVersion
+        varId, dbVarSsvAcc, vcvId, vcvIdVersion,
+        # the name was moved to the back of the field list in 2025, as we needed a unique field to link to decorator
+        shortName
         ]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
             newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":
         ofh = hg19Bed
         decorOfh = hg19DecorBed
         if isCnv:
             ofh = hg19BedCnv
     elif assembly=="GRCh38":
         ofh = hg38Bed
         decorOfh = hg38DecorBed
         if isCnv:
             ofh = hg38BedCnv
     else:
         noAssCount +=1
         
     ofh.write("\t".join(row))
     ofh.write("\n")
 
     if isTrunc:
         # truncating mutations get a special symbol written into another BED file, for decorators
         decorRow = list(row[:12])
-        rgbColor = row[10]
+        rgbColor = row[8]
+        #decorRow[8] = "0,0,0" # line color
+        decorRow[8] = "255,255,255" # line color
         mouseOverText = row[mouseOverIdx]
 
         decorRef = row[0]+":"+row[1]+"-"+row[2]+":"+row[3]
         decorRow[3] = "" # delete mouseover
 
         decorRow.append(decorRef)
         decorRow.append("glyph")
-        decorRow.append(rgbColor)
+        #decorRow.append(rgbColor)
+        decorRow.append(rgbColor) # fill color
         decorRow.append("Triangle")
         decorRow.append(mouseOverText)
 
         decorOfh.write("\t".join(decorRow))
         decorOfh.write("\n")
 
 hg19Bed.close()
 hg38Bed.close()
 
 hg19DecorBed.close()
 hg38DecorBed.close()
 
 hg19BedCnv.close()
 hg38BedCnv.close()