src/hg/utils/otto/clinvar/clinVarToBed 70dea5ffaf352ada83e54b03aa0774bf515cbb65

70dea5ffaf352ada83e54b03aa0774bf515cbb65
max
  Mon Sep 21 03:39:46 2020 -0700
adding the Benet-Pages colorscheme to clinvar, fixing up the 'g.' names
and auto-loading the clinvarSub table for Braney's new bigLolli clinvar
submissions track. Also fixing the categorization of submissions. refs #25207, #25405 and emails with Chris/Braney.

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index c3b479d..de72ff2 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,18 +1,19 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
+import tempfile
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
 # if they add a new one, this script must fail and tdb must be updated.
@@ -25,33 +26,120 @@
 
 input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
 """) 
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") 
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
-clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID\n"
+clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
 # ==== FUNCTIONs =====
 
+# benign = B, likely benign = LB, conflicting = CF, likely pathogenic= LP, 
+# pathogenic = P, other = OT, uncertain = VUS, RF=risk factor
+# colors were defined by Ana Benet Pages
+# see colortable at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing
+cnvColors = {
+        "INS" : {
+            "OT" : "225,165,0",
+            "B" : "250,214,69",
+            "LB" : "250,214,69",
+            "CF" : "225,165,0",
+            "RF" : "225,165,0",
+            "VUS" : "225,165,0",
+            "LP" : "199,114,3",
+            "P" : "199,114,3"
+            },
+        "LOSS" : {
+            "OT" : "255,0,0",
+            "B" : "255,98,119",
+            "LB" : "255,98,119",
+            "CF" : "255,0,0",
+            "VUS" : "255,0,0",
+            "RF" : "255,0,0",
+            "LP" : "153,0,0",
+            "P" : "153,0,0"
+            },
+        "GAIN" : {
+            "OT" : "0,0,225",
+            "B" : "77,184,255",
+            "LB" : "77,184,255",
+            "CF" : "0,0,225",
+            "VUS" : "0,0,225",
+            "RF" : "0,0,225",
+            "LP" : "0,0,179",
+            "P" : "0,0,179"
+            },
+        "STRUCT" : {
+            "OT" : "0,210,0",
+            "B" : "0,255,0",
+            "LB" : "0,255,0",
+            "CF" : "0,210,0",
+            "VUS" : "0,210,0",
+            "RF" : "0,210,0",
+            "LP" : "0,128,0",
+            "P" : "0,128,0"
+            },
+        "INV" : {
+            "OT" : "192,0,192",
+            "B" : "255,128,255",
+            "LB" : "255,128,255",
+            "CF" : "192,0,192",
+            "VUS" : "192,0,192",
+            "RF" : "192,0,192",
+            "LP" : "128,0,128",
+            "P" : "128,0,128"
+            },
+        "SEQALT" : {
+            "OT" : "128,128,128",
+            "B" : "191,191,191",
+            "LB" : "191,191,191",
+            "CF" : "128,128,128",
+            "VUS" : "128,128,128",
+            "RF" : "128,128,128",
+            "LP" : "64,64,64",
+            "P" : "64,64,64"
+            },
+        "SEQLEN" : {
+            "OT" : "0,179,179",
+            "B" : "0,255,255",
+            "LB" : "0,255,255",
+            "CF" : "0,179,179",
+            "VUS" : "0,179,179",
+            "RF" : "0,179,179",
+            "LP" : "0,102,102",
+            "P" : "0,102,102"
+            },
+        "SUBST" : {
+            "OT" : "128,128,128",
+            "B" : "191,191,191",
+            "LB" : "191,191,191",
+            "CF" : "128,128,128",
+            "VUS" : "128,128,128",
+            "RF" : "128,128,128",
+            "LP" : "64,64,64",
+            "P" : "64,64,64"
+            },
+}
+
 def mustRun(cmd):
     logging.debug(cmd)
     ret = os.system(cmd)
     if ret!=0:
         print(("Could not run: %s" % cmd))
         sys.exit(1)
 
 def shortenName(name):
     "make the clinvar name shorter and keep a long version for the mouse over "
     cnvMatch = cnvRe.match(name)
     #hgvsMatch = hgvsRe.match(name)
 
     if ":" in name and not "chr" in name:
         longName = name.split(":")[0]
     else:
@@ -63,96 +151,120 @@
         # some names include the original assembly as a prefix
         band, copy = cnvMatch.groups()
         if copy == None:
             copy = ""
         shortName = band+copy
     elif name.startswith("GRCh"):
         # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
         print(name)
         assert False # name that starts with GRCh but does not match our regex should not happen
     elif "." in name:
         # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
         # so we remove the gene and the protein description
         if ":" in name:
             name = name.split(":")[1]
             longName = name.split(" ")[0]
+
+        # handle genomic HGVS NC_000010.11:g.(?_87925503)_(87965482_?)del
+        if name.startswith("NC_") and "g." in name and ")" in name:
+            shortName = name
+        else:
+            # strip away the final description like " (p.Ala184del)"
             name = name.split(" ")[0].split("[")[0].split("(")[0]
             shortName = name
 
         if name.endswith("del"):
             shortName = "del"
         elif name.endswith("ins"):
             shortName = "ins"
         elif name.endswith("dup"):
             shortName = "dup"
         else:
             posMatch = posRe.match(name)
             if posMatch!=None:
                 pos, dupDelIns, change = posMatch.groups()
                 if dupDelIns==None:
                     dupDelIns= ""
 
                 if change==None or change=="":
                     shortName = pos+dupDelIns
                 else:
                     shortName = dupDelIns+change
 
     return shortName, longName
 
-def allTypeToCode(allType):
+def allTypeToCode(allType, debugId):
     """
     Map the allele type field from ClinVal to one of these classes for filtering:
-        SNV = single nucleotide variant
-        INS = insertion
-        DEL = deletion
-        INDEL = indel
-        DUPL = duplication
-        OTH = other
+        STRUCT = structual alteration
+        LOSS = deletion, copy number loss, mobile element deletion
+        GAIN = duplication, tandem dupl, intrachrom transp, copy number gain
+        INS = insertion, mobile or novel element insertion
+        INV = inversion
+        SEQALT = catch-all category: multiallele cnv, undetermined, others, not defined
+        SEQLEN = short tandem repeat, short repeat, rep exp, rep contraction, NT expansion
+        SUBST = SNV, MNV, multi nucl polymophrism, indel
+        OTH = protein only
+
+        # categories were defined by Ana Benet Pages
+        # see table at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing
 
     In 3/2020, these were all possible values:
 
         fusion  6       0.000%
         complex 61      0.005%
         protein only    103     0.008%
         NT expansion    142     0.011%
         Translocation   302     0.022%
         inversion       583     0.043%
         undetermined variant    866     0.064%
         insertion       7133    0.530%
         indel   8490    0.630%
         short repeat    18778   1.394%
         duplication     34148   2.535%
         copy number loss        40441   3.002%
         copy number gain        40671   3.019%
         deletion        77325   5.740%
         single nucleotide variant       1118033 82.997%
 
     """
-    if allType=="single nucleotide variant":
-        return "SNV"
+    allType = allType.lower() # they changed the casing in Sep 2020
+    # doing this first, as this is 80% of the data -> faster
+    if allType in ["single nucleotide variant", "variation"]:
+        # 'variation' doesn't fit anywhere, but we think that these are mostly OMIM imports, so SNV sounds OK
+        return "SUBST"
+    # the order here matches Ana's coloring table for easier reading
+    elif allType in ["translocation", "fusion", "complex"]:
+        return "STRUCT"
     elif allType in ["deletion", "copy number loss"]:
-        return "DEL"
+        return "LOSS"
     elif allType in ["duplication", "copy number gain", "tandem duplication"]:
-        return "DUPL"
-    elif allType=="indel":
-        return "INDEL"
-    elif allType in ["insertion", "NT expansion", "short repeat"]:
+        return "GAIN"
+    elif allType in ["indel", "insertion"]:
         return "INS"
-    elif allType in ["Translocation", "fusion", "complex", "protein only", "inversion", "undetermined variant"]:
-        return "OTH"
+    elif allType in ["inversion"]:
+        return "INV"
+    elif allType in ["undetermined variant"]:
+        return "SEQALT"
+    elif allType in ["short repeat", "nt expansion", "microsatellite"]:
+        return "SEQLEN"
+    #elif allType in ["variation"]: # appeared in Sep 2020
+        #return "OTH"
+    #elif allType in ["protein only"]: # looks like they removed protein only sometimes in early 2020
+        #return "OTH"
     else:
-        print("found unknown allele type: "+allType)
+        print("found unknown allele type '%s' for accession %s"%(allType, debugId))
         assert(False) # found new allele type in ClinVar table : code needs fixing
 
 def originSimpleToCode(originSimple):
     """
     for filtering, group the simplified origins into simpler origin codes:
     return one of these values:
         "GERM" = germline
         "SOM" = somatic
         "GERMSOM" = germline/somatic
         "NOVO" = de novo
         "UNK" = unknown
 
     as of 3/2020, these are the values in originSimple and how often they appear:
 
     tested-inconclusive     91      0.007%
@@ -203,42 +315,47 @@
     no interpretation for the single variant        564     0.092%
     other   1687    0.274%
     Pathogenic/Likely pathogenic    5209    0.846%
     not provided    8860    1.440%
     Benign/Likely benign    19489   3.167%
     Likely pathogenic       30200   4.907%
     Conflicting interpretations of pathogenicity    30910   5.023%
     Pathogenic      66679   10.835%
     Benign  77551   12.602%
     Likely benign   154870  25.166%
     Uncertain significance  217838  35.399%
 
     """
     pathoStr = pathoStr.lower()
     # don't change the order of these unless you tested it. The order is crucial.
+    # the aim of this order is to err on the side of caution: if a variant is VUS and pathogenic,
+    # we rather classify it as VUS (=triggering manual review). If it's pathogenic and benign at the same 
+    # time, we rather classify it as pathogenic
     if "conflicting" in pathoStr:
         return "CF"
+    if "uncertain" in pathoStr:
+        return "VUS"
+    if "risk factor" in pathoStr:
+        return "RF"
     if "likely pathogenic" in pathoStr:
         return "LP"
+    if "pathogenic" in pathoStr:
+        return "P"
     if "likely benign" in pathoStr:
         return "LB"
-    if "uncertain" in pathoStr:
-        return "UC"
-    if "pathogenic" in pathoStr:
-        return "PG"
     if "benign" in pathoStr:
-        return "BN"
+        return "B"
     return "OT"
 
 def lastMonth(d,x=1):
     """ returns same day as this month, but last month, e.g. for 2013-01-01 
     return 2012-12-01 
     """
 
     days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     newmonth = ((( d.month - 1) - x ) % 12 ) + 1
     newyear  = d.year + ((( d.month - 1) - x ) // 12 ) 
     if d.day > days_of_month[newmonth-1]:
       newday = days_of_month[newmonth-1]
     else:
       newday = d.day
     return datetime( newyear, newmonth, newday)
@@ -323,51 +440,56 @@
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
 
         fields = line.split("\t")
         varType = fields[4]
         if "coding" in varType and not "LRG" in fields[6]:
             allToHgvs[fields[3]]=(fields[6], fields[8])
     return allToHgvs
 
 def accListToHtml(inStr):
     """
     given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
-    to HTML links to the databases
+    to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later.
     """
+    dbVarAcc = ""
     newParts = []
     for part in inStr.split(","):
         fields = part.split(":")
         if len(fields)!=2:
             newParts.append(part)
             continue
 
         db, acc = fields
         accForUrl = acc
 
+        if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv":
+            # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv
+            dbVarAcc = acc
+
         if db in dbToUrl:
             if "OMIM" in db:
                 accForUrl = accForUrl.replace(".", "#")
             url = (dbToUrl[db] % accForUrl)
             newPart = "<a target=_blank href='%s'>%s:%s</a>" % (url, db, acc)
         else:
             newPart = part
         newParts.append(newPart)
-    return ", ".join(newParts)
+    return (", ".join(newParts)), dbVarAcc
 
 def reviewStatusToHtmlStars(reviewStatus):
     """
     In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this:
         1-criteria provided, conflicting interpretations
         2-criteria provided, multiple submitters, no conflicts
         1-criteria provided, single submitter
         0-no assertion criteria provided
         0-no assertion for the individual variant
         0-no assertion provided
         4-practice guideline
         3-reviewed by expert panel
 
     given the review status, return an html string with the correct number of black stars
     """
@@ -395,65 +517,89 @@
 
     emptyStarCount = 4-starCount
 
     htmlList = ["&#9733;"]*starCount 
     htmlList.extend( ["&#9734;"]*emptyStarCount )
 
     htmlStr = "".join(htmlList)
     htmlStr += "&nbsp;&nbsp;<small>based on: %s</small>" % reviewStatus
 
     if starCount==1:
         asciiDesc = "%d star" % starCount
     else:
         asciiDesc = "%d stars" % starCount
     return htmlStr, asciiDesc, starCount
 
+def loadSubmissions(summFname):
+    " load the submissions into the Mysql table for the lollipop track details page "
+    # not doing the load+rename strategy yet, I don't think it makes a difference here
+    logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname)
+    fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv")
+    tmpFname = fh.name
+    cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
+    assert(os.system(cmd)==0)
+
+    cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
+    assert(os.system(cmd)==0)
+
+    cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
+    assert(os.system(cmd)==0)
+
+    cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
+    assert(os.system(cmd)==0)
+
 def downloadFromNcbi(skipDownload):
     " download files from NCBI and return triple of filenames "
     today = date.today().isoformat()
     outSuffix = "%s." % today
 
     baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
     url = baseUrl+"variant_summary.txt.gz"
     outFname = "archive/variant_summary.%s.txt.gz" % today
     filename = outFname
 
     # get the new variant allele file
     varUrl = baseUrl+"variation_allele.txt.gz"
     varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
 
     # get the new hgvs allele file
     hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
     hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
 
+    # get the summary file
+    summUrl = baseUrl+"submission_summary.txt.gz"
+    summFname = join(dirname(filename), "submission_summary.%s.txt.gz" % today)
+
     # get the new VCF file
     vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
     vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today)
 
     if not (isfile(vcfFname)):
         logging.info("Downlading %s to %s" % (vcfUrl, vcfFname))
         open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read())
 
-    if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
+    if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
         open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
         open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
         open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
+        logging.info("Downlading %s to %s" % (summUrl, summFname))
+        open(summFname, "wb").write(urllib.request.urlopen(summUrl).read())
 
-    return filename, varFname, hgvsFname, vcfFname
+    return filename, varFname, hgvsFname, vcfFname, summFname
 
 def parseVcf(vcfFname):
     " parse VCF and return as dict alleleId -> (molConseq) "
     logging.info("Parsing %s" % vcfFname)
     ret = {}
     for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
         # #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
         #1       930248  789256  G       A       .       .       ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
         if line.startswith("#"):
             continue
         chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
         infoParts = info.split(";")
 
         alleleId = None
         mc = None
@@ -469,45 +615,46 @@
                     sys.exit(1)
 
         if mc and alleleId:
             ret[int(alleleId)] = mc
 
     return ret
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
-    filename, varFName, hgvsFname, vcfFname = downloadFromNcbi(options.skipDownload)
+    filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
 
 #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
 allToVcf = parseVcf(vcfFname)
 allToHgvs = parseHgvsFile(hgvsFname)
 
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+# NC_000010.11:g.(?_87925503)_(87965482_?)del
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
 ifh = gzip.open(filename, "rt", encoding="latin_1")
 
 # check the header
 line1 = ifh.readline()
@@ -525,31 +672,31 @@
 
 longCount = 0
 noAssCount = 0
 noMcCount = 0
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
-      submCategories, varId = row
+      submCategories, varId, posVcf, refAllVcf, altAllVcf = row
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
     hgvsCod, hgvsProt =  allToHgvs.get(alleleId, ("", ""))
 
     molConseq = allToVcf.get(int(alleleId))
     if molConseq is None:
         logging.debug("%s has no molConseq" % alleleId)
         noMcCount += 1
         molConseq = ""
 
     if chrom=="" or assembly=="" or assembly=="NCBI36":
@@ -565,35 +712,35 @@
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
 
     if start=="" or end=="":
-        print(("undefined start or end coordinate. record %s"% irvcAcc))
+        print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end))
         continue
 
     if int(start)<0 or int(end)<0:
-        print(("illegal start or end coordinate. record %s"% irvcAcc))
+        print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end))
         continue
 
     # sometimes they seem to inverse their coordinates
     if int(start)>int(end):
         start, end = end, start
 
     if chrom=="chr17" and (end=="217748468" or end=="84062405"):
         print("blacklisted feature")
         continue
 
     # reorder columns for bigBed
     start = str(int(start)-1) # convert to zero-based, half-open
     score = "0"
     strand = "."
     thickStart = start
@@ -602,117 +749,128 @@
     blockCount = "1"
     blockSizes = str(int(end)-int(start))
     blockStarts = "0"
 
     # used until Oct 2017
     # if dbVarAcc.startswith("nsv") or "copy number" in allType:
     # now using just size on genome, see #20268
     # shortening to >= 50bp to bring into line with LOVD and gnomAD
     if (int(end)-int(start))>=50:
         isCnv = True
     else:
         isCnv = False
 
     pathoCode = clinSignToCode(clinSign)
     originCode = originSimpleToCode(originSimple)
-    allTypeCode = allTypeToCode(allType)
+    allTypeCode = allTypeToCode(allType, "|".join(row))
 
     if isCnv:
-        if "gain" in allType or "duplication" in allType or "insertion" in allType:
-            itemRgb = "0,0,255"
-        if "deletion" in allType or "loss" in allType:
-            itemRgb = "255,0,0"
+        typeColors = cnvColors[allTypeCode]
+        #else:
+        #print("CNV encountered with unknown type: %s / %s"%(allType, allTypeCode))
+        #assert(False)
+        itemRgb = typeColors.get(pathoCode)
+        if itemRgb is None:
+            print("CNV encountered with unknown pathoCode: "+pathoCode)
+            assert(False)
+
     else:
-        if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic
+        if pathoCode in ["P", "LP"]: # pathogenic or likely pathogenic
             itemRgb = "210,0,0" # red
-        elif pathoCode in ["BN", "LB"]: # benign or likely benign
+        elif pathoCode in ["B", "LB"]: # benign or likely benign
             itemRgb = "0,210,0" # green
-        elif pathoCode in ["UC"]: # uncertain
+        elif pathoCode in ["VUS", "RF"]: # uncertain
             itemRgb = "0,0,128" # dark-blue
         elif pathoCode in ["CF"]: # conflicting
             itemRgb = "137,121,212" # light-blue
         elif pathoCode in ["OT"]: # other
             itemRgb = "128,128,128" # grey
         else:
             assert(False)# should never happen
 
     varLen = int(end)-int(start)
 
     geneStr = geneSymbol
     if geneId.isdigit() and geneId!="-1":
         geneStr = geneId+"|"+geneSymbol
     if len(geneStr)>250:
         geneStr = "not shown, too many genes"
 
     mouseOverName = name
 
-    if name=="":
+    otherIds, dbVarSsvAcc = accListToHtml(otherIds)
+
+    if isCnv and dbVarSsvAcc!="":
+        shortName = dbVarSsvAcc
+    elif name=="":
         name = "No display name was provided by ClinVar for this variant"
         shortName = "NoName"
         mouseOverName = ""
 
     if len(name)>90:
         name = name[:90]+"..."
     name = varId+"|"+name
 
     if len(mouseOverName)>80:
         mouseOverName = mouseOverName[:80]+"..."
 
     if len(hgvsProt) > 90:
         hgvsProt = hgvsProt[:90]+"..."
     if len(hgvsCod) > 90:
         hgvsCod = hgvsCod[:90]+"..."
 
-    otherIds = accListToHtml(otherIds)
-
-    phenotypeIds = accListToHtml(phenotypeIds)
+    phenotypeIds, _ = accListToHtml(phenotypeIds)
 
     starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
 
     mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign,
             "Origin: "+origin, "Phenotypes: "+phenotypeList]
     # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
     mouseOver = ", ".join(mouseOverParts)
 
     row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr, molConseq,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
         hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
         # these fields are the for the filters, not for the display
-        pathoCode, originCode, allTypeCode, str(varLen), str(starCount)]
+        pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
+        # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
+        varId, dbVarSsvAcc]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
              newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":
         ofh = hg19Bed
         if isCnv:
             ofh = hg19BedCnv
     elif assembly=="GRCh38":
         ofh = hg38Bed
         if isCnv:
             ofh = hg38BedCnv
     else:
         noAssCount +=1
         
+    #for o in row:
+        #print(o, type(o))
     ofh.write("\t".join(row))
     ofh.write("\n")
 
 hg19Bed.close()
 hg38Bed.close()
 hg19BedCnv.close()
 hg38BedCnv.close()
 
 logging.info("%d lines with feature name that was too long, shortened them" % longCount)
 logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
 for fname in fnames:
@@ -726,32 +884,34 @@
         compareWithLastMonth(fname, options.maxDiff)
 
 # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
 tmpFnames = []
 for fname in fnames:
     parts = fname.split(".")
     if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
         base, db, date, ext = parts
     else:
         base, db, ext = parts
         
     outFname = "%s.%s.bb" % (base, db)
     tmpFname = "%s.%s.temp.bb" % (base, db)
 
     outFname = basename(outFname) # strip the archive/ part, put into current dir
-    cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
+    cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
     mustRun(cmd)
 
     tableName = outFname.split('.')[0] # e.g. clinvarMain
 
     tmpFnames.append( (db, tableName, tmpFname, outFname) )
 
 # make it atomic by renaming the temp files to the final file names at the very end
 for db, tableName, tmpFname, finalFname in tmpFnames:
     os.rename(tmpFname, finalFname)
 
 # make sure that the "Data last updated" in the table browser is correct
 for db, tableName, tmpFname, finalFname in tmpFnames:
     cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName)
     mustRun(cmd)
 
+loadSubmissions(summFname)
+
 print("ClinVar update end: OK")