src/hg/utils/otto/clinvar/clinVarToBed 61dccdb3f1405aeee1c397b5247e673cab010911

61dccdb3f1405aeee1c397b5247e673cab010911
max
  Thu Dec 14 05:02:54 2023 -0800
adding version.txt files to clinvar. Also starting work on the --alpha
switch, but not tested yet

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 85c298c..188ac7c 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,993 +1,1036 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
 import tempfile, json
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
+if os.getlogin()!="otto":
+    print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
+    print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
+    print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
+    sys.exit(1)
+
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "MONDO" : "https://monarchinitiative.org/disease/%s",
     "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
 # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file
 # if they add a new one, this script must fail and tdb must be updated.
-possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""])
+possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", "initiator codon variant", ""])
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.
 
 Output goes into the current dir
 
 Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """) 
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb")
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
 parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
 # ==== FUNCTIONs =====
 
 # benign = BN, likely benign = LB, conflicting = CF, likely pathogenic= LP, 
 # pathogenic = PG, other = OT, uncertain = VUS, RF=risk factor
 # colors were defined by Ana Benet Pages
 # - WARNING ON FILTER CHANGES:  -  see #28562
 # Never change these codes. They are in old sessions. If you change them, change the across the whole script
 # and also change the name of the .as field, so the cart variables will be different. You can add new values through:
 cnvColors = {
         "INS" : {
             "OT" : "225,165,0",
             "BN" : "250,214,69",
             "LB" : "250,214,69",
             "CF" : "225,165,0",
             "RF" : "225,165,0",
             "VUS" : "225,165,0",
             "LP" : "199,114,3",
             "PG" : "199,114,3"
             },
         "LOSS" : {
             "OT" : "255,0,0",
             "BN" : "255,98,119",
             "LB" : "255,98,119",
             "CF" : "255,0,0",
             "VUS" : "255,0,0",
             "RF" : "255,0,0",
             "LP" : "153,0,0",
             "PG" : "153,0,0"
             },
         "GAIN" : {
             "OT" : "0,0,225",
             "BN" : "77,184,255",
             "LB" : "77,184,255",
             "CF" : "0,0,225",
             "VUS" : "0,0,225",
             "RF" : "0,0,225",
             "LP" : "0,0,179",
             "PG" : "0,0,179"
             },
         "STRUCT" : {
             "OT" : "0,210,0",
             "BN" : "0,255,0",
             "LB" : "0,255,0",
             "CF" : "0,210,0",
             "VUS" : "0,210,0",
             "RF" : "0,210,0",
             "LP" : "0,128,0",
             "PG" : "0,128,0"
             },
         "INV" : {
             "OT" : "192,0,192",
             "BN" : "255,128,255",
             "LB" : "255,128,255",
             "CF" : "192,0,192",
             "VUS" : "192,0,192",
             "RF" : "192,0,192",
             "LP" : "128,0,128",
             "PG" : "128,0,128"
             },
         "SEQALT" : {
             "OT" : "128,128,128",
             "BN" : "191,191,191",
             "LB" : "191,191,191",
             "CF" : "128,128,128",
             "VUS" : "128,128,128",
             "RF" : "128,128,128",
             "LP" : "64,64,64",
             "PG" : "64,64,64"
             },
         "SEQLEN" : {
             "OT" : "0,179,179",
             "BN" : "0,255,255",
             "LB" : "0,255,255",
             "CF" : "0,179,179",
             "VUS" : "0,179,179",
             "RF" : "0,179,179",
             "LP" : "0,102,102",
             "PG" : "0,102,102"
             },
         "SUBST" : {
             "OT" : "128,128,128",
             "BN" : "191,191,191",
             "LB" : "191,191,191",
             "CF" : "128,128,128",
             "VUS" : "128,128,128",
             "RF" : "128,128,128",
             "LP" : "64,64,64",
             "PG" : "64,64,64"
             },
 }
 
 def mustRun(cmd):
     logging.debug(cmd)
     ret = os.system(cmd)
     if ret!=0:
         print(("Could not run: %s" % cmd))
         sys.exit(1)
 
 def shortenName(name):
     "make the clinvar name shorter and keep a long version for the mouse over "
     cnvMatch = cnvRe.match(name)
     #hgvsMatch = hgvsRe.match(name)
 
     if ":" in name and not "chr" in name:
         longName = name.split(":")[0]
     else:
         longName = name
 
     shortName = name
 
     if cnvMatch != None:
         # some names include the original assembly as a prefix
         band, copy = cnvMatch.groups()
         if copy == None:
             copy = ""
         shortName = band+copy
     elif name.startswith("GRCh"):
         # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
         print(name)
         assert False # name that starts with GRCh but does not match our regex should not happen
     elif "." in name:
         # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
         # so we remove the gene and the protein description
         if ":" in name:
             name = name.split(":")[1]
             longName = name.split(" ")[0]
 
         # handle genomic HGVS NC_000010.11:g.(?_87925503)_(87965482_?)del
         if name.startswith("NC_") and "g." in name and ")" in name:
             shortName = name
         else:
             # strip away the final description like " (p.Ala184del)"
             name = name.split(" ")[0].split("[")[0].split("(")[0]
             shortName = name
 
         if name.endswith("del"):
             shortName = "del"
         elif name.endswith("ins"):
             shortName = "ins"
         elif name.endswith("dup"):
             shortName = "dup"
         else:
             posMatch = posRe.match(name)
             if posMatch!=None:
                 pos, dupDelIns, change = posMatch.groups()
                 if dupDelIns==None:
                     dupDelIns= ""
 
                 if change==None or change=="":
                     shortName = pos+dupDelIns
                 else:
                     shortName = dupDelIns+change
 
     return shortName, longName
 
 def allTypeToCode(allType, debugId):
     """
     Map the allele type field from ClinVal to one of these classes for filtering:
         STRUCT = structual alteration
         LOSS = deletion, copy number loss, mobile element deletion
         GAIN = duplication, tandem dupl, intrachrom transp, copy number gain
         INS = insertion, mobile or novel element insertion
         INV = inversion
         SEQALT = catch-all category: multiallele cnv, undetermined, others, not defined
         SEQLEN = short tandem repeat, short repeat, rep exp, rep contraction, NT expansion
         SUBST = SNV, MNV, multi nucl polymophrism, indel
         OTH = protein only
 
         # categories were defined by Ana Benet Pages
         # see table at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing
 
     In 3/2020, these were all possible values:
 
         fusion  6       0.000%
         complex 61      0.005%
         protein only    103     0.008%
         NT expansion    142     0.011%
         Translocation   302     0.022%
         inversion       583     0.043%
         undetermined variant    866     0.064%
         insertion       7133    0.530%
         indel   8490    0.630%
         short repeat    18778   1.394%
         duplication     34148   2.535%
         copy number loss        40441   3.002%
         copy number gain        40671   3.019%
         deletion        77325   5.740%
         single nucleotide variant       1118033 82.997%
 
     """
     allType = allType.lower() # they changed the casing in Sep 2020
     # doing this first, as this is 80% of the data -> faster
     if allType in ["single nucleotide variant", "variation"]:
         # 'variation' doesn't fit anywhere, but we think that these are mostly OMIM imports, so SNV sounds OK
         return "SUBST"
     # the order here matches Ana's coloring table for easier reading
     elif allType in ["translocation", "fusion", "complex"]:
         return "STRUCT"
     elif allType in ["deletion", "copy number loss"]:
         return "LOSS"
     elif allType in ["duplication", "copy number gain", "tandem duplication"]:
         return "GAIN"
     elif allType in ["indel", "insertion"]:
         return "INS"
     elif allType in ["inversion"]:
         return "INV"
     elif allType in ["undetermined variant"]:
         return "SEQALT"
     elif allType in ["short repeat", "nt expansion", "microsatellite"]:
         return "SEQLEN"
     #elif allType in ["variation"]: # appeared in Sep 2020
         #return "OTH"
     #elif allType in ["protein only"]: # looks like they removed protein only sometimes in early 2020
         #return "OTH"
     else:
         print("found unknown allele type '%s' for accession %s"%(allType, debugId))
         assert(False) # found new allele type in ClinVar table : code needs fixing
 
 def originSimpleToCode(originSimple):
     """
     for filtering, group the simplified origins into simpler origin codes:
     return one of these values:
         "GERM" = germline
         "SOM" = somatic
         "GERMSOM" = germline/somatic
         "NOVO" = de novo
         "UNK" = unknown
 
     as of 3/2020, these are the values in originSimple and how often they appear:
 
     tested-inconclusive     91      0.007%
     not applicable  193     0.014%
     germline/somatic        1808    0.134%
     somatic 7929    0.589%
     not provided    53173   3.947%
     unknown 72601   5.389%
     germline        1211288 89.919%
     """
     if originSimple=="germline":
         return "GERM"
     elif originSimple=="germline/somatic":
         return "GERMSOM"
     elif originSimple in ["not applicable", "not provided", "unknown", "tested-inconclusive"]:
         return "UNK"
     elif originSimple in ["not applicable", "not provided", "unknown"]:
         return "UNK"
     elif originSimple in ["somatic"]:
         return "SOM"
     else:
         print("found new origin string: "+originSimple)
         assert(False) # Clinvar has added a new origin string
 
 def clinSignToCode(pathoStr):
     """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes:
     - benign = BN
     likely benign = LB
     conflicting = CF
     likely pathogenic= LP
     pathogenic = PG
     other = OT
     uncertain = UC
 
     examples:
 
     Benign/Likely benign,other      13      0.002%
     Conflicting interpretations of pathogenicity,risk factor        13      0.002%
     Likely benign,other     18      0.003%  
     Pathogenic,drug response        28      0.005%
     protective      32      0.005%
     Pathogenic,risk factor  33      0.005%
     Pathogenic,other        102     0.017%
     Affects 120     0.019%
     association     202     0.033%
     drug response   387     0.063%
     risk factor     444     0.072%
     no interpretation for the single variant        564     0.092%
     other   1687    0.274%
     Pathogenic/Likely pathogenic    5209    0.846%
     not provided    8860    1.440%
     Benign/Likely benign    19489   3.167%
     Likely pathogenic       30200   4.907%
     Conflicting interpretations of pathogenicity    30910   5.023%
     Pathogenic      66679   10.835%
     Benign  77551   12.602%
     Likely benign   154870  25.166%
     Uncertain significance  217838  35.399%
 
     """
     pathoStr = pathoStr.lower()
     # don't change the order of these unless you tested it. The order is crucial.
     # the aim of this order is to err on the side of caution: if a variant is VUS and pathogenic,
     # we rather classify it as VUS (=triggering manual review). If it's pathogenic and benign at the same 
     # time, we rather classify it as pathogenic
     #
     # Do not changes these codes. See WARNING ON FILTER CHANGES in this script.
     if "conflicting" in pathoStr:
         return "CF"
     if "uncertain" in pathoStr:
         return "VUS"
     if "risk factor" in pathoStr:
         return "RF"
     if "likely pathogenic" in pathoStr:
         return "LP"
     if "pathogenic" in pathoStr:
         return "PG"
     if "likely benign" in pathoStr:
         return "LB"
     if "benign" in pathoStr:
         return "BN"
     return "OT"
 
 def lastMonth(d,x=1):
     """ returns same day as this month, but last month, e.g. for 2013-01-01 
     return 2012-12-01 
     """
 
     days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     newmonth = ((( d.month - 1) - x ) % 12 ) + 1
     newyear  = d.year + ((( d.month - 1) - x ) // 12 ) 
     if d.day > days_of_month[newmonth-1]:
       newday = days_of_month[newmonth-1]
     else:
       newday = d.day
     return datetime( newyear, newmonth, newday)
 
 def compareWithLastMonth(fname, maxDiff):
     " go back one month from current date, make sure that linecount diff is at most maxDiff "
     todayStr = date.today().isoformat()
     previousDate = lastMonth(date.today())
     previousDateStr = previousDate.date().isoformat()
     previousFname = fname.replace(todayStr, previousDateStr)
     if not isfile(previousFname):
         print(("Cannot find %s, cannot compare new data with previous month" % previousFname))
         return
         
     previousCount = sum(1 for line in open(previousFname, encoding="utf8"))
     todayCount = sum(1 for line in open(fname, encoding="utf8"))
     percDiff = (todayCount-previousCount)/float(todayCount)
 
     if percDiff < 0 and maxDiff:
         raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
             (previousFname, previousCount, fname, todayCount))
     if percDiff > maxDiff:
         raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
     # see RM 14350 do more than just linecount
     # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
     cmd = "diff %s %s --speed-large-files | grep '^[\>\<]'  | wc -l" % (fname, previousFname)
     diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
     logging.info("%s: %d lines changed" % (fname, diffCount))
     percDiff = float(diffCount)/todayCount
     if percDiff > maxDiff:
         logging.info(cmd)
-        diffFname =  abspath(join("archive", "lastDiff.txt"))
+        diffFname =  abspath(join("downloads", "lastDiff.txt"))
         cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname)
         os.system(cmd) # ret code is not 0, as there are differences, so no need to check
         raise Exception("too many differences: %d out of %d lines. Look at the differences n the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname))
 
 # I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse
 # the other files again
 #def parseVariationAlleleFile(varFname):
 #    """ return dict allele Id -> variant ID """
 #    logging.info("Parsing %s" % varFname)
 #    allToVar = {}
 #    foundHead = False
 #    expHead = "#VariationID	Type	AlleleID	Interpreted"
 #    for line in gzip.open(varFname):
 #        line = line.rstrip("\n")
 #        if line==expHead:
 #            foundHead = True
 #        if line.startswith("#Var") and "Type" in line:
 #            if not line == expHead:
 #                print(("Header of variation_allele.txt.gz has changed again. Code needs fixing"))
 #                print("found:",line)
 #                print("expected:",expHead)
 #            else:
 #                foundHead = True
 #
 #        if line.startswith("#"):
 #            continue
 #        if not foundHead:
 #            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
 #        fields = line.split("\t")
 #        allToVar[fields[2]]=fields[0]
 #    return allToVar
 
 def parseHgvsFile(hgvsFname):
     " return dict: allele Id -> list of (hgvs coding, hgvs protein) "
     logging.info("Parsing %s" % hgvsFname)
     expHead	= "#Symbol	GeneID	VariationID	AlleleID	Type	Assembly	NucleotideExpression	NucleotideChange	ProteinExpression	ProteinChange	UsedForNaming	Submitted	OnRefSeqGene"
     hg19Hgvs = defaultdict(list)
     hg38Hgvs = defaultdict(list)
     foundHead = False
     for line in gzip.open(hgvsFname, "rt"):
         line = line.rstrip("\n")
         if line==expHead:
             foundHead = True
         if line.startswith("#Symbol") and "GeneID" in line:
             if not line == expHead:
                 print ("Header of hgvs file has changed again. Code needs fixing. Look at the new fields and decide if we should show them.")
                 print(("found:",line))
                 print(("expected:",expHead))
             else:
                 foundHead = True
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
 
         fields = line.split("\t")
         sym, geneId, varId, alleleId, varType, assembly, nuclExpr, nuclChange, protExpr, protChange, usedForNaming, submitted, onLrg = fields
         #varType = fields[4]
         #if "coding" in varType and not "LRG" in fields[6]:
         #assert("|" not in nuclChange and "|" not in protChange)
         #hgvsRow = "|".join((nuclChange, protChange))
         #if ";" in hgvsRow:
             #print(sym, geneId, varId, hgvsRow)
         # this creates invalid HGVS, but it's the only way for now, until we have real JSON tables
         #hgvsRow = hgvsRow.replace(";", ",")
         #assert(";" not in hgvsRow)
         hgvsRow = (nuclChange, protChange)
 
         if assembly=="GRCh37":
             addTo = [hg19Hgvs]
         elif assembly=="GRCh38":
             addTo = [hg38Hgvs]
         elif assembly=="na": # protein changes have no assembly and must be shown on all assemblies
             addTo = [hg19Hgvs, hg38Hgvs]
         elif assembly=="NCBI36":
             continue
         else:
             print("invalid assembly in HGVS: %s, in %s" % (assembly, line))
             assert(False)
 
         for data in addTo:
             data[alleleId].append( hgvsRow )
 
     return hg19Hgvs, hg38Hgvs
 
 def accListToHtml(inStr):
     """
     given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
     to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later.
     A more recent example is  MONDO:MONDO:0019667, which meant that I had to add the "maxsplit" option.
     """
     dbVarAcc = ""
     newParts = []
     for part in inStr.split(","):
         fields = part.split(":", maxsplit=1)
         if len(fields)!=2:
             newParts.append(part)
             continue
 
         db, acc = fields
         accForUrl = acc
 
         if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv":
             # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv
             dbVarAcc = acc
 
         if db in dbToUrl:
             if "OMIM" in db:
                 accForUrl = accForUrl.replace(".", "#")
             url = (dbToUrl[db] % accForUrl)
             newPart = "<a target=_blank href='%s'>%s:%s</a>" % (url, db, acc)
         else:
             newPart = part
         newParts.append(newPart)
     return (", ".join(newParts)), dbVarAcc
 
 def reviewStatusToHtmlStars(reviewStatus):
     """
     In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this:
         1-criteria provided, conflicting interpretations
         2-criteria provided, multiple submitters, no conflicts
         1-criteria provided, single submitter
         0-no assertion criteria provided
         0-no assertion for the individual variant
         0-no assertion provided
         4-practice guideline
         3-reviewed by expert panel
 
     given the review status, return an html string with the correct number of black stars
     """
     revStatus = reviewStatus.strip().lower().replace(" ", "")
     starCount = None
 
     if revStatus == "nointerpretationforthesinglevariant":
         starCount = 0
     if revStatus=="criteriaprovided,conflictinginterpretations":
         starCount = 1
     if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
         starCount = 2
     if revStatus=="criteriaprovided,singlesubmitter":
         starCount = 1
     if revStatus == "practiceguideline":
         starCount = 4
     if revStatus == "reviewedbyexpertpanel":
         starCount = 3
     if "noassertion" in revStatus or revStatus=="-":
         starCount = 0
 
     if starCount == None:
         print(("Illegal review status: %s" % revStatus))
         assert(False)
 
     emptyStarCount = 4-starCount
 
     htmlList = ["&#9733;"]*starCount 
     htmlList.extend( ["&#9734;"]*emptyStarCount )
 
     htmlStr = "".join(htmlList)
     htmlStr += "&nbsp;&nbsp;<small>based on: %s</small>" % reviewStatus
 
     if starCount==1:
         asciiDesc = "%d star" % starCount
     else:
         asciiDesc = "%d stars" % starCount
     return htmlStr, asciiDesc, starCount
 
 def loadSubmissions(summFname):
     " load the submissions into the Mysql table for the lollipop track details page "
     # not doing the load+rename strategy yet, I don't think it makes a difference here
     logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname)
     fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv")
     tmpFname = fh.name
     cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname)
     assert(os.system(cmd)==0)
 
     cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file
     assert(os.system(cmd)==0)
 
     cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
     cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
+def mostCurrentVersionName():
+    " return name of version on FTP site "
+    cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarFullRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
+    verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
+    #<ReleaseSet Dated="2023-09-03" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" Type="full" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/clinvar_public_1.71.xsd">
+    version = verLine.split(" ")[1].split("=")[1].strip('"')
+    return version
+
 def downloadFromNcbi(skipDownload):
-    " download files from NCBI and return triple of filenames "
+    " download files from NCBI and return list of filenames "
     today = date.today().isoformat()
-    outSuffix = "%s." % today
+    downDir = join("downloads", today)
+    try:
+        os.mkdir(downDir)
+    except FileExistsError:
+        pass
+
+    version = mostCurrentVersionName()
+    logging.info("Version on NCBI server is %s" % version)
+    versionFname = join(downDir, "version.txt")
+    open(versionFname, "w").write(version)
 
     baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
     url = baseUrl+"variant_summary.txt.gz"
-    outFname = "archive/variant_summary.%s.txt.gz" % today
+    outFname = join(downDir, "variant_summary.txt.gz")
     filename = outFname
 
     # get the new variant allele file
     varUrl = baseUrl+"variation_allele.txt.gz"
-    varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
+    varFname = join(downDir, "variation_allele.txt.gz")
 
     # get the new hgvs allele file
     hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
-    hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
+    hgvsFname = join(downDir, "hgvs4variation.txt.gz")
 
     # get the summary file
     summUrl = baseUrl+"submission_summary.txt.gz"
-    summFname = join(dirname(filename), "submission_summary.%s.txt.gz" % today)
+    summFname = join(downDir, "submission_summary.txt.gz")
 
     # get the new VCF file
     vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
-    vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today)
+    vcfFname = join(downDir, "GRCh38.clinvar.vcf.gz")
 
     if not (isfile(vcfFname)):
         logging.info("Downlading %s to %s" % (vcfUrl, vcfFname))
         open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read())
 
     if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
         open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
         open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
         open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
         logging.info("Downlading %s to %s" % (summUrl, summFname))
         open(summFname, "wb").write(urllib.request.urlopen(summUrl).read())
 
-    return filename, varFname, hgvsFname, vcfFname, summFname
+    return filename, varFname, hgvsFname, vcfFname, summFname, versionFname
 
 def parseVcf(vcfFname):
     " parse VCF and return as dict alleleId -> (molConseq) "
     logging.info("Parsing %s" % vcfFname)
     ret = {}
     for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
         # #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
         #1       930248  789256  G       A       .       .       ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
         if line.startswith("#"):
             continue
         chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
         infoParts = info.split(";")
 
         alleleId = None
         mc = None
         for info in infoParts:
             key, val = info.split("=")
             if key=="ALLELEID":
                 alleleId = val
             elif key=="MC":
                 mc = val.split("|")[1].split(",")[0].replace("_", " ")
                 if not mc in possMolConseqs:
                     print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
                             "Please update otto script %s" % (mc, __file__))
                     sys.exit(1)
 
         if mc and alleleId:
             ret[int(alleleId)] = mc
 
     return ret
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
-    filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload)
+    filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
+    versionFname = args[4]
 
 #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
 allToVcf = parseVcf(vcfFname)
 hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname)
 
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
 # NC_000010.11:g.(?_87925503)_(87965482_?)del
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
 ifh = gzip.open(filename, "rt", encoding="latin_1")
 
 # check the header
 line1 = ifh.readline()
 if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
     logging.error("current header line: %s" % line1.replace("\t","|"))
     logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
     raise Exception("code needs fixing")
 
 # open out files
-hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
-hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
-hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
-hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
+hg19Bed = open("bed/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38Bed = open("bed/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1")
+hg19BedCnv = open("bed/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1")
+hg38BedCnv = open("bed/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1")
 
 longCount = 0
 noAssCount = 0
 noMcCount = 0
 minusAccs = []
 
 # convert lines
 for line in ifh:
     line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
     line = line.rstrip("\n")
     row = line.split("\t")
     row = [f.replace("\0", "") for f in row ]
 
     alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
       irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
       refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
       submCategories, varId, posVcf, refAllVcf, altAllVcf = row
 
     # changed in July 2018 when the "varId" was merged into this table. It used to be 
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
     if chrom=="" or assembly=="" or assembly=="NCBI36" or assembly=="na":
         noAssCount += 1
         continue
 
     if assembly=="GRCh37":
         hgvs = hg19Hgvs
     elif assembly=="GRCh38":
         hgvs = hg38Hgvs
     else:
         print("Invalid assembly: %s, row %s" % (repr(assembly), repr(row)))
         assert(False)
 
     #hgvsTable =  ";".join(hgvs[alleleId])
     hgvsTable =  json.dumps(hgvs[alleleId])
 
     molConseq = allToVcf.get(int(alleleId))
     if molConseq is None:
         logging.debug("%s has no molConseq" % alleleId)
         noMcCount += 1
         molConseq = ""
 
     if chromAcc=="NT_187513.1":
         chrom = "chrUn_KI270742v1"
     elif chromAcc=="NT_167222.1":
         chrom = "chrUn_gl000228"
     elif chromAcc=="NT_113889.1":
         if assembly=="GRCh37":
             chrom = "chrUn_gl000218"
         else:
             chrom = "chrUn_GL000218v1"
     elif chrom=="Un":
         print("Unknown fix/alt/patch chrom, please adapt clinVarToBed: %s, row: %s" % (chrom, row))
         sys.exit(1)
     else:
         chrom = "chr"+chrom
 
     if chrom=="chrUn" and assembly=="GRCh38":
         print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
         continue
 
     if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
 
     if start=="-1" and end=="-1":
         minusAccs.append(irvcAcc)
         continue
 
     if start=="" or end=="":
         print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end))
         continue
 
     if int(start)<0 or int(end)<0:
         print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end))
         continue
 
     # sometimes they seem to inverse their coordinates
     if int(start)>int(end):
         start, end = end, start
 
     if chrom=="chr17" and (end=="217748468" or end=="84062405"):
         print("blacklisted feature")
         continue
 
     # reorder columns for bigBed
     start = str(int(start)-1) # convert to zero-based, half-open
     score = "0"
     strand = "."
     thickStart = start
     thickEnd = end
     itemRgb = "0"
     blockCount = "1"
     blockSizes = str(int(end)-int(start))
     blockStarts = "0"
 
     # used until Oct 2017
     # if dbVarAcc.startswith("nsv") or "copy number" in allType:
     # now using just size on genome, see #20268
     # shortening to >= 50bp to bring into line with LOVD and gnomAD
     if (int(end)-int(start))>=50:
         isCnv = True
     else:
         isCnv = False
 
     pathoCode = clinSignToCode(clinSign)
     originCode = originSimpleToCode(originSimple)
     allTypeCode = allTypeToCode(allType, "|".join(row))
 
     if isCnv:
         typeColors = cnvColors[allTypeCode]
         #else:
         #print("CNV encountered with unknown type: %s / %s"%(allType, allTypeCode))
         #assert(False)
         itemRgb = typeColors.get(pathoCode)
         if itemRgb is None:
             print("CNV encountered with unknown pathoCode: "+pathoCode)
             assert(False)
 
     else:
         if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic
             itemRgb = "210,0,0" # red
         elif pathoCode in ["BN", "LB"]: # benign or likely benign
             itemRgb = "0,210,0" # green
         elif pathoCode in ["VUS", "RF"]: # uncertain
             itemRgb = "0,0,128" # dark-blue
         elif pathoCode in ["CF"]: # conflicting
             itemRgb = "137,121,212" # light-blue
         elif pathoCode in ["OT"]: # other
             itemRgb = "128,128,128" # grey
         else:
             assert(False)# should never happen
 
     varLen = int(end)-int(start)
 
     geneStr = geneSymbol
     if geneId.isdigit() and geneId!="-1":
         geneStr = geneId+"|"+geneSymbol
     if len(geneStr)>250:
         geneStr = "not shown, too many genes"
 
     mouseOverName = name
 
     otherIds, dbVarSsvAcc = accListToHtml(otherIds)
 
     if isCnv and dbVarSsvAcc!="":
         shortName = dbVarSsvAcc
     elif name=="":
         name = "No display name was provided by ClinVar for this variant"
         shortName = "NoName"
         mouseOverName = ""
 
     if len(name)>90:
         name = name[:90]+"..."
 
     vcvId = "VCV%09d" % int(varId)
     name = varId+"|"+vcvId+" : "+name
 
     if len(mouseOverName)>80:
         mouseOverName = mouseOverName[:80]+"..."
 
     #if len(hgvsProt) > 90:
         #hgvsProt = hgvsProt[:90]+"..."
     #if len(hgvsCod) > 90:
         #hgvsCod = hgvsCod[:90]+"..."
 
     phenotypeIds, _ = accListToHtml(phenotypeIds)
 
     starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
 
     phenotypeList = ", ".join(phenotypeList.split("|"))
 
     mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign,
             "Origin: "+origin, "Phenotypes: "+phenotypeList]
     # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars
     mouseOver = ", ".join(mouseOverParts)
 
     snpAcc = "rs"+snpAcc # dbSnp links changed in mid/late 2022
 
     row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr, molConseq,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
         hgvsTable, "", numberSubmitters, lastEval, guidelines, otherIds, mouseOver,
         # these fields are the for the filters, not for the display
         pathoCode, originCode, allTypeCode, str(varLen), str(starCount),
         # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field
         varId, dbVarSsvAcc]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
              newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":
         ofh = hg19Bed
         if isCnv:
             ofh = hg19BedCnv
     elif assembly=="GRCh38":
         ofh = hg38Bed
         if isCnv:
             ofh = hg38BedCnv
     else:
         noAssCount +=1
         
     ofh.write("\t".join(row))
     ofh.write("\n")
 
 hg19Bed.close()
 hg38Bed.close()
 hg19BedCnv.close()
 hg38BedCnv.close()
 
 logging.info("%d lines with feature name that was too long, shortened them" % longCount)
 logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
 logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
 for fname in fnames:
     cmd = "bedSort %s %s" % (fname, fname)
     logging.debug(cmd)
     assert(os.system(cmd)==0)
 
 # check new bed files against last months's bed files
 if options.maxDiff:
     for fname in fnames:
         compareWithLastMonth(fname, options.maxDiff)
 
+bigBedDir = "bigBed"
+if options.isAlpha:
+    bigBedDir = "bigBedAlpha"
+
 # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
 tmpFnames = []
 for fname in fnames:
     parts = fname.split(".")
     if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
         base, db, date, ext = parts
     else:
         base, db, ext = parts
         
-    outFname = "%s.%s.bb" % (base, db)
     tmpFname = "%s.%s.temp.bb" % (base, db)
+    outFname = "%s/%s.%s.bb" % (bigBedDir, base, db)
 
     outFname = basename(outFname) # strip the archive/ part, put into current dir
     cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
     mustRun(cmd)
 
     tableName = outFname.split('.')[0] # e.g. clinvarMain
 
     tmpFnames.append( (db, tableName, tmpFname, outFname) )
 
 # make it atomic by renaming the temp files to the final file names at the very end
 for db, tableName, tmpFname, finalFname in tmpFnames:
     os.rename(tmpFname, finalFname)
 
-# make sure that the "Data last updated" in the table browser is correct
-monYear = date.today().strftime("%Y-%m")
+# create the version.txt file
+clinvarVersion = open(versionFname).read()
+versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d"))
+outFname = join(bigBedDir, "version.txt")
+with open(outFname, "w") as ofh:
+    ofh.write(versionString)
+
+if not options.isAlpha:
     for db, tableName, tmpFname, finalFname in tmpFnames:
-    gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
-    cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
-    mustRun(cmd)
+        # not doing this anymore, using bigDataUrl now in trackDb
+        #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
+        #mustRun(cmd)
 
+        yearMonDay = date.today().strftime("%Y-%m-%d")
+        gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
         # put a copy into the archive
-    outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, monYear)
+        outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
         if not isdir(outDir):
             os.mkdir(outDir)
         archiveFname = join(outDir, basename(gbdbFname))
         logging.info("Creating %s" % archiveFname)
         cmd = "cp %s %s" % (finalFname, archiveFname)
         mustRun(cmd)
 
+    # add the version.txt files
+    for db in ["hg38", "hg19"]:
+        archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
+        cmd = "cp %s/version.txt %s" % (bigBedDir, archDir)
+        mustRun(cmd)
+
 loadSubmissions(summFname)
 
 print("ClinVar update end: OK")