e95d24a1c7ced8663574b8f90aa255b897c3c423
max
Fri Jan 31 05:05:55 2020 -0800
adding a few of our brand new bigBed filters to the clinvar track, refs #24850
diff --git src/hg/utils/otto/clinVarToBed src/hg/utils/otto/clinVarToBed
new file mode 100755
index 0000000..ee04313
--- /dev/null
+++ src/hg/utils/otto/clinVarToBed
@@ -0,0 +1,610 @@
+#!/usr/bin/env python3
+
+import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile, abspath
+from datetime import date, datetime, timedelta
+
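+# URL templates for the external databases referenced in the OtherIDs and PhenotypeIDS columns; used to build HTML links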
+dbToUrl = {
+ "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
+ "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
+ "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
+ "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
+ "OMIM" : "http://www.omim.org/entry/%s",
+ "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
+}
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main ClinVar tab-separated files to four BED files, split into CNVs and shorter mutations, for both hg19 and hg38, and convert them all to bigBed
+
+Output goes into the current dir
+
+The main input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
+""")
+
+parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
+parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
+parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
+(options, args) = parser.parse_args()
+
+if options.debug:
+ logging.basicConfig(level=logging.DEBUG)
+else:
+ logging.basicConfig(level=logging.INFO)
+
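+# expected (tab-separated) header of variant_summary.txt.gz; if NCBI changes it, the check further below raises an exception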
+clinvarExpectHeaders = "#AlleleID\tType\tName\tGeneID\tGeneSymbol\tHGNC_ID\tClinicalSignificance\tClinSigSimple\tLastEvaluated\tRS# (dbSNP)\tnsv/esv (dbVar)\tRCVaccession\tPhenotypeIDS\tPhenotypeList\tOrigin\tOriginSimple\tAssembly\tChromosomeAccession\tChromosome\tStart\tStop\tReferenceAllele\tAlternateAllele\tCytogenetic\tReviewStatus\tNumberSubmitters\tGuidelines\tTestedInGTR\tOtherIDs\tSubmitterCategories\tVariationID\n"
+# ==== FUNCTIONs =====
+
+def mustRun(cmd):
+ logging.debug(cmd)
+ ret = os.system(cmd)
+ if ret!=0:
+ print(("Could not run: %s" % cmd))
+ sys.exit(1)
+
+def shortenName(name):
+ "make the clinvar name shorter and keep a long version for the mouse over "
+ cnvMatch = cnvRe.match(name)
+ #hgvsMatch = hgvsRe.match(name)
+
+ if ":" in name and not "chr" in name:
+ longName = name.split(":")[0]
+ else:
+ longName = name
+
+ shortName = name
+
+ if cnvMatch != None:
+ # some names include the original assembly as a prefix
+ band, copy = cnvMatch.groups()
+ if copy == None:
+ copy = ""
+ shortName = band+copy
+ elif name.startswith("GRCh"):
+ # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+ print(name)
+ assert False # name that starts with GRCh but does not match our regex should not happen
+ elif "." in name:
+ # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
+ # so we remove the gene and the protein description
+ if ":" in name:
+ name = name.split(":")[1]
+ longName = name.split(" ")[0]
+ name = name.split(" ")[0].split("[")[0].split("(")[0]
+ shortName = name
+
+ if name.endswith("del"):
+ shortName = "del"
+ elif name.endswith("ins"):
+ shortName = "ins"
+ elif name.endswith("dup"):
+ shortName = "dup"
+ else:
+ posMatch = posRe.match(name)
+ if posMatch!=None:
+ pos, dupDelIns, change = posMatch.groups()
+ if dupDelIns==None:
+ dupDelIns= ""
+
+ if change==None or change=="":
+ shortName = pos+dupDelIns
+ else:
+ shortName = dupDelIns+change
+
+ return shortName, longName
+
+def clinSignToPathoCode(pathoStr):
+    """ there are hundreds of possible pathogenicity descriptions. Flatten them into one of seven codes:
+    benign            = BN
+    likely benign     = LB
+    conflicting       = CF
+    likely pathogenic = LP
+    pathogenic        = PG
+    other             = OT
+    uncertain         = UC
+
+ examples:
+
+ Benign/Likely benign,other 13 0.002%
+ Conflicting interpretations of pathogenicity,risk factor 13 0.002%
+ Likely benign,other 18 0.003%
+ Pathogenic,drug response 28 0.005%
+ protective 32 0.005%
+ Pathogenic,risk factor 33 0.005%
+ Pathogenic,other 102 0.017%
+ Affects 120 0.019%
+ association 202 0.033%
+ drug response 387 0.063%
+ risk factor 444 0.072%
+ no interpretation for the single variant 564 0.092%
+ other 1687 0.274%
+ Pathogenic/Likely pathogenic 5209 0.846%
+ not provided 8860 1.440%
+ Benign/Likely benign 19489 3.167%
+ Likely pathogenic 30200 4.907%
+ Conflicting interpretations of pathogenicity 30910 5.023%
+ Pathogenic 66679 10.835%
+ Benign 77551 12.602%
+ Likely benign 154870 25.166%
+ Uncertain significance 217838 35.399%
+
+ """
+ pathoStr = pathoStr.lower()
+ # don't change the order of these unless you tested it. The order is crucial.
+ if "conflicting" in pathoStr:
+ return "CF"
+ if "likely pathogenic" in pathoStr:
+ return "LP"
+ if "likely benign" in pathoStr:
+ return "LB"
+ if "uncertain" in pathoStr:
+ return "UC"
+ if "pathogenic" in pathoStr:
+ return "PG"
+ if "benign" in pathoStr:
+ return "BN"
+ return "OT"
+
+def lastMonth(d, x=1):
+    """ returns the same day of the month, x months earlier (default: one month),
+    e.g. for 2013-01-01 return 2012-12-01. The day is clamped to the length of the target month.
+    """
+
+ days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+ newmonth = ((( d.month - 1) - x ) % 12 ) + 1
+ newyear = d.year + ((( d.month - 1) - x ) // 12 )
+ if d.day > days_of_month[newmonth-1]:
+ newday = days_of_month[newmonth-1]
+ else:
+ newday = d.day
+ return datetime( newyear, newmonth, newday)
+
+def compareWithLastMonth(fname, maxDiff):
+ " go back one month from current date, make sure that linecount diff is at most maxDiff "
+ todayStr = date.today().isoformat()
+ previousDate = lastMonth(date.today())
+ previousDateStr = previousDate.date().isoformat()
+ previousFname = fname.replace(todayStr, previousDateStr)
+ if not isfile(previousFname):
+ print(("Cannot find %s, cannot compare new data with previous month" % previousFname))
+ return
+
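+    # first check: the total line count must not shrink and must not grow by more than maxDiff (a ratio)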
+ previousCount = sum(1 for line in open(previousFname))
+ todayCount = sum(1 for line in open(fname))
+ percDiff = (todayCount-previousCount)/float(todayCount)
+
+    if percDiff < 0 and maxDiff:
+        raise Exception("line count decreased since last month: %s %d, %s %d. Run without --maxDiff or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
+            (previousFname, previousCount, fname, todayCount))
+    if percDiff > maxDiff:
+        raise Exception("line count increased too much: last month %d, today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed call and run it only with --auto, or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
+ # see RM 14350 do more than just linecount
+ # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
+    cmd = "diff %s %s --speed-large-files | grep '^[<>]' | wc -l" % (fname, previousFname)
+ diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
+ logging.info("%s: %d lines changed" % (fname, diffCount))
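+    # second check: the fraction of changed lines reported by diff must also stay below maxDiff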
+ percDiff = float(diffCount)/todayCount
+ if percDiff > maxDiff:
+ logging.info(cmd)
+ diffFname = abspath(join("archive", "lastDiff.txt"))
+ cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname)
+ os.system(cmd) # ret code is not 0, as there are differences, so no need to check
+        raise Exception("too many differences: %d out of %d lines. Look at the differences in the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname))
+
+# I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse
+# the other files again
+#def parseVariationAlleleFile(varFname):
+# """ return dict allele Id -> variant ID """
+# logging.info("Parsing %s" % varFname)
+# allToVar = {}
+# foundHead = False
+# expHead = "#VariationID Type AlleleID Interpreted"
+# for line in gzip.open(varFname):
+# line = line.rstrip("\n")
+# if line==expHead:
+# foundHead = True
+# if line.startswith("#Var") and "Type" in line:
+# if not line == expHead:
+# print(("Header of variation_allele.txt.gz has changed again. Code needs fixing"))
+# print("found:",line)
+# print("expected:",expHead)
+# else:
+# foundHead = True
+#
+# if line.startswith("#"):
+# continue
+# if not foundHead:
+# raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
+# fields = line.split("\t")
+# allToVar[fields[2]]=fields[0]
+# return allToVar
+
+def parseHgvsFile(hgvsFname):
+ " return dict: allele Id -> (hgvs coding, hgvs protein) "
+ logging.info("Parsing %s" % hgvsFname)
+    expHead = "#Symbol\tGeneID\tVariationID\tAlleleID\tType\tAssembly\tNucleotideExpression\tNucleotideChange\tProteinExpression\tProteinChange\tUsedForNaming\tSubmitted\tOnRefSeqGene"
+ allToHgvs = {}
+ foundHead = False
+ for line in gzip.open(hgvsFname, "rt"):
+ line = line.rstrip("\n")
+ if line==expHead:
+ foundHead = True
+ if line.startswith("#Symbol") and "GeneID" in line:
+ if not line == expHead:
+                print("Header of hgvs file has changed again. Code needs fixing")
+                print("found:    ", line)
+                print("expected: ", expHead)
+ else:
+ foundHead = True
+
+ if line.startswith("#"):
+ continue
+ if not foundHead:
+ raise Exception("Invalid Header. Code needs fixing, again.")
+
+ fields = line.split("\t")
+ varType = fields[4]
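+        # keep only coding HGVS expressions and skip LRG-based nucleotide expressions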
+ if "coding" in varType and not "LRG" in fields[6]:
+ allToHgvs[fields[3]]=(fields[6], fields[8])
+ return allToHgvs
+
+def accListToHtml(inStr):
+ """
+ given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
+ to HTML links to the databases
+ """
+ newParts = []
+ for part in inStr.split(","):
+ fields = part.split(":")
+ if len(fields)!=2:
+ newParts.append(part)
+ continue
+
+ db, acc = fields
+ accForUrl = acc
+
+ if db in dbToUrl:
+ if "OMIM" in db:
+ accForUrl = accForUrl.replace(".", "#")
+ url = (dbToUrl[db] % accForUrl)
+            newPart = '<a href="%s" target="_blank">%s:%s</a>' % (url, db, acc)
+ else:
+ newPart = part
+ newParts.append(newPart)
+ return ", ".join(newParts)
+
+def reviewStatusToHtmlStars(reviewStatus):
+ """
+    According to an email from Donna (maglott@ncbi.nlm.nih.gov), the mapping is:
+ 1-criteria provided, conflicting interpretations
+ 2-criteria provided, multiple submitters, no conflicts
+ 1-criteria provided, single submitter
+ 0-no assertion criteria provided
+ 0-no assertion for the individual variant
+ 0-no assertion provided
+ 4-practice guideline
+ 3-reviewed by expert panel
+
+    given the review status, return an HTML string with the matching number of filled stars, an ASCII description like "2 stars", and the numeric star count
+ """
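+    # normalize: lowercase and remove spaces so the string comparisons below are robust against formatting differences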
+ revStatus = reviewStatus.strip().lower().replace(" ", "")
+ starCount = None
+
+ if revStatus == "nointerpretationforthesinglevariant":
+ starCount = 0
+ if revStatus=="criteriaprovided,conflictinginterpretations":
+ starCount = 1
+ if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
+ starCount = 2
+ if revStatus=="criteriaprovided,singlesubmitter":
+ starCount = 1
+ if revStatus == "practiceguideline":
+ starCount = 4
+ if revStatus == "reviewedbyexpertpanel":
+ starCount = 3
+ if "noassertion" in revStatus or revStatus=="-":
+ starCount = 0
+
+ if starCount == None:
+ print(("Illegal review status: %s" % revStatus))
+ assert(False)
+
+ emptyStarCount = 4-starCount
+
+ htmlList = ["★"]*starCount
+ htmlList.extend( ["☆"]*emptyStarCount )
+
+ htmlStr = "".join(htmlList)
+ htmlStr += " based on: %s" % reviewStatus
+
+ asciiDesc = "%d stars" % starCount
+ return htmlStr, asciiDesc, starCount
+
+def downloadFromNcbi(skipDownload):
+ " download files from NCBI and return triple of filenames "
+ today = date.today().isoformat()
+ outSuffix = "%s." % today
+
+ url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
+ outFname = "archive/variant_summary.%s.txt.gz" % today
+ filename = outFname
+
+ # get the new variant allele file
+ varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz"
+ varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
+
+ # get the new hgvs allele file
+ hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz"
+ hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
+
+ if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
+        logging.info("Downloading %s to %s" % (url, outFname))
+        open(outFname, "wb").write(urllib.request.urlopen(url).read())
+        logging.info("Downloading %s to %s" % (hgvsUrl, hgvsFname))
+        open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
+        logging.info("Downloading %s to %s" % (varUrl, varFname))
+        open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
+ return filename, varFname, hgvsFname
+
+# ----------- MAIN --------------
+if args==[] and not options.auto:
+ parser.print_help()
+ exit(1)
+
+outSuffix = ""
+
+if options.auto:
+    filename, varFname, hgvsFname = downloadFromNcbi(options.skipDownload)
+else:
+ filename = args[0]
+ varFname = args[1]
+ hgvsFname = args[2]
+
+#allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
+allToHgvs = parseHgvsFile(hgvsFname)
+
+# e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
+# GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
+# GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
+# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
+
+hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
+
+# e.g.
+# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
+# c.80A>T
+# c.740+356C>T
+# c.3839_3855del17
+posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
+
+ifh = gzip.open(filename, "rt")
+
+# check the header
+line1 = ifh.readline()
+if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
+ logging.error("ClinVar has changed their header line again")
+ logging.error("current header line: %s" % line1.replace("\t","|"))
+ logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
+ raise Exception("code needs fixing")
+
+# open out files
+hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w")
+hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w")
+hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w")
+hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w")
+
+longCount = 0
+noAssCount = 0
+
+# convert lines
+for line in ifh:
+    line = line.replace(", ", ",").replace(";", ",") # normalize list separators to plain commas (bigBed conventions)
+ line = line.rstrip("\n")
+ row = line.split("\t")
+ row = [f.replace("\0", "") for f in row ]
+
+ alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
+ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
+ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
+ submCategories, varId = row
+
+ # changed in July 2018 when the "varId" was merged into this table. It used to be
+ # in allToVar
+ if varId=="":
+        logging.warning("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
+
+ hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", ""))
+
+ if chrom=="" or assembly=="" or assembly=="NCBI36":
+ noAssCount += 1
+ continue
+
+ if chrom=="Un" and assembly=="GRCh38":
+ print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
+ continue
+ chrom = "chr"+chrom
+
+ if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
+ if assembly=="GRCh37":
+            chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT was added to hg19 in 2020
+ else:
+ chrom = "chrM"
+
+ shortName, longName = shortenName(name)
+
+ if len(shortName)>20:
+ shortName = shortName[:17]+"..."
+ longCount+=1
+
+ if len(longName)>60:
+ longName = longName[:60]+"..."
+
+ if start=="" or end=="":
+ print(("undefined start or end coordinate. record %s"% irvcAcc))
+ continue
+
+ if int(start)<0 or int(end)<0:
+ print(("illegal start or end coordinate. record %s"% irvcAcc))
+ continue
+
+    # sometimes the start and end coordinates are swapped in the input
+ if int(start)>int(end):
+ start, end = end, start
+
+ if chrom=="chr17" and (end=="217748468" or end=="84062405"):
+ print("blacklisted feature")
+ continue
+
+ # reorder columns for bigBed
+ start = str(int(start)-1) # convert to zero-based, half-open
+ score = "0"
+ strand = "."
+ thickStart = start
+ thickEnd = end
+ itemRgb = "0"
+ blockCount = "1"
+ blockSizes = str(int(end)-int(start))
+ blockStarts = "0"
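+    # note: the BED score column is filled with the review-status star count when the output row is assembled below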
+
+ # used until Oct 2017
+ # if dbVarAcc.startswith("nsv") or "copy number" in allType:
+ # now using just size on genome, see #20268
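+    # variants longer than 100 bp go into the separate CNV tracks, all others into the main tracks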
+ if (int(end)-int(start))>100:
+ isCnv = True
+ else:
+ isCnv = False
+
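+    # color CNVs by gain/loss and all other variants by pathogenicity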
+ if isCnv:
+ if "gain" in allType or "duplication" in allType:
+ itemRgb = "0,0,255"
+ if "deletion" in allType or "loss" in allType:
+ itemRgb = "255,0,0"
+ else:
+ isPat, isBen = False, False
+ if "pathogenic" in clinSign.lower():
+ isPat = True
+ if "benign" in clinSign.lower():
+ isBen = True
+
+ if (isPat==True and isBen==True) or (isPat==False and isBen==False):
+ itemRgb = "128,128,128"
+ elif isPat==True:
+ itemRgb = "210,0,0"
+ elif isBen==True:
+ itemRgb = "0,210,0"
+ else:
+ assert(False)# should never happen
+
+ pathoCode = clinSignToPathoCode(clinSign)
+ varLen = int(end)-int(start)
+
+ geneStr = geneSymbol
+ if geneId.isdigit() and geneId!="-1":
+ geneStr = geneId+"|"+geneSymbol
+ if len(geneStr)>250:
+ geneStr = "not shown, too many genes"
+
+ if name=="":
+ name = "No display name was provided by ClinVar for this variant"
+ shortName = "NoName"
+
+ if len(name)>90:
+ name = name[:90]+"..."
+ name = varId+"|"+name
+
+ if len(hgvsProt) > 90:
+ hgvsProt = hgvsProt[:90]+"..."
+ if len(hgvsCod) > 90:
+ hgvsCod = hgvsCod[:90]+"..."
+
+ otherIds = accListToHtml(otherIds)
+
+ phenotypeIds = accListToHtml(phenotypeIds)
+
+ starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
+
+ mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"]
+ mouseOver = ", ".join(mouseOverParts)
+
+ row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
+ blockCount, blockSizes, blockStarts,
+ name, clinSign, starRatingHtml, allType, geneStr,
+ snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
+ hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)]
+
+ # replace clinvar's placeholders with real empty fields
+ newRow = []
+ for x in row:
+ if x in ["-1", "-"]:
+ newRow.append("")
+ else:
+ newRow.append(x)
+ row = newRow
+
+ if assembly=="GRCh37":
+ ofh = hg19Bed
+ if isCnv:
+ ofh = hg19BedCnv
+ elif assembly=="GRCh38":
+ ofh = hg38Bed
+ if isCnv:
+ ofh = hg38BedCnv
+    else:
+        noAssCount += 1
+        continue
+
+ ofh.write("\t".join(row))
+ ofh.write("\n")
+
+hg19Bed.close()
+hg38Bed.close()
+hg19BedCnv.close()
+hg38BedCnv.close()
+
+logging.info("%d lines had a feature name that was too long and was shortened" % longCount)
+logging.info("%d lines had no chromosome, no assembly or an old assembly and were skipped" % noAssCount)
+
+fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
+
+# sort the files
+for fname in fnames:
+ cmd = "bedSort %s %s" % (fname, fname)
+ logging.debug(cmd)
+ assert(os.system(cmd)==0)
+
+# check new bed files against last month's bed files
+if options.maxDiff:
+ for fname in fnames:
+ compareWithLastMonth(fname, options.maxDiff)
+
+# convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
+tmpFnames = []
+for fname in fnames:
+ parts = fname.split(".")
+ if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
+        base, db, dateStr, ext = parts  # avoid shadowing datetime.date imported above
+ else:
+ base, db, ext = parts
+
+ outFname = "%s.%s.bb" % (base, db)
+ tmpFname = "%s.%s.temp.bb" % (base, db)
+
+ outFname = basename(outFname) # strip the archive/ part, put into current dir
+ cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
+ mustRun(cmd)
+
+ tableName = outFname.split('.')[0] # e.g. clinvarMain
+
+ tmpFnames.append( (db, tableName, tmpFname, outFname) )
+
+# make it atomic by renaming the temp files to the final file names at the very end
+for db, tableName, tmpFname, finalFname in tmpFnames:
+ os.rename(tmpFname, finalFname)
+
+# make sure that the "Data last updated" in the table browser is correct
+for db, tableName, tmpFname, finalFname in tmpFnames:
+ cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName)
+ mustRun(cmd)
+
+print("ClinVar update end: OK")