src/hg/utils/otto/clinvar/clinVarToBed aaec16b62487ea4d0a0c19193428917e095c756b

aaec16b62487ea4d0a0c19193428917e095c756b
jnavarr5
  Tue Nov 25 14:10:47 2025 -0800
Updating the clinVarToBed script to match the one the one that is running on dev, no Redmine

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index a014dc29d7a..8463933a900 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -1,19 +1,19 @@
 #!/usr/bin/env python3
 
 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
-import tempfile, json
+import tempfile, json, glob
 from collections import defaultdict
 from os.path import join, basename, dirname, isfile, abspath, isdir
 from datetime import date, datetime, timedelta
 
 dbToUrl = {
     "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
     "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
     "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
     "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
     "OMIM" : "http://www.omim.org/entry/%s",
     "MONDO" : "https://monarchinitiative.org/disease/%s",
     "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s",
     "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
 }
 
@@ -27,31 +27,31 @@
 
 # make sure that if the second list is edited, it is never out of sync with the first list
 assert(len(set(truncConseqs - possMolConseqs))==0)
 
 # === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
 parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed.
 
 Output goes into the current dir
 
 Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/
 """) 
 
 parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
 parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb")
 parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
-parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
+parser.add_option("", "--onlyNew", dest="onlyNew", action="store_true", help="Stop with no output to stdout if there is no new version on the Clinvar website")
 parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
 (options, args) = parser.parse_args()
 
 if options.debug:
     logging.basicConfig(level=logging.DEBUG)
 else:
     logging.basicConfig(level=logging.INFO)
 
 if not options.isAlpha and os.getlogin()!="otto":
     print("Please do not run this script as a normal user, you will run into all sorts of permission problems.")
     print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.")
     print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.")
     sys.exit(1)
 
 #clinvarExpectHeaders = "#AlleleID	Type	Name	GeneID	GeneSymbol	HGNC_ID	ClinicalSignificance	ClinSigSimple	LastEvaluated	RS# (dbSNP)	nsv/esv (dbVar)	RCVaccession	PhenotypeIDS	PhenotypeList	Origin	OriginSimple	Assembly	ChromosomeAccession	Chromosome	Start	Stop	ReferenceAllele	AlternateAllele	Cytogenetic	ReviewStatus	NumberSubmitters	Guidelines	TestedInGTR	OtherIDs	SubmitterCategories	VariationID	PositionVCF	ReferenceAlleleVCF	AlternateAlleleVCF\n"
@@ -141,31 +141,31 @@
             "BN" : "191,191,191",
             "LB" : "191,191,191",
             "CF" : "128,128,128",
             "VUS" : "128,128,128",
             "RF" : "128,128,128",
             "LP" : "64,64,64",
             "PG" : "64,64,64"
             },
 }
 
 def mustRun(cmd):
     logging.debug(cmd)
     ret = os.system(cmd)
     if ret!=0:
         print(("Could not run: %s" % cmd))
-        sys.exit(1)
+        raise
 
 def shortenName(name):
     "make the clinvar name shorter and keep a long version for the mouse over "
     cnvMatch = cnvRe.match(name)
     #hgvsMatch = hgvsRe.match(name)
 
     if ":" in name and not "chr" in name:
         longName = name.split(":")[0]
     else:
         longName = name
 
     shortName = name
 
     if cnvMatch != None:
         # some names include the original assembly as a prefix
@@ -372,38 +372,35 @@
 
 def lastMonth(d,x=1):
     """ returns same day as this month, but last month, e.g. for 2013-01-01 
     return 2012-12-01 
     """
 
     days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     newmonth = ((( d.month - 1) - x ) % 12 ) + 1
     newyear  = d.year + ((( d.month - 1) - x ) // 12 ) 
     if d.day > days_of_month[newmonth-1]:
       newday = days_of_month[newmonth-1]
     else:
       newday = d.day
     return datetime( newyear, newmonth, newday)
 
-def compareWithLastMonth(fname, maxDiff):
-    " go back one month from current date, make sure that linecount diff is at most maxDiff "
-    todayStr = date.today().isoformat()
-    previousDate = lastMonth(date.today())
-    previousDateStr = previousDate.date().isoformat()
-    previousFname = fname.replace(todayStr, previousDateStr)
+def compareWithPrevRun(fname, maxDiff):
+    " make sure that linecount diff is at most maxDiff "
+    previousFname = "prevRun/"+basename(fname)
     if not isfile(previousFname):
-        print(("Cannot find %s, cannot compare new data with previous month" % previousFname))
+        print(("Cannot find %s, cannot compare new data with previous run" % previousFname))
         return
         
     previousCount = sum(1 for line in open(previousFname, encoding="utf8"))
     todayCount = sum(1 for line in open(fname, encoding="utf8"))
     percDiff = (todayCount-previousCount)/float(todayCount)
 
     if percDiff < 0 and maxDiff:
         raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
             (previousFname, previousCount, fname, todayCount))
     if percDiff > maxDiff:
         raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
     # see RM 14350 do more than just linecount
     # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
     cmd = "diff %s %s --speed-large-files | grep '^[><]'  | wc -l" % (fname, previousFname)
     diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
@@ -601,70 +598,77 @@
     cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname)
     print(cmd)
     assert(os.system(cmd)==0)
 
     cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname)
     assert(os.system(cmd)==0)
 
 def mostCurrentVersionName():
     " return name of version on FTP site "
     cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1'
     verLine = subprocess.check_output(cmd, shell=True, encoding="utf8")
     #<ClinVarVariationRelease xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:noNamespaceSchemaLocation="http://ftp.ncbi.nlm.nih.gov/pub/clinvar/xsd_public/ClinVar_VCV_2.0.xsd" ReleaseDate="2024-03-06">
     version = verLine.split(" ")[3].split("=")[1].split('"')[1]
     return version
 
-def downloadFromNcbi(skipDownload):
-    " download files from NCBI and return list of filenames "
-    today = date.today().isoformat()
-    downDir = join("downloads", today)
+def downloadFromNcbi(onlyNew):
+    " download files from NCBI and return list of filenames. If onlyNew is true, exit with error code 0, no output "
+    clinvarVersion = mostCurrentVersionName()
+    downDir = join("downloads", clinvarVersion)
+
+    doDownload = True
+
     try:
         os.mkdir(downDir)
     except FileExistsError:
-        pass
+        if onlyNew:
+            exit(0)
+        else:
+            logging.warning("Not downloading ClinVar again, version '%s' is already in %s" % (clinvarVersion, downDir))
+            doDownload = False
 
-    version = mostCurrentVersionName()
-    logging.info("Version on NCBI server is %s" % version)
-    versionFname = join(downDir, "version.txt")
-    open(versionFname, "w").write(version)
+    logging.info("Version on NCBI server is %s" % clinvarVersion)
 
     baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/"
     url = baseUrl+"variant_summary.txt.gz"
     outFname = join(downDir, "variant_summary.txt.gz")
     filename = outFname
 
     # get the new variant allele file
     varUrl = baseUrl+"variation_allele.txt.gz"
     varFname = join(downDir, "variation_allele.txt.gz")
 
     # get the new hgvs allele file
     hgvsUrl = baseUrl+"hgvs4variation.txt.gz"
     hgvsFname = join(downDir, "hgvs4variation.txt.gz")
 
     # get the summary file
     summUrl = baseUrl+"submission_summary.txt.gz"
     summFname = join(downDir, "submission_summary.txt.gz")
 
     # get the new VCF file
     vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz"
     vcfFname = join(downDir, "GRCh38.clinvar.vcf.gz")
 
-    if not (isfile(vcfFname)):
+    versionFname = join(downDir, "version.txt")
+
+    if doDownload:
+        logging.info("Creating %s" % versionFname)
+        open(versionFname, "w").write(clinvarVersion)
+
         logging.info("Downlading %s to %s" % (vcfUrl, vcfFname))
         open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read())
-
-    if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
         open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
         open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
         open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
         logging.info("Downlading %s to %s" % (summUrl, summFname))
         open(summFname, "wb").write(urllib.request.urlopen(summUrl).read())
 
     return filename, varFname, hgvsFname, vcfFname, summFname, versionFname
 
 def parseVcf(vcfFname):
     " parse VCF and return as dict alleleId -> (molConseq) "
     logging.info("Parsing %s" % vcfFname)
     ret = {}
@@ -685,39 +689,48 @@
             elif key=="MC":
                 mc = val.split("|")[1].split(",")[0].replace("_", " ")
                 if not mc in possMolConseqs:
                     print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. "
                             "Please update otto script %s" % (mc, __file__))
                     sys.exit(1)
 
         if mc and alleleId:
             ret[int(alleleId)] = mc
 
     return ret
 
 def shortenSeq(s):
     return s[:25]+"..."+s[-25:]
 
+def makeSubLolly(maxDiff, isAlpha):
+    "run the makeSubLolly script"
+    if maxDiff is None:
+        maxDiff = 5 # this is what was in doUpdate.sh, I admit it doesn't make sense
+    cmd = "sh ./clinvarSubLolly "+str(maxDiff)
+    if isAlpha:
+        cmd+= " --alpha"
+    assert(os.system(cmd)==0)
+
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
-    filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload)
+    filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.onlyNew)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
     vcfFname = args[3]
     versionFname = args[4]
 
 #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
 allToVcf = parseVcf(vcfFname)
 hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname)
 
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
@@ -1054,31 +1067,31 @@
 logging.info("%d lines with feature name that was too long, shortened them" % longCount)
 logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
 logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3]))
 
 fnames = [hg19Bed.name, hg38Bed.name, hg19DecorBed.name, hg38DecorBed.name, hg19BedCnv.name, hg38BedCnv.name]
 
 # sort the files
 for fname in fnames:
     cmd = "bedSort %s %s" % (fname, fname)
     logging.debug(cmd)
     assert(os.system(cmd)==0)
 
 # check new bed files against last months's bed files
 if options.maxDiff and not options.isAlpha:
     for fname in fnames:
-        compareWithLastMonth(fname, options.maxDiff)
+        compareWithPrevRun(fname, options.maxDiff)
 
 bigBedDir = "bigBed"
 if options.isAlpha:
     bigBedDir = "bigBedAlpha"
 
 # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
 tmpFnames = []
 for fname in fnames:
     parts = fname.split(".")
     if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
         base, db, date, ext = parts
     else:
         base, db, ext = parts
 
     base = basename(base)
@@ -1101,40 +1114,43 @@
 
 # make it atomic by renaming the temp files to the final file names at the very end
 for db, tableName, tmpFname, finalFname in tmpFnames:
     logging.debug("Renaming %s to %s" % (tmpFname, finalFname))
     os.rename(tmpFname, finalFname)
 
 # create the version.txt file
 clinvarVersion = open(versionFname).read()
 versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d"))
 outFname = join(bigBedDir, "version.txt")
 with open(outFname, "w") as ofh:
     ofh.write(versionString)
 
 # put a copy of the files into the archive
 if not options.isAlpha:
-    for db, tableName, tmpFname, finalFname in tmpFnames:
-        # not doing this anymore, using bigDataUrl now in trackDb
-        #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname)
-        #mustRun(cmd)
-
+    for db, trackName, tmpFname, finalFname in tmpFnames:
         yearMonDay = date.today().strftime("%Y-%m-%d")
-        gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName)
+        gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, trackName)
         # put a copy into the archive
         outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
         if not isdir(outDir):
             os.mkdir(outDir)
         archiveFname = join(outDir, basename(gbdbFname))
         logging.info("Creating %s" % archiveFname)
         cmd = "cp %s %s" % (finalFname, archiveFname)
         mustRun(cmd)
 
+        # copy the BED files to prevRun, so we can do a diff in the next run
+        cmd = "cp bed/%s.%s.bed prevRun/" % (basename(trackName), db)
+        mustRun(cmd)
+
     # add the version.txt files
     for db in ["hg38", "hg19"]:
         archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay)
         cmd = "cp %s/version.txt %s" % (bigBedDir, archDir)
         mustRun(cmd)
 
+
 loadSubmissions(summFname)
+makeSubLolly(options.maxDiff, options.isAlpha)
 
 print("ClinVar update end: OK")
+