aaec16b62487ea4d0a0c19193428917e095c756b jnavarr5 Tue Nov 25 14:10:47 2025 -0800 Updating the clinVarToBed script to match the one the one that is running on dev, no Redmine diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index a014dc29d7a..8463933a900 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -1,19 +1,19 @@ #!/usr/bin/env python3 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess -import tempfile, json +import tempfile, json, glob from collections import defaultdict from os.path import join, basename, dirname, isfile, abspath, isdir from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "MONDO" : "https://monarchinitiative.org/disease/%s", "ClinGen" : "https://reg.clinicalgenome.org/redmine/projects/registry/genboree_registry/by_caid?caid=%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } @@ -27,31 +27,31 @@ # make sure that if the second list is edited, it is never out of sync with the first list assert(len(set(truncConseqs - possMolConseqs))==0) # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed. Output goes into the current dir Typical input files are at ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/ """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("", "--alpha", dest="isAlpha", action="store_true", help="Default target is /gbdb/{hg19,hg38}/bbi/clinvar. With this option, we use /gbdb/{hg19,hg38}/bbi/clinvarAlpha, the hgwdev only version of the track, see human/clinvarAlpha.ra in trackDb") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") -parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") +parser.add_option("", "--onlyNew", dest="onlyNew", action="store_true", help="Stop with no output to stdout if there is no new version on the Clinvar website") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) if not options.isAlpha and os.getlogin()!="otto": print("Please do not run this script as a normal user, you will run into all sorts of permission problems.") print("to develop and test, ssh to otto, then run it with doUpdate.sh --alpha to generate the dev-only tracks.") print("When you're done, run doUpdate.sh without the --alpha and commit the change to the kent tree.") sys.exit(1) #clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" @@ -141,31 +141,31 @@ "BN" : "191,191,191", "LB" : "191,191,191", "CF" : "128,128,128", "VUS" : "128,128,128", "RF" : "128,128,128", "LP" : "64,64,64", "PG" : "64,64,64" }, } def mustRun(cmd): logging.debug(cmd) ret = os.system(cmd) if ret!=0: print(("Could not run: %s" % cmd)) - sys.exit(1) + raise def shortenName(name): "make the clinvar name shorter and keep a long version for the mouse over " cnvMatch = cnvRe.match(name) #hgvsMatch = hgvsRe.match(name) if ":" in name and not "chr" in name: longName = name.split(":")[0] else: longName = name shortName = name if cnvMatch != None: # some names include the original assembly as a prefix @@ -372,38 +372,35 @@ def lastMonth(d,x=1): """ returns same day as this month, but last month, e.g. for 2013-01-01 return 2012-12-01 """ days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] newmonth = ((( d.month - 1) - x ) % 12 ) + 1 newyear = d.year + ((( d.month - 1) - x ) // 12 ) if d.day > days_of_month[newmonth-1]: newday = days_of_month[newmonth-1] else: newday = d.day return datetime( newyear, newmonth, newday) -def compareWithLastMonth(fname, maxDiff): - " go back one month from current date, make sure that linecount diff is at most maxDiff " - todayStr = date.today().isoformat() - previousDate = lastMonth(date.today()) - previousDateStr = previousDate.date().isoformat() - previousFname = fname.replace(todayStr, previousDateStr) +def compareWithPrevRun(fname, maxDiff): + " make sure that linecount diff is at most maxDiff " + previousFname = "prevRun/"+basename(fname) if not isfile(previousFname): - print(("Cannot find %s, cannot compare new data with previous month" % previousFname)) + print(("Cannot find %s, cannot compare new data with previous run" % previousFname)) return previousCount = sum(1 for line in open(previousFname, encoding="utf8")) todayCount = sum(1 for line in open(fname, encoding="utf8")) percDiff = (todayCount-previousCount)/float(todayCount) if percDiff < 0 and maxDiff: raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \ (previousFname, previousCount, fname, todayCount)) if percDiff > maxDiff: raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount)) # see RM 14350 do more than just linecount # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux cmd = "diff %s %s --speed-large-files | grep '^[><]' | wc -l" % (fname, previousFname) diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)) @@ -601,70 +598,77 @@ cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname) print(cmd) assert(os.system(cmd)==0) cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname) assert(os.system(cmd)==0) def mostCurrentVersionName(): " return name of version on FTP site " cmd = 'curl -s https://ftp.ncbi.nlm.nih.gov/pub/clinvar/xml/ClinVarVCVRelease_00-latest.xml.gz | zcat | head -n2 | tail -n1' verLine = subprocess.check_output(cmd, shell=True, encoding="utf8") # version = verLine.split(" ")[3].split("=")[1].split('"')[1] return version -def downloadFromNcbi(skipDownload): - " download files from NCBI and return list of filenames " - today = date.today().isoformat() - downDir = join("downloads", today) +def downloadFromNcbi(onlyNew): + " download files from NCBI and return list of filenames. If onlyNew is true, exit with error code 0, no output " + clinvarVersion = mostCurrentVersionName() + downDir = join("downloads", clinvarVersion) + + doDownload = True + try: os.mkdir(downDir) except FileExistsError: - pass + if onlyNew: + exit(0) + else: + logging.warning("Not downloading ClinVar again, version '%s' is already in %s" % (clinvarVersion, downDir)) + doDownload = False - version = mostCurrentVersionName() - logging.info("Version on NCBI server is %s" % version) - versionFname = join(downDir, "version.txt") - open(versionFname, "w").write(version) + logging.info("Version on NCBI server is %s" % clinvarVersion) baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/" url = baseUrl+"variant_summary.txt.gz" outFname = join(downDir, "variant_summary.txt.gz") filename = outFname # get the new variant allele file varUrl = baseUrl+"variation_allele.txt.gz" varFname = join(downDir, "variation_allele.txt.gz") # get the new hgvs allele file hgvsUrl = baseUrl+"hgvs4variation.txt.gz" hgvsFname = join(downDir, "hgvs4variation.txt.gz") # get the summary file summUrl = baseUrl+"submission_summary.txt.gz" summFname = join(downDir, "submission_summary.txt.gz") # get the new VCF file vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz" vcfFname = join(downDir, "GRCh38.clinvar.vcf.gz") - if not (isfile(vcfFname)): + versionFname = join(downDir, "version.txt") + + if doDownload: + logging.info("Creating %s" % versionFname) + open(versionFname, "w").write(clinvarVersion) + logging.info("Downlading %s to %s" % (vcfUrl, vcfFname)) open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read()) - - if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)): logging.info("Downlading %s to %s" % (url, outFname)) open(outFname, "wb").write(urllib.request.urlopen(url).read()) logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) logging.info("Downlading %s to %s" % (varUrl, varFname)) open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) logging.info("Downlading %s to %s" % (summUrl, summFname)) open(summFname, "wb").write(urllib.request.urlopen(summUrl).read()) return filename, varFname, hgvsFname, vcfFname, summFname, versionFname def parseVcf(vcfFname): " parse VCF and return as dict alleleId -> (molConseq) " logging.info("Parsing %s" % vcfFname) ret = {} @@ -685,39 +689,48 @@ elif key=="MC": mc = val.split("|")[1].split(",")[0].replace("_", " ") if not mc in possMolConseqs: print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. " "Please update otto script %s" % (mc, __file__)) sys.exit(1) if mc and alleleId: ret[int(alleleId)] = mc return ret def shortenSeq(s): return s[:25]+"..."+s[-25:] +def makeSubLolly(maxDiff, isAlpha): + "run the makeSubLolly script" + if maxDiff is None: + maxDiff = 5 # this is what was in doUpdate.sh, I admit it doesn't make sense + cmd = "sh ./clinvarSubLolly "+str(maxDiff) + if isAlpha: + cmd+= " --alpha" + assert(os.system(cmd)==0) + # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: - filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.skipDownload) + filename, varFName, hgvsFname, vcfFname, summFname, versionFname = downloadFromNcbi(options.onlyNew) else: filename = args[0] varFname = args[1] hgvsFname = args[2] vcfFname = args[3] versionFname = args[4] #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change allToVcf = parseVcf(vcfFname) hg19Hgvs, hg38Hgvs = parseHgvsFile(hgvsFname) # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant @@ -1054,31 +1067,31 @@ logging.info("%d lines with feature name that was too long, shortened them" % longCount) logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount) logging.info("%d lines with -1/-1 start/end positions skipped. Examples: %s" % (len(minusAccs), minusAccs[:3])) fnames = [hg19Bed.name, hg38Bed.name, hg19DecorBed.name, hg38DecorBed.name, hg19BedCnv.name, hg38BedCnv.name] # sort the files for fname in fnames: cmd = "bedSort %s %s" % (fname, fname) logging.debug(cmd) assert(os.system(cmd)==0) # check new bed files against last months's bed files if options.maxDiff and not options.isAlpha: for fname in fnames: - compareWithLastMonth(fname, options.maxDiff) + compareWithPrevRun(fname, options.maxDiff) bigBedDir = "bigBed" if options.isAlpha: bigBedDir = "bigBedAlpha" # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb tmpFnames = [] for fname in fnames: parts = fname.split(".") if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed base, db, date, ext = parts else: base, db, ext = parts base = basename(base) @@ -1101,40 +1114,43 @@ # make it atomic by renaming the temp files to the final file names at the very end for db, tableName, tmpFname, finalFname in tmpFnames: logging.debug("Renaming %s to %s" % (tmpFname, finalFname)) os.rename(tmpFname, finalFname) # create the version.txt file clinvarVersion = open(versionFname).read() versionString = "ClinVar Release: %s, converted at UCSC on %s" % (clinvarVersion, date.today().strftime("%Y-%m-%d")) outFname = join(bigBedDir, "version.txt") with open(outFname, "w") as ofh: ofh.write(versionString) # put a copy of the files into the archive if not options.isAlpha: - for db, tableName, tmpFname, finalFname in tmpFnames: - # not doing this anymore, using bigDataUrl now in trackDb - #cmd = "hgBbiDbLink %s %s %s" % (db, tableName, gbdbFname) - #mustRun(cmd) - + for db, trackName, tmpFname, finalFname in tmpFnames: yearMonDay = date.today().strftime("%Y-%m-%d") - gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName) + gbdbFname = "/gbdb/%s/bbi/clinvar/%s.bb" % (db, trackName) # put a copy into the archive outDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay) if not isdir(outDir): os.mkdir(outDir) archiveFname = join(outDir, basename(gbdbFname)) logging.info("Creating %s" % archiveFname) cmd = "cp %s %s" % (finalFname, archiveFname) mustRun(cmd) + # copy the BED files to prevRun, so we can do a diff in the next run + cmd = "cp bed/%s.%s.bed prevRun/" % (basename(trackName), db) + mustRun(cmd) + # add the version.txt files for db in ["hg38", "hg19"]: archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/clinvar/%s" % (db, yearMonDay) cmd = "cp %s/version.txt %s" % (bigBedDir, archDir) mustRun(cmd) + loadSubmissions(summFname) +makeSubLolly(options.maxDiff, options.isAlpha) print("ClinVar update end: OK") +