e95d24a1c7ced8663574b8f90aa255b897c3c423 max Fri Jan 31 05:05:55 2020 -0800 adding a few of our brand new bigBed filters to the clinvar track, refs #24850
diff --git src/hg/utils/otto/clinVarToBed src/hg/utils/otto/clinVarToBed
new file mode 100755
index 0000000..ee04313
--- /dev/null
+++ src/hg/utils/otto/clinVarToBed
@@ -0,0 +1,610 @@
+#!/usr/bin/env python3
+
+import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess
+from collections import defaultdict
+from os.path import join, basename, dirname, isfile, abspath
+from datetime import date, datetime, timedelta
+
+dbToUrl = {
+    "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/",
+    "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s",
+    "OMIM Allelic Variant" : "http://www.omim.org/entry/%s",
+    "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s",
+    "OMIM" : "http://www.omim.org/entry/%s",
+    "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s"
+}
+
+# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
+parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed
+
+Output goes into the current dir
+
+input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz
+""")
+
+parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed")
+parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing")
+parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.")
+(options, args) = parser.parse_args()
+
+if options.debug:
+    logging.basicConfig(level=logging.DEBUG)
+else:
+    logging.basicConfig(level=logging.INFO)
+
+clinvarExpectHeaders = "#AlleleID\tType\tName\tGeneID\tGeneSymbol\tHGNC_ID\tClinicalSignificance\tClinSigSimple\tLastEvaluated\tRS# (dbSNP)\tnsv/esv (dbVar)\tRCVaccession\tPhenotypeIDS\tPhenotypeList\tOrigin\tOriginSimple\tAssembly\tChromosomeAccession\tChromosome\tStart\tStop\tReferenceAllele\tAlternateAllele\tCytogenetic\tReviewStatus\tNumberSubmitters\tGuidelines\tTestedInGTR\tOtherIDs\tSubmitterCategories\tVariationID\n"
+# ==== FUNCTIONs =====
+
+def mustRun(cmd):
+    logging.debug(cmd)
+    ret = os.system(cmd)
+    if ret!=0:
+        print("Could not run: %s" % cmd)
+        sys.exit(1)
+
+def shortenName(name):
+    "make the clinvar name shorter and keep a long version for the mouse over "
+    cnvMatch = cnvRe.match(name)
+    #hgvsMatch = hgvsRe.match(name)
+
+    if ":" in name and not "chr" in name:
+        longName = name.split(":")[0]
+    else:
+        longName = name
+
+    shortName = name
+
+    if cnvMatch != None:
+        # some names include the original assembly as a prefix
+        band, copy = cnvMatch.groups()
+        if copy == None:
+            copy = ""
+        shortName = band+copy
+    elif name.startswith("GRCh"):
+        # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+        print(name)
+        assert False # name that starts with GRCh but does not match our regex should not happen
+    elif "." in name:
+        # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)
+        # so we remove the gene and the protein description
+        if ":" in name:
+            name = name.split(":")[1]
+        longName = name.split(" ")[0]
+        name = name.split(" ")[0].split("[")[0].split("(")[0]
+        shortName = name
+
+        if name.endswith("del"):
+            shortName = "del"
+        elif name.endswith("ins"):
+            shortName = "ins"
+        elif name.endswith("dup"):
+            shortName = "dup"
+        else:
+            posMatch = posRe.match(name)
+            if posMatch!=None:
+                pos, dupDelIns, change = posMatch.groups()
+                if dupDelIns==None:
+                    dupDelIns= ""
+
+                if change==None or change=="":
+                    shortName = pos+dupDelIns
+                else:
+                    shortName = dupDelIns+change
+
+    return shortName, longName
+
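+# For illustration only (hand-traced through shortenName above, not taken from a
+# live ClinVar dump): "NM_000274.3(OAT):c.550_552delGCT (p.Ala184del)" comes out
+# as shortName "delGCT" / longName "c.550_552delGCT", and the CNV-style name
+# "GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)" is shortened to the
+# cytogenetic band "21q22.13-22.2".
+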
+def clinSignToPathoCode(pathoStr):
+    """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes:
+    - benign = BN
+    likely benign = LB
+    conflicting = CF
+    likely pathogenic= LP
+    pathogenic = PG
+    other = OT
+    uncertain = UC
+
+    examples:
+
+    Benign/Likely benign,other 13 0.002%
+    Conflicting interpretations of pathogenicity,risk factor 13 0.002%
+    Likely benign,other 18 0.003%
+    Pathogenic,drug response 28 0.005%
+    protective 32 0.005%
+    Pathogenic,risk factor 33 0.005%
+    Pathogenic,other 102 0.017%
+    Affects 120 0.019%
+    association 202 0.033%
+    drug response 387 0.063%
+    risk factor 444 0.072%
+    no interpretation for the single variant 564 0.092%
+    other 1687 0.274%
+    Pathogenic/Likely pathogenic 5209 0.846%
+    not provided 8860 1.440%
+    Benign/Likely benign 19489 3.167%
+    Likely pathogenic 30200 4.907%
+    Conflicting interpretations of pathogenicity 30910 5.023%
+    Pathogenic 66679 10.835%
+    Benign 77551 12.602%
+    Likely benign 154870 25.166%
+    Uncertain significance 217838 35.399%
+
+    """
+    pathoStr = pathoStr.lower()
+    # don't change the order of these unless you tested it. The order is crucial.
+    if "conflicting" in pathoStr:
+        return "CF"
+    if "likely pathogenic" in pathoStr:
+        return "LP"
+    if "likely benign" in pathoStr:
+        return "LB"
+    if "uncertain" in pathoStr:
+        return "UC"
+    if "pathogenic" in pathoStr:
+        return "PG"
+    if "benign" in pathoStr:
+        return "BN"
+    return "OT"
+
+def lastMonth(d,x=1):
+    """ returns same day as this month, but last month, e.g. for 2013-01-01
+    return 2012-12-01
+    """
+
+    days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
+    newmonth = ((( d.month - 1) - x ) % 12 ) + 1
+    newyear = d.year + ((( d.month - 1) - x ) // 12 )
+    if d.day > days_of_month[newmonth-1]:
+        newday = days_of_month[newmonth-1]
+    else:
+        newday = d.day
+    return datetime( newyear, newmonth, newday)
+
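+# Rough sanity check for lastMonth() (values hand-computed, illustrative only):
+#   lastMonth(datetime(2013, 1, 1))  -> datetime(2012, 12, 1)
+#   lastMonth(datetime(2020, 3, 31)) -> datetime(2020, 2, 28), clamped to the
+#   length of the target month (the fixed table above ignores leap years)
+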
+def compareWithLastMonth(fname, maxDiff):
+    " go back one month from current date, make sure that linecount diff is at most maxDiff "
+    todayStr = date.today().isoformat()
+    previousDate = lastMonth(date.today())
+    previousDateStr = previousDate.date().isoformat()
+    previousFname = fname.replace(todayStr, previousDateStr)
+    if not isfile(previousFname):
+        print("Cannot find %s, cannot compare new data with previous month" % previousFname)
+        return
+
+    previousCount = sum(1 for line in open(previousFname))
+    todayCount = sum(1 for line in open(fname))
+    percDiff = (todayCount-previousCount)/float(todayCount)
+
+    if percDiff < 0 and maxDiff:
+        raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \
+            (previousFname, previousCount, fname, todayCount))
+    if percDiff > maxDiff:
+        raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed call, use only the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount))
+    # see RM 14350 do more than just linecount
+    # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux
+    cmd = "diff %s %s --speed-large-files | grep '^[\>\<]' | wc -l" % (fname, previousFname)
+    diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT))
+    logging.info("%s: %d lines changed" % (fname, diffCount))
+    percDiff = float(diffCount)/todayCount
+    if percDiff > maxDiff:
+        logging.info(cmd)
+        diffFname = abspath(join("archive", "lastDiff.txt"))
+        cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname)
+        os.system(cmd) # ret code is not 0, as there are differences, so no need to check
+        raise Exception("too many differences: %d out of %d lines. Look at the differences in the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname))
+
+# I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse
+# the other files again
+#def parseVariationAlleleFile(varFname):
+#    """ return dict allele Id -> variant ID """
+#    logging.info("Parsing %s" % varFname)
+#    allToVar = {}
+#    foundHead = False
+#    expHead = "#VariationID\tType\tAlleleID\tInterpreted"
+#    for line in gzip.open(varFname):
+#        line = line.rstrip("\n")
+#        if line==expHead:
+#            foundHead = True
+#        if line.startswith("#Var") and "Type" in line:
+#            if not line == expHead:
+#                print(("Header of variation_allele.txt.gz has changed again. Code needs fixing"))
+#                print("found:",line)
+#                print("expected:",expHead)
+#            else:
+#                foundHead = True
+#
+#        if line.startswith("#"):
+#            continue
+#        if not foundHead:
+#            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
+#        fields = line.split("\t")
+#        allToVar[fields[2]]=fields[0]
+#    return allToVar
+
+def parseHgvsFile(hgvsFname):
+    " return dict: allele Id -> (hgvs coding, hgvs protein) "
+    logging.info("Parsing %s" % hgvsFname)
+    expHead = "#Symbol\tGeneID\tVariationID\tAlleleID\tType\tAssembly\tNucleotideExpression\tNucleotideChange\tProteinExpression\tProteinChange\tUsedForNaming\tSubmitted\tOnRefSeqGene"
+    allToHgvs = {}
+    foundHead = False
+    for line in gzip.open(hgvsFname, "rt"):
+        line = line.rstrip("\n")
+        if line==expHead:
+            foundHead = True
+        if line.startswith("#Symbol") and "GeneID" in line:
+            if not line == expHead:
+                print("Header of hgvs file has changed again. Code needs fixing")
+                print("found:", line)
+                print("expected:", expHead)
+            else:
+                foundHead = True
+
+        if line.startswith("#"):
+            continue
+        if not foundHead:
+            raise Exception("Invalid Header. Code needs fixing, again.")
+
+        fields = line.split("\t")
+        varType = fields[4]
+        if "coding" in varType and not "LRG" in fields[6]:
+            allToHgvs[fields[3]]=(fields[6], fields[8])
+    return allToHgvs
+
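+# The returned dict maps AlleleID -> (NucleotideExpression, ProteinExpression);
+# an entry looks roughly like this (accessions made up for illustration):
+#   allToHgvs["12345"] == ("NM_012345.3:c.100A>T", "NP_012345.1:p.Lys34Ter")
+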
+def accListToHtml(inStr):
+    """
+    given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions
+    to HTML links to the databases
+    """
+    newParts = []
+    for part in inStr.split(","):
+        fields = part.split(":")
+        if len(fields)!=2:
+            newParts.append(part)
+            continue
+
+        db, acc = fields
+        accForUrl = acc
+
+        if db in dbToUrl:
+            if "OMIM" in db:
+                accForUrl = accForUrl.replace(".", "#")
+            url = (dbToUrl[db] % accForUrl)
+            newPart = '<a href="%s" target="_blank">%s:%s</a>' % (url, db, acc)
+        else:
+            newPart = part
+        newParts.append(newPart)
+    return ", ".join(newParts)
+
+def reviewStatusToHtmlStars(reviewStatus):
+    """
+    In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this:
+    1-criteria provided, conflicting interpretations
+    2-criteria provided, multiple submitters, no conflicts
+    1-criteria provided, single submitter
+    0-no assertion criteria provided
+    0-no assertion for the individual variant
+    0-no assertion provided
+    4-practice guideline
+    3-reviewed by expert panel
+
+    given the review status, return an html string with the correct number of black stars
+    """
+    revStatus = reviewStatus.strip().lower().replace(" ", "")
+    starCount = None
+
+    if revStatus == "nointerpretationforthesinglevariant":
+        starCount = 0
+    if revStatus=="criteriaprovided,conflictinginterpretations":
+        starCount = 1
+    if revStatus=="criteriaprovided,multiplesubmitters,noconflicts":
+        starCount = 2
+    if revStatus=="criteriaprovided,singlesubmitter":
+        starCount = 1
+    if revStatus == "practiceguideline":
+        starCount = 4
+    if revStatus == "reviewedbyexpertpanel":
+        starCount = 3
+    if "noassertion" in revStatus or revStatus=="-":
+        starCount = 0
+
+    if starCount == None:
+        print("Illegal review status: %s" % revStatus)
+        assert(False)
+
+    emptyStarCount = 4-starCount
+
+    htmlList = ["★"]*starCount
+    htmlList.extend( ["☆"]*emptyStarCount )
+
+    htmlStr = "".join(htmlList)
+    htmlStr += "  based on: %s" % reviewStatus
+
+    asciiDesc = "%d stars" % starCount
+    return htmlStr, asciiDesc, starCount
+
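+# Example of the star conversion (illustrative call, output abbreviated):
+#   reviewStatusToHtmlStars("criteria provided, multiple submitters, no conflicts")
+#   returns ("★★☆☆  based on: criteria provided, ...", "2 stars", 2)
+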
+def downloadFromNcbi(skipDownload):
+    " download files from NCBI and return triple of filenames "
+    today = date.today().isoformat()
+    outSuffix = "%s." % today
+
+    url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
+    outFname = "archive/variant_summary.%s.txt.gz" % today
+    filename = outFname
+
+    # get the new variant allele file
+    varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz"
+    varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
+
+    # get the new hgvs allele file
+    hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz"
+    hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
+
+    if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
+        logging.info("Downloading %s to %s" % (url, outFname))
+        open(outFname, "wb").write(urllib.request.urlopen(url).read())
+        logging.info("Downloading %s to %s" % (hgvsUrl, hgvsFname))
+        open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
+        logging.info("Downloading %s to %s" % (varUrl, varFname))
+        open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
+    return filename, varFname, hgvsFname
+
+# ----------- MAIN --------------
+if args==[] and not options.auto:
+    parser.print_help()
+    exit(1)
+
+outSuffix = ""
+
+if options.auto:
+    filename, varFname, hgvsFname = downloadFromNcbi(options.skipDownload)
+else:
+    filename = args[0]
+    varFname = args[1]
+    hgvsFname = args[2]
+
+#allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change
+allToHgvs = parseHgvsFile(hgvsFname)
+
+# e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
+# GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
+# GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
+# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
+cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
+
+hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
+
+# e.g.
+# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
+# c.80A>T
+# c.740+356C>T
+# c.3839_3855del17
+posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
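+# e.g. posRe splits "c.3839_3855del17" into the groups ("c.3839_3855", "del",
+# "17"), which shortenName() above reassembles into the short label "del17"
+# (hand-traced, illustrative only)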
+
+ifh = gzip.open(filename, "rt")
+
+# check the header
+line1 = ifh.readline()
+if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
+    logging.error("ClinVar has changed their header line again")
+    logging.error("current header line: %s" % line1.replace("\t","|"))
+    logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
+    raise Exception("code needs fixing")
+
+# open out files
+hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w")
+hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w")
+hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w")
+hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w")
+
+longCount = 0
+noAssCount = 0
+
+# convert lines
+for line in ifh:
+    line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions
+    line = line.rstrip("\n")
+    row = line.split("\t")
+    row = [f.replace("\0", "") for f in row ]
+
+    alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \
+        irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \
+        refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \
+        submCategories, varId = row
+
+    # changed in July 2018 when the "varId" was merged into this table. It used to be
+    # in allToVar
+    if varId=="":
+        logging.warning("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
+
+    hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", ""))
+
+    if chrom=="" or assembly=="" or assembly=="NCBI36":
+        noAssCount += 1
+        continue
+
+    if chrom=="Un" and assembly=="GRCh38":
+        print("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)
+        continue
+    chrom = "chr"+chrom
+
+    if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
+        if assembly=="GRCh37":
+            chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added to hg19 in 2020
+        else:
+            chrom = "chrM"
+
+    shortName, longName = shortenName(name)
+
+    if len(shortName)>20:
+        shortName = shortName[:17]+"..."
+        longCount+=1
+
+    if len(longName)>60:
+        longName = longName[:60]+"..."
+
+    if start=="" or end=="":
+        print("undefined start or end coordinate. record %s" % irvcAcc)
+        continue
+
+    if int(start)<0 or int(end)<0:
+        print("illegal start or end coordinate. record %s" % irvcAcc)
+        continue
+
+    # sometimes they seem to inverse their coordinates
+    if int(start)>int(end):
+        start, end = end, start
+
+    if chrom=="chr17" and (end=="217748468" or end=="84062405"):
+        print("blacklisted feature")
+        continue
+
+    # reorder columns for bigBed
+    start = str(int(start)-1) # convert to zero-based, half-open
+    score = "0"
+    strand = "."
+    thickStart = start
+    thickEnd = end
+    itemRgb = "0"
+    blockCount = "1"
+    blockSizes = str(int(end)-int(start))
+    blockStarts = "0"
+
+    # used until Oct 2017
+    # if dbVarAcc.startswith("nsv") or "copy number" in allType:
+    # now using just size on genome, see #20268
+    if (int(end)-int(start))>100:
+        isCnv = True
+    else:
+        isCnv = False
+
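+    # e.g. a 3 bp deletion (end-start = 3) stays in the main track, while a
+    # 25 kb copy-number loss ends up in the separate CNV track and is colored
+    # by gain/loss below (sizes are illustrative only)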
+    if isCnv:
+        if "gain" in allType or "duplication" in allType:
+            itemRgb = "0,0,255"
+        if "deletion" in allType or "loss" in allType:
+            itemRgb = "255,0,0"
+    else:
+        isPat, isBen = False, False
+        if "pathogenic" in clinSign.lower():
+            isPat = True
+        if "benign" in clinSign.lower():
+            isBen = True
+
+        if (isPat==True and isBen==True) or (isPat==False and isBen==False):
+            itemRgb = "128,128,128"
+        elif isPat==True:
+            itemRgb = "210,0,0"
+        elif isBen==True:
+            itemRgb = "0,210,0"
+        else:
+            assert(False) # should never happen
+
+    pathoCode = clinSignToPathoCode(clinSign)
+    varLen = int(end)-int(start)
+
+    geneStr = geneSymbol
+    if geneId.isdigit() and geneId!="-1":
+        geneStr = geneId+"|"+geneSymbol
+    if len(geneStr)>250:
+        geneStr = "not shown, too many genes"
+
+    if name=="":
+        name = "No display name was provided by ClinVar for this variant"
+        shortName = "NoName"
+
+    if len(name)>90:
+        name = name[:90]+"..."
+    name = varId+"|"+name
+
+    if len(hgvsProt) > 90:
+        hgvsProt = hgvsProt[:90]+"..."
+    if len(hgvsCod) > 90:
+        hgvsCod = hgvsCod[:90]+"..."
+
+    otherIds = accListToHtml(otherIds)
+
+    phenotypeIds = accListToHtml(phenotypeIds)
+
+    starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
+
+    mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"]
+    mouseOver = ", ".join(mouseOverParts)
+
+    row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
+        blockCount, blockSizes, blockStarts,
+        name, clinSign, starRatingHtml, allType, geneStr,
+        snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
+        hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)]
+
+    # replace clinvar's placeholders with real empty fields
+    newRow = []
+    for x in row:
+        if x in ["-1", "-"]:
+            newRow.append("")
+        else:
+            newRow.append(x)
+    row = newRow
+
+    if assembly=="GRCh37":
+        ofh = hg19Bed
+        if isCnv:
+            ofh = hg19BedCnv
+    elif assembly=="GRCh38":
+        ofh = hg38Bed
+        if isCnv:
+            ofh = hg38BedCnv
+    else:
+        noAssCount += 1
+        continue # no output file for this assembly, skip the record
+
+    ofh.write("\t".join(row))
+    ofh.write("\n")
+
+hg19Bed.close()
+hg38Bed.close()
+hg19BedCnv.close()
+hg38BedCnv.close()
+
+logging.info("%d lines with feature name that was too long, shortened them" % longCount)
+logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount)
+
+fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name]
+
+# sort the files
+for fname in fnames:
+    cmd = "bedSort %s %s" % (fname, fname)
+    logging.debug(cmd)
+    assert(os.system(cmd)==0)
+
+# check new bed files against last month's bed files
+if options.maxDiff:
+    for fname in fnames:
+        compareWithLastMonth(fname, options.maxDiff)
+
+# convert to bigBed without the date in the fname, like clinvarMain.hg19.bb
+tmpFnames = []
+for fname in fnames:
+    parts = fname.split(".")
+    if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed
+        base, db, date, ext = parts
+    else:
+        base, db, ext = parts
+
+    outFname = "%s.%s.bb" % (base, db)
+    tmpFname = "%s.%s.temp.bb" % (base, db)
+
+    outFname = basename(outFname) # strip the archive/ part, put into current dir
+    cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname)
+    mustRun(cmd)
+
+    tableName = outFname.split('.')[0] # e.g. clinvarMain
+
+    tmpFnames.append( (db, tableName, tmpFname, outFname) )
+
+# make it atomic by renaming the temp files to the final file names at the very end
+for db, tableName, tmpFname, finalFname in tmpFnames:
+    os.rename(tmpFname, finalFname)
+
+# make sure that the "Data last updated" in the table browser is correct
+for db, tableName, tmpFname, finalFname in tmpFnames:
+    cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName)
+    mustRun(cmd)
+
+print("ClinVar update end: OK")
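+
+# Typical invocations (illustrative only; the otto cronjob wraps this script in
+# doUpdate.sh, see the error messages above):
+#   clinVarToBed --auto --maxDiff 0.1
+#   clinVarToBed variant_summary.txt.gz variation_allele.txt.gz hgvs4variation.txt.gz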