8fa777e7a626753ecf40a7471999dba3a984e272 max Tue Apr 21 09:21:39 2020 -0700 more clinvar improvements, refs #24850 diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index eb1580c..9586f99 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -2,30 +2,34 @@ import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess from collections import defaultdict from os.path import join, basename, dirname, isfile, abspath from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } +# since we're filtering on it, we make sure that we have all molecular consequences in the tdb file +# if they add a new one, this script must fail and tdb must be updated. +possMolConseqs = set(["genic downstream transcript variant","no sequence alteration","inframe indel","stop lost","genic upstream transcript variant","initiatior codon variant","inframe insertion","inframe deletion","","splice acceptor variant","splice donor variant","5 prime UTR variant","nonsense","non-coding transcript variant","3 prime UTR variant","frameshift variant","intron variant","synonymous variant","missense variant", ""]) + # === COMMAND LINE INTERFACE, OPTIONS AND HELP === parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed Output goes into the current dir input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: @@ -82,31 +86,110 @@ shortName = "dup" else: posMatch = posRe.match(name) if posMatch!=None: pos, dupDelIns, change = posMatch.groups() if dupDelIns==None: dupDelIns= "" if change==None or change=="": shortName = pos+dupDelIns else: shortName = dupDelIns+change return shortName, longName -def clinSignToPathoCode(pathoStr): +def allTypeToCode(allType): + """ + Map the allele type field from ClinVal to one of these classes for filtering: + SNV = single nucleotide variant + INS = insertion + DEL = deletion + INDEL = indel + DUPL = duplication + OTH = other + + In 3/2020, these were all possible values: + + fusion 6 0.000% + complex 61 0.005% + protein only 103 0.008% + NT expansion 142 0.011% + Translocation 302 0.022% + inversion 583 0.043% + undetermined variant 866 0.064% + insertion 7133 0.530% + indel 8490 0.630% + short repeat 18778 1.394% + duplication 34148 2.535% + copy number loss 40441 3.002% + copy number gain 40671 3.019% + deletion 77325 5.740% + single nucleotide variant 1118033 82.997% + + """ + if allType=="single nucleotide variant": + return "SNV" + elif allType in ["deletion", "copy number loss"]: + return "DEL" + elif allType in ["duplication", "copy number gain", "tandem duplication"]: + return "DUPL" + elif allType=="indel": + return "INDEL" + elif allType in ["insertion", "NT expansion", "short repeat"]: + return "INS" + elif allType in ["Translocation", "fusion", "complex", "protein only", "inversion", "undetermined variant"]: + return "OTH" + else: + print("found unknown allele type: "+allType) + assert(False) # found new allele type in ClinVar table : code needs fixing + +def originSimpleToCode(originSimple): + """ + for filtering, group the simplified origins into simpler origin codes: + return one of these values: + "GERM" = germline + "SOM" = somatic + "GERMSOM" = germline/somatic + "NOVO" = de novo + "UNK" = unknown + + as of 3/2020, these are the values in originSimple and how often they appear: + + tested-inconclusive 91 0.007% + not applicable 193 0.014% + germline/somatic 1808 0.134% + somatic 7929 0.589% + not provided 53173 3.947% + unknown 72601 5.389% + germline 1211288 89.919% + """ + if originSimple=="germline": + return "GERM" + elif originSimple=="germline/somatic": + return "GERMSOM" + elif originSimple in ["not applicable", "not provided", "unknown", "tested-inconclusive"]: + return "UNK" + elif originSimple in ["not applicable", "not provided", "unknown"]: + return "UNK" + elif originSimple in ["somatic"]: + return "SOM" + else: + print("found new origin string: "+originSimple) + assert(False) # Clinvar has added a new origin string + +def clinSignToCode(pathoStr): """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes: - benign = BN likely benign = LB conflicting = CF likely pathogenic= LP pathogenic = PG other = OT uncertain = UC examples: Benign/Likely benign,other 13 0.002% Conflicting interpretations of pathogenicity,risk factor 13 0.002% Likely benign,other 18 0.003% Pathogenic,drug response 28 0.005% @@ -158,32 +241,32 @@ newday = days_of_month[newmonth-1] else: newday = d.day return datetime( newyear, newmonth, newday) def compareWithLastMonth(fname, maxDiff): " go back one month from current date, make sure that linecount diff is at most maxDiff " todayStr = date.today().isoformat() previousDate = lastMonth(date.today()) previousDateStr = previousDate.date().isoformat() previousFname = fname.replace(todayStr, previousDateStr) if not isfile(previousFname): print(("Cannot find %s, cannot compare new data with previous month" % previousFname)) return - previousCount = sum(1 for line in open(previousFname)) - todayCount = sum(1 for line in open(fname)) + previousCount = sum(1 for line in open(previousFname, encoding="utf8")) + todayCount = sum(1 for line in open(fname, encoding="utf8")) percDiff = (todayCount-previousCount)/float(todayCount) if percDiff < 0 and maxDiff: raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \ (previousFname, previousCount, fname, todayCount)) if percDiff > maxDiff: raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount)) # see RM 14350 do more than just linecount # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux cmd = "diff %s %s --speed-large-files | grep '^[\>\<]' | wc -l" % (fname, previousFname) diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)) logging.info("%s: %d lines changed" % (fname, diffCount)) percDiff = float(diffCount)/todayCount if percDiff > maxDiff: logging.info(cmd) @@ -317,66 +400,108 @@ htmlStr = "".join(htmlList) htmlStr += " <small>based on: %s</small>" % reviewStatus if starCount==1: asciiDesc = "%d star" % starCount else: asciiDesc = "%d stars" % starCount return htmlStr, asciiDesc, starCount def downloadFromNcbi(skipDownload): " download files from NCBI and return triple of filenames " today = date.today().isoformat() outSuffix = "%s." % today - url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" + baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/" + url = baseUrl+"variant_summary.txt.gz" outFname = "archive/variant_summary.%s.txt.gz" % today filename = outFname # get the new variant allele file - varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz" + varUrl = baseUrl+"variation_allele.txt.gz" varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today) # get the new hgvs allele file - hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz" + hgvsUrl = baseUrl+"hgvs4variation.txt.gz" hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today) + # get the new VCF file + vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz" + vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today) + + if not (isfile(vcfFname)): + logging.info("Downlading %s to %s" % (vcfUrl, vcfFname)) + open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read()) + if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)): logging.info("Downlading %s to %s" % (url, outFname)) open(outFname, "wb").write(urllib.request.urlopen(url).read()) logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) logging.info("Downlading %s to %s" % (varUrl, varFname)) open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) - return filename, varFname, hgvsFname + + return filename, varFname, hgvsFname, vcfFname + +def parseVcf(vcfFname): + " parse VCF and return as dict alleleId -> (molConseq) " + logging.info("Parsing %s" % vcfFname) + ret = {} + for line in gzip.open(vcfFname, "rt"): + # #CHROM POS ID REF ALT QUAL FILTER INFO + #1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS= + if line.startswith("#"): + continue + chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t") + infoParts = info.split(";") + + alleleId = None + mc = None + for info in infoParts: + key, val = info.split("=") + if key=="ALLELEID": + alleleId = val + elif key=="MC": + mc = val.split("|")[1].split(",")[0].replace("_", " ") + if not mc in possMolConseqs: + print("molecular consequence '%s' in VCF file is not known and so not in trackDb file. ", + "Please update otto script %s" % (mc, __file__)) + sys.exit(1) + + if mc and alleleId: + ret[int(alleleId)] = mc + + return ret # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: - filename, varFName, hgvsFname = downloadFromNcbi(options.skipDownload) + filename, varFName, hgvsFname, vcfFname = downloadFromNcbi(options.skipDownload) else: filename = args[0] varFname = args[1] hgvsFname = args[2] + vcfFname = args[3] #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change +allToVcf = parseVcf(vcfFname) allToHgvs = parseHgvsFile(hgvsFname) # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 @@ -388,50 +513,57 @@ line1 = ifh.readline() if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again logging.error("ClinVar has changed their header line again") logging.error("current header line: %s" % line1.replace("\t","|")) logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) raise Exception("code needs fixing") # open out files hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w", encoding="latin1") hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w", encoding="latin1") hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w", encoding="latin1") longCount = 0 noAssCount = 0 +noMcCount = 0 # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ submCategories, varId = row # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", "")) + molConseq = allToVcf.get(int(alleleId)) + if molConseq is None: + logging.debug("%s has no molConseq" % alleleId) + noMcCount += 1 + molConseq = "" + if chrom=="" or assembly=="" or assembly=="NCBI36": noAssCount += 1 continue if chrom=="Un" and assembly=="GRCh38": print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)) continue chrom = "chr"+chrom if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ? if assembly=="GRCh37": chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 else: chrom = "chrM" @@ -467,84 +599,98 @@ thickStart = start thickEnd = end itemRgb = "0" blockCount = "1" blockSizes = str(int(end)-int(start)) blockStarts = "0" # used until Oct 2017 # if dbVarAcc.startswith("nsv") or "copy number" in allType: # now using just size on genome, see #20268 if (int(end)-int(start))>100: isCnv = True else: isCnv = False - pathoCode = clinSignToPathoCode(clinSign) + pathoCode = clinSignToCode(clinSign) + originCode = originSimpleToCode(originSimple) + allTypeCode = allTypeToCode(allType) if isCnv: - if "gain" in allType or "duplication" in allType: + if "gain" in allType or "duplication" in allType or "insertion" in allType: itemRgb = "0,0,255" if "deletion" in allType or "loss" in allType: itemRgb = "255,0,0" else: if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic - itemRgb = "210,0,0" + itemRgb = "210,0,0" # red elif pathoCode in ["BN", "LB"]: # benign or likely benign - itemRgb = "0,210,0" + itemRgb = "0,210,0" # green + elif pathoCode in ["UC"]: # uncertain + itemRgb = "0,0,128" # dark-blue elif pathoCode in ["CF"]: # conflicting itemRgb = "137,121,212" # light-blue - elif pathoCode in ["OT", "UC"]: # uncertain or other + elif pathoCode in ["OT"]: # other itemRgb = "128,128,128" # grey else: assert(False)# should never happen varLen = int(end)-int(start) geneStr = geneSymbol if geneId.isdigit() and geneId!="-1": geneStr = geneId+"|"+geneSymbol if len(geneStr)>250: geneStr = "not shown, too many genes" + mouseOverName = name + if name=="": name = "No display name was provided by ClinVar for this variant" shortName = "NoName" + mouseOverName = "" if len(name)>90: name = name[:90]+"..." name = varId+"|"+name + if len(mouseOverName)>80: + mouseOverName = mouseOverName[:80]+"..." + if len(hgvsProt) > 90: hgvsProt = hgvsProt[:90]+"..." if len(hgvsCod) > 90: hgvsCod = hgvsCod[:90]+"..." otherIds = accListToHtml(otherIds) phenotypeIds = accListToHtml(phenotypeIds) starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus) - mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"] + mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign, + "Origin: "+origin, "Phenotypes: "+phenotypeList] + # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars mouseOver = ", ".join(mouseOverParts) row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, - name, clinSign, starRatingHtml, allType, geneStr, + name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, - hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)] + hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, + # these fields are the for the filters, not for the display + pathoCode, originCode, allTypeCode, str(varLen), str(starCount)] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38":