410ca755b501b9c84b618ef8e49b9208cff3d145 max Fri Jan 31 05:40:19 2020 -0800 oops, previous commit was into wrong directory diff --git src/hg/utils/otto/clinVarToBed src/hg/utils/otto/clinVarToBed deleted file mode 100755 index ee04313..0000000 --- src/hg/utils/otto/clinVarToBed +++ /dev/null @@ -1,610 +0,0 @@ -#!/usr/bin/env python3 - -import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess -from collections import defaultdict -from os.path import join, basename, dirname, isfile, abspath -from datetime import date, datetime, timedelta - -dbToUrl = { - "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", - "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", - "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", - "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", - "OMIM" : "http://www.omim.org/entry/%s", - "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" -} - -# === COMMAND LINE INTERFACE, OPTIONS AND HELP === -parser = optparse.OptionParser("""usage: %prog [options] summaryFname varAllFname hgvsFname - check and convert the three main clinVar tab-sep files to four bed files, split into CNV and shorter mutations, for both hg19 and hg38 and convert all to bigBed - -Output goes into the current dir - -input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz -""") - -parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") -parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") -parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") -parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") -(options, args) = parser.parse_args() - -if options.debug: - logging.basicConfig(level=logging.DEBUG) -else: - logging.basicConfig(level=logging.INFO) - -clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID\n" -# ==== FUNCTIONs ===== - -def mustRun(cmd): - logging.debug(cmd) - ret = os.system(cmd) - if ret!=0: - print(("Could not run: %s" % cmd)) - sys.exit(1) - -def shortenName(name): - "make the clinvar name shorter and keep a long version for the mouse over " - cnvMatch = cnvRe.match(name) - #hgvsMatch = hgvsRe.match(name) - - if ":" in name and not "chr" in name: - longName = name.split(":")[0] - else: - longName = name - - shortName = name - - if cnvMatch != None: - # some names include the original assembly as a prefix - band, copy = cnvMatch.groups() - if copy == None: - copy = "" - shortName = band+copy - elif name.startswith("GRCh"): - # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant - print(name) - assert False # name that starts with GRCh but does not match our regex should not happen - elif "." in name: - # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del) - # so we remove the gene and the protein description - if ":" in name: - name = name.split(":")[1] - longName = name.split(" ")[0] - name = name.split(" ")[0].split("[")[0].split("(")[0] - shortName = name - - if name.endswith("del"): - shortName = "del" - elif name.endswith("ins"): - shortName = "ins" - elif name.endswith("dup"): - shortName = "dup" - else: - posMatch = posRe.match(name) - if posMatch!=None: - pos, dupDelIns, change = posMatch.groups() - if dupDelIns==None: - dupDelIns= "" - - if change==None or change=="": - shortName = pos+dupDelIns - else: - shortName = dupDelIns+change - - return shortName, longName - -def clinSignToPathoCode(pathoStr): - """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes: - - benign = BN - likely benign = LB - conflicting = CF - likely pathogenic= LP - pathogenic = PG - other = OT - uncertain = UC - - examples: - - Benign/Likely benign,other 13 0.002% - Conflicting interpretations of pathogenicity,risk factor 13 0.002% - Likely benign,other 18 0.003% - Pathogenic,drug response 28 0.005% - protective 32 0.005% - Pathogenic,risk factor 33 0.005% - Pathogenic,other 102 0.017% - Affects 120 0.019% - association 202 0.033% - drug response 387 0.063% - risk factor 444 0.072% - no interpretation for the single variant 564 0.092% - other 1687 0.274% - Pathogenic/Likely pathogenic 5209 0.846% - not provided 8860 1.440% - Benign/Likely benign 19489 3.167% - Likely pathogenic 30200 4.907% - Conflicting interpretations of pathogenicity 30910 5.023% - Pathogenic 66679 10.835% - Benign 77551 12.602% - Likely benign 154870 25.166% - Uncertain significance 217838 35.399% - - """ - pathoStr = pathoStr.lower() - # don't change the order of these unless you tested it. The order is crucial. - if "conflicting" in pathoStr: - return "CF" - if "likely pathogenic" in pathoStr: - return "LP" - if "likely benign" in pathoStr: - return "LB" - if "uncertain" in pathoStr: - return "UC" - if "pathogenic" in pathoStr: - return "PG" - if "benign" in pathoStr: - return "BN" - return "OT" - -def lastMonth(d,x=1): - """ returns same day as this month, but last month, e.g. for 2013-01-01 - return 2012-12-01 - """ - - days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] - newmonth = ((( d.month - 1) - x ) % 12 ) + 1 - newyear = d.year + ((( d.month - 1) - x ) // 12 ) - if d.day > days_of_month[newmonth-1]: - newday = days_of_month[newmonth-1] - else: - newday = d.day - return datetime( newyear, newmonth, newday) - -def compareWithLastMonth(fname, maxDiff): - " go back one month from current date, make sure that linecount diff is at most maxDiff " - todayStr = date.today().isoformat() - previousDate = lastMonth(date.today()) - previousDateStr = previousDate.date().isoformat() - previousFname = fname.replace(todayStr, previousDateStr) - if not isfile(previousFname): - print(("Cannot find %s, cannot compare new data with previous month" % previousFname)) - return - - previousCount = sum(1 for line in open(previousFname)) - todayCount = sum(1 for line in open(fname)) - percDiff = (todayCount-previousCount)/float(todayCount) - - if percDiff < 0 and maxDiff: - raise Exception("line count change negative: %s %d, %s %d. Run without --maxDiff to ignore this error. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % \ - (previousFname, previousCount, fname, todayCount)) - if percDiff > maxDiff: - raise Exception("line count increased too much: yesterday %d , today %d. Check if the input file is OK. If yes, remove the --maxDiff option from the clinVarToBed, call it only with the --auto option. Or run the cronjob script doUpdate.sh with -nocheck to ignore this error." % (previousCount, todayCount)) - # see RM 14350 do more than just linecount - # see http://stackoverflow.com/questions/1566461/how-to-count-differences-between-two-files-on-linux - cmd = "diff %s %s --speed-large-files | grep '^[\>\<]' | wc -l" % (fname, previousFname) - diffCount = int(subprocess.check_output(cmd, shell=True, stderr=subprocess.STDOUT)) - logging.info("%s: %d lines changed" % (fname, diffCount)) - percDiff = float(diffCount)/todayCount - if percDiff > maxDiff: - logging.info(cmd) - diffFname = abspath(join("archive", "lastDiff.txt")) - cmd = "diff %s %s --speed-large-files > %s" % (fname, previousFname, diffFname) - os.system(cmd) # ret code is not 0, as there are differences, so no need to check - raise Exception("too many differences: %d out of %d lines. Look at the differences n the file %s. If you are OK with them, run '/hive/data/outside/otto/clinvar/doUpdate.sh -nocheck'" % (diffCount, todayCount, diffFname)) - -# I'm leaving this in the code, because I would not be surprised if one day we have to go back and parse -# the other files again -#def parseVariationAlleleFile(varFname): -# """ return dict allele Id -> variant ID """ -# logging.info("Parsing %s" % varFname) -# allToVar = {} -# foundHead = False -# expHead = "#VariationID Type AlleleID Interpreted" -# for line in gzip.open(varFname): -# line = line.rstrip("\n") -# if line==expHead: -# foundHead = True -# if line.startswith("#Var") and "Type" in line: -# if not line == expHead: -# print(("Header of variation_allele.txt.gz has changed again. Code needs fixing")) -# print("found:",line) -# print("expected:",expHead) -# else: -# foundHead = True -# -# if line.startswith("#"): -# continue -# if not foundHead: -# raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing") -# fields = line.split("\t") -# allToVar[fields[2]]=fields[0] -# return allToVar - -def parseHgvsFile(hgvsFname): - " return dict: allele Id -> (hgvs coding, hgvs protein) " - logging.info("Parsing %s" % hgvsFname) - expHead = "#Symbol GeneID VariationID AlleleID Type Assembly NucleotideExpression NucleotideChange ProteinExpression ProteinChange UsedForNaming Submitted OnRefSeqGene" - allToHgvs = {} - foundHead = False - for line in gzip.open(hgvsFname, "rt"): - line = line.rstrip("\n") - if line==expHead: - foundHead = True - if line.startswith("#Symbol") and "GeneID" in line: - if not line == expHead: - print ("Header of hgvs file has changed again. Code needs fixing") - print(("found:",line)) - print(("expected:",expHead)) - else: - foundHead = True - - if line.startswith("#"): - continue - if not foundHead: - raise Exception("Invalid Header. Code needs fixing, again.") - - fields = line.split("\t") - varType = fields[4] - if "coding" in varType and not "LRG" in fields[6]: - allToHgvs[fields[3]]=(fields[6], fields[8]) - return allToHgvs - -def accListToHtml(inStr): - """ - given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions - to HTML links to the databases - """ - newParts = [] - for part in inStr.split(","): - fields = part.split(":") - if len(fields)!=2: - newParts.append(part) - continue - - db, acc = fields - accForUrl = acc - - if db in dbToUrl: - if "OMIM" in db: - accForUrl = accForUrl.replace(".", "#") - url = (dbToUrl[db] % accForUrl) - newPart = "%s:%s" % (url, db, acc) - else: - newPart = part - newParts.append(newPart) - return ", ".join(newParts) - -def reviewStatusToHtmlStars(reviewStatus): - """ - In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this: - 1-criteria provided, conflicting interpretations - 2-criteria provided, multiple submitters, no conflicts - 1-criteria provided, single submitter - 0-no assertion criteria provided - 0-no assertion for the individual variant - 0-no assertion provided - 4-practice guideline - 3-reviewed by expert panel - - given the review status, return an html string with the correct number of black stars - """ - revStatus = reviewStatus.strip().lower().replace(" ", "") - starCount = None - - if revStatus == "nointerpretationforthesinglevariant": - starCount = 0 - if revStatus=="criteriaprovided,conflictinginterpretations": - starCount = 1 - if revStatus=="criteriaprovided,multiplesubmitters,noconflicts": - starCount = 2 - if revStatus=="criteriaprovided,singlesubmitter": - starCount = 1 - if revStatus == "practiceguideline": - starCount = 4 - if revStatus == "reviewedbyexpertpanel": - starCount = 3 - if "noassertion" in revStatus or revStatus=="-": - starCount = 0 - - if starCount == None: - print(("Illegal review status: %s" % revStatus)) - assert(False) - - emptyStarCount = 4-starCount - - htmlList = ["★"]*starCount - htmlList.extend( ["☆"]*emptyStarCount ) - - htmlStr = "".join(htmlList) - htmlStr += "  based on: %s" % reviewStatus - - asciiDesc = "%d stars" % starCount - return htmlStr, asciiDesc, starCount - -def downloadFromNcbi(skipDownload): - " download files from NCBI and return triple of filenames " - today = date.today().isoformat() - outSuffix = "%s." % today - - url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" - outFname = "archive/variant_summary.%s.txt.gz" % today - filename = outFname - - # get the new variant allele file - varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz" - varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today) - - # get the new hgvs allele file - hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz" - hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today) - - if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)): - logging.info("Downlading %s to %s" % (url, outFname)) - open(outFname, "wb").write(urllib.request.urlopen(url).read()) - logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) - open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) - logging.info("Downlading %s to %s" % (varUrl, varFname)) - open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) - return filename, varFname, hgvsFname - -# ----------- MAIN -------------- -if args==[] and not options.auto: - parser.print_help() - exit(1) - -outSuffix = "" - -if options.auto: - filename, varFName, hgvsFname = downloadFromNcbi(options.skipDownload) -else: - filename = args[0] - varFname = args[1] - hgvsFname = args[2] - -#allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change -allToHgvs = parseHgvsFile(hgvsFname) - -# e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 -# GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) -# GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) -# GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant -cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') - -hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') - -# e.g. -# c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA -# c.80A>T -# c.740+356C>T -# c.3839_3855del17 -posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$') - -ifh = gzip.open(filename, "rt") - -# check the header -line1 = ifh.readline() -if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again - logging.error("ClinVar has changed their header line again") - logging.error("current header line: %s" % line1.replace("\t","|")) - logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) - raise Exception("code needs fixing") - -# open out files -hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w") -hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w") -hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w") -hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w") - -longCount = 0 -noAssCount = 0 - -# convert lines -for line in ifh: - line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions - line = line.rstrip("\n") - row = line.split("\t") - row = [f.replace("\0", "") for f in row ] - - alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ - irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ - refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ - submCategories, varId = row - - # changed in July 2018 when the "varId" was merged into this table. It used to be - # in allToVar - if varId=="": - logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) - - hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", "")) - - if chrom=="" or assembly=="" or assembly=="NCBI36": - noAssCount += 1 - continue - - if chrom=="Un" and assembly=="GRCh38": - print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)) - continue - chrom = "chr"+chrom - - if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ? - if assembly=="GRCh37": - chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 - else: - chrom = "chrM" - - shortName, longName = shortenName(name) - - if len(shortName)>20: - shortName = shortName[:17]+"..." - longCount+=1 - - if len(longName)>60: - longName = longName[:60]+"..." - - if start=="" or end=="": - print(("undefined start or end coordinate. record %s"% irvcAcc)) - continue - - if int(start)<0 or int(end)<0: - print(("illegal start or end coordinate. record %s"% irvcAcc)) - continue - - # sometimes they seem to inverse their coordinates - if int(start)>int(end): - start, end = end, start - - if chrom=="chr17" and (end=="217748468" or end=="84062405"): - print("blacklisted feature") - continue - - # reorder columns for bigBed - start = str(int(start)-1) # convert to zero-based, half-open - score = "0" - strand = "." - thickStart = start - thickEnd = end - itemRgb = "0" - blockCount = "1" - blockSizes = str(int(end)-int(start)) - blockStarts = "0" - - # used until Oct 2017 - # if dbVarAcc.startswith("nsv") or "copy number" in allType: - # now using just size on genome, see #20268 - if (int(end)-int(start))>100: - isCnv = True - else: - isCnv = False - - if isCnv: - if "gain" in allType or "duplication" in allType: - itemRgb = "0,0,255" - if "deletion" in allType or "loss" in allType: - itemRgb = "255,0,0" - else: - isPat, isBen = False, False - if "pathogenic" in clinSign.lower(): - isPat = True - if "benign" in clinSign.lower(): - isBen = True - - if (isPat==True and isBen==True) or (isPat==False and isBen==False): - itemRgb = "128,128,128" - elif isPat==True: - itemRgb = "210,0,0" - elif isBen==True: - itemRgb = "0,210,0" - else: - assert(False)# should never happen - - pathoCode = clinSignToPathoCode(clinSign) - varLen = int(end)-int(start) - - geneStr = geneSymbol - if geneId.isdigit() and geneId!="-1": - geneStr = geneId+"|"+geneSymbol - if len(geneStr)>250: - geneStr = "not shown, too many genes" - - if name=="": - name = "No display name was provided by ClinVar for this variant" - shortName = "NoName" - - if len(name)>90: - name = name[:90]+"..." - name = varId+"|"+name - - if len(hgvsProt) > 90: - hgvsProt = hgvsProt[:90]+"..." - if len(hgvsCod) > 90: - hgvsCod = hgvsCod[:90]+"..." - - otherIds = accListToHtml(otherIds) - - phenotypeIds = accListToHtml(phenotypeIds) - - starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus) - - mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"] - mouseOver = ", ".join(mouseOverParts) - - row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ - blockCount, blockSizes, blockStarts, - name, clinSign, starRatingHtml, allType, geneStr, - snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, - hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)] - - # replace clinvar's placeholders with real empty fields - newRow = [] - for x in row: - if x in ["-1", "-"]: - newRow.append("") - else: - newRow.append(x) - row = newRow - - if assembly=="GRCh37": - ofh = hg19Bed - if isCnv: - ofh = hg19BedCnv - elif assembly=="GRCh38": - ofh = hg38Bed - if isCnv: - ofh = hg38BedCnv - else: - noAssCount +=1 - - ofh.write("\t".join(row)) - ofh.write("\n") - -hg19Bed.close() -hg38Bed.close() -hg19BedCnv.close() -hg38BedCnv.close() - -logging.info("%d lines with feature name that was too long, shortened them" % longCount) -logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount) - -fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name] - -# sort the files -for fname in fnames: - cmd = "bedSort %s %s" % (fname, fname) - logging.debug(cmd) - assert(os.system(cmd)==0) - -# check new bed files against last months's bed files -if options.maxDiff: - for fname in fnames: - compareWithLastMonth(fname, options.maxDiff) - -# convert to bigBed without the date in the fname, like clinvarMain.hg19.bb -tmpFnames = [] -for fname in fnames: - parts = fname.split(".") - if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed - base, db, date, ext = parts - else: - base, db, ext = parts - - outFname = "%s.%s.bb" % (base, db) - tmpFname = "%s.%s.temp.bb" % (base, db) - - outFname = basename(outFname) # strip the archive/ part, put into current dir - cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) - mustRun(cmd) - - tableName = outFname.split('.')[0] # e.g. clinvarMain - - tmpFnames.append( (db, tableName, tmpFname, outFname) ) - -# make it atomic by renaming the temp files to the final file names at the very end -for db, tableName, tmpFname, finalFname in tmpFnames: - os.rename(tmpFname, finalFname) - -# make sure that the "Data last updated" in the table browser is correct -for db, tableName, tmpFname, finalFname in tmpFnames: - cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName) - mustRun(cmd) - -print("ClinVar update end: OK")