70dea5ffaf352ada83e54b03aa0774bf515cbb65 max Mon Sep 21 03:39:46 2020 -0700 adding the Benet-Pages colorscheme to clinvar, fixing up the 'g.' names and auto-loading the clinvarSub table for Braney's new bigLolli clinvar submissions track. Also fixing the categorization of submissions. refs #25207, #25405 and emails with Chris/Braney. diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index c3b479d..de72ff2 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -1,18 +1,19 @@ #!/usr/bin/env python3 import logging, sys, optparse, re, os, datetime, gzip, urllib.request, urllib.error, urllib.parse, subprocess +import tempfile from collections import defaultdict from os.path import join, basename, dirname, isfile, abspath from datetime import date, datetime, timedelta dbToUrl = { "dbVar": "https://www.ncbi.nlm.nih.gov/dbvar/variants/%s/", "UniProtKB (variants)" : "http://www.uniprot.org/uniprot/%s", "OMIM Allelic Variant" : "http://www.omim.org/entry/%s", "MedGen": "https://www.ncbi.nlm.nih.gov/medgen/%s", "OMIM" : "http://www.omim.org/entry/%s", "Orphanet" : "http://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=%s" } # since we're filtering on it, we make sure that we have all molecular consequences in the tdb file # if they add a new one, this script must fail and tdb must be updated. @@ -25,33 +26,120 @@ input file is ftp://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz """) parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages") parser.add_option("-a", "--auto", dest="auto", action="store_true", help="download the file from NCBI into the current dir and convert to bigBed") parser.add_option("", "--skipDown", dest="skipDownload", action="store_true", help="Only with --auto: don't download again if it's already there, useful when debugging/developing") parser.add_option("", "--maxDiff", dest="maxDiff", action="store", type="float", help="look for last month's download file in current dir and accept this much difference, expressed as a ratio. Can only be used with --auto.") (options, args) = parser.parse_args() if options.debug: logging.basicConfig(level=logging.DEBUG) else: logging.basicConfig(level=logging.INFO) -clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID\n" +clinvarExpectHeaders = "#AlleleID Type Name GeneID GeneSymbol HGNC_ID ClinicalSignificance ClinSigSimple LastEvaluated RS# (dbSNP) nsv/esv (dbVar) RCVaccession PhenotypeIDS PhenotypeList Origin OriginSimple Assembly ChromosomeAccession Chromosome Start Stop ReferenceAllele AlternateAllele Cytogenetic ReviewStatus NumberSubmitters Guidelines TestedInGTR OtherIDs SubmitterCategories VariationID PositionVCF ReferenceAlleleVCF AlternateAlleleVCF\n" # ==== FUNCTIONs ===== +# benign = B, likely benign = LB, conflicting = CF, likely pathogenic= LP, +# pathogenic = P, other = OT, uncertain = VUS, RF=risk factor +# colors were defined by Ana Benet Pages +# see colortable at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing +cnvColors = { + "INS" : { + "OT" : "225,165,0", + "B" : "250,214,69", + "LB" : "250,214,69", + "CF" : "225,165,0", + "RF" : "225,165,0", + "VUS" : "225,165,0", + "LP" : "199,114,3", + "P" : "199,114,3" + }, + "LOSS" : { + "OT" : "255,0,0", + "B" : "255,98,119", + "LB" : "255,98,119", + "CF" : "255,0,0", + "VUS" : "255,0,0", + "RF" : "255,0,0", + "LP" : "153,0,0", + "P" : "153,0,0" + }, + "GAIN" : { + "OT" : "0,0,225", + "B" : "77,184,255", + "LB" : "77,184,255", + "CF" : "0,0,225", + "VUS" : "0,0,225", + "RF" : "0,0,225", + "LP" : "0,0,179", + "P" : "0,0,179" + }, + "STRUCT" : { + "OT" : "0,210,0", + "B" : "0,255,0", + "LB" : "0,255,0", + "CF" : "0,210,0", + "VUS" : "0,210,0", + "RF" : "0,210,0", + "LP" : "0,128,0", + "P" : "0,128,0" + }, + "INV" : { + "OT" : "192,0,192", + "B" : "255,128,255", + "LB" : "255,128,255", + "CF" : "192,0,192", + "VUS" : "192,0,192", + "RF" : "192,0,192", + "LP" : "128,0,128", + "P" : "128,0,128" + }, + "SEQALT" : { + "OT" : "128,128,128", + "B" : "191,191,191", + "LB" : "191,191,191", + "CF" : "128,128,128", + "VUS" : "128,128,128", + "RF" : "128,128,128", + "LP" : "64,64,64", + "P" : "64,64,64" + }, + "SEQLEN" : { + "OT" : "0,179,179", + "B" : "0,255,255", + "LB" : "0,255,255", + "CF" : "0,179,179", + "VUS" : "0,179,179", + "RF" : "0,179,179", + "LP" : "0,102,102", + "P" : "0,102,102" + }, + "SUBST" : { + "OT" : "128,128,128", + "B" : "191,191,191", + "LB" : "191,191,191", + "CF" : "128,128,128", + "VUS" : "128,128,128", + "RF" : "128,128,128", + "LP" : "64,64,64", + "P" : "64,64,64" + }, +} + def mustRun(cmd): logging.debug(cmd) ret = os.system(cmd) if ret!=0: print(("Could not run: %s" % cmd)) sys.exit(1) def shortenName(name): "make the clinvar name shorter and keep a long version for the mouse over " cnvMatch = cnvRe.match(name) #hgvsMatch = hgvsRe.match(name) if ":" in name and not "chr" in name: longName = name.split(":")[0] else: @@ -63,96 +151,120 @@ # some names include the original assembly as a prefix band, copy = cnvMatch.groups() if copy == None: copy = "" shortName = band+copy elif name.startswith("GRCh"): # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant print(name) assert False # name that starts with GRCh but does not match our regex should not happen elif "." in name: # many names look like NM_000274.3(OAT):c.550_552delGCT (p.Ala184del) # so we remove the gene and the protein description if ":" in name: name = name.split(":")[1] longName = name.split(" ")[0] + + # handle genomic HGVS NC_000010.11:g.(?_87925503)_(87965482_?)del + if name.startswith("NC_") and "g." in name and ")" in name: + shortName = name + else: + # strip away the final description like " (p.Ala184del)" name = name.split(" ")[0].split("[")[0].split("(")[0] shortName = name if name.endswith("del"): shortName = "del" elif name.endswith("ins"): shortName = "ins" elif name.endswith("dup"): shortName = "dup" else: posMatch = posRe.match(name) if posMatch!=None: pos, dupDelIns, change = posMatch.groups() if dupDelIns==None: dupDelIns= "" if change==None or change=="": shortName = pos+dupDelIns else: shortName = dupDelIns+change return shortName, longName -def allTypeToCode(allType): +def allTypeToCode(allType, debugId): """ Map the allele type field from ClinVal to one of these classes for filtering: - SNV = single nucleotide variant - INS = insertion - DEL = deletion - INDEL = indel - DUPL = duplication - OTH = other + STRUCT = structual alteration + LOSS = deletion, copy number loss, mobile element deletion + GAIN = duplication, tandem dupl, intrachrom transp, copy number gain + INS = insertion, mobile or novel element insertion + INV = inversion + SEQALT = catch-all category: multiallele cnv, undetermined, others, not defined + SEQLEN = short tandem repeat, short repeat, rep exp, rep contraction, NT expansion + SUBST = SNV, MNV, multi nucl polymophrism, indel + OTH = protein only + + # categories were defined by Ana Benet Pages + # see table at https://drive.google.com/file/d/1-iqHrZTwT1wuatNA_A5b6knyqE6JG14f/view?usp=sharing In 3/2020, these were all possible values: fusion 6 0.000% complex 61 0.005% protein only 103 0.008% NT expansion 142 0.011% Translocation 302 0.022% inversion 583 0.043% undetermined variant 866 0.064% insertion 7133 0.530% indel 8490 0.630% short repeat 18778 1.394% duplication 34148 2.535% copy number loss 40441 3.002% copy number gain 40671 3.019% deletion 77325 5.740% single nucleotide variant 1118033 82.997% """ - if allType=="single nucleotide variant": - return "SNV" + allType = allType.lower() # they changed the casing in Sep 2020 + # doing this first, as this is 80% of the data -> faster + if allType in ["single nucleotide variant", "variation"]: + # 'variation' doesn't fit anywhere, but we think that these are mostly OMIM imports, so SNV sounds OK + return "SUBST" + # the order here matches Ana's coloring table for easier reading + elif allType in ["translocation", "fusion", "complex"]: + return "STRUCT" elif allType in ["deletion", "copy number loss"]: - return "DEL" + return "LOSS" elif allType in ["duplication", "copy number gain", "tandem duplication"]: - return "DUPL" - elif allType=="indel": - return "INDEL" - elif allType in ["insertion", "NT expansion", "short repeat"]: + return "GAIN" + elif allType in ["indel", "insertion"]: return "INS" - elif allType in ["Translocation", "fusion", "complex", "protein only", "inversion", "undetermined variant"]: - return "OTH" + elif allType in ["inversion"]: + return "INV" + elif allType in ["undetermined variant"]: + return "SEQALT" + elif allType in ["short repeat", "nt expansion", "microsatellite"]: + return "SEQLEN" + #elif allType in ["variation"]: # appeared in Sep 2020 + #return "OTH" + #elif allType in ["protein only"]: # looks like they removed protein only sometimes in early 2020 + #return "OTH" else: - print("found unknown allele type: "+allType) + print("found unknown allele type '%s' for accession %s"%(allType, debugId)) assert(False) # found new allele type in ClinVar table : code needs fixing def originSimpleToCode(originSimple): """ for filtering, group the simplified origins into simpler origin codes: return one of these values: "GERM" = germline "SOM" = somatic "GERMSOM" = germline/somatic "NOVO" = de novo "UNK" = unknown as of 3/2020, these are the values in originSimple and how often they appear: tested-inconclusive 91 0.007% @@ -203,42 +315,47 @@ no interpretation for the single variant 564 0.092% other 1687 0.274% Pathogenic/Likely pathogenic 5209 0.846% not provided 8860 1.440% Benign/Likely benign 19489 3.167% Likely pathogenic 30200 4.907% Conflicting interpretations of pathogenicity 30910 5.023% Pathogenic 66679 10.835% Benign 77551 12.602% Likely benign 154870 25.166% Uncertain significance 217838 35.399% """ pathoStr = pathoStr.lower() # don't change the order of these unless you tested it. The order is crucial. + # the aim of this order is to err on the side of caution: if a variant is VUS and pathogenic, + # we rather classify it as VUS (=triggering manual review). If it's pathogenic and benign at the same + # time, we rather classify it as pathogenic if "conflicting" in pathoStr: return "CF" + if "uncertain" in pathoStr: + return "VUS" + if "risk factor" in pathoStr: + return "RF" if "likely pathogenic" in pathoStr: return "LP" + if "pathogenic" in pathoStr: + return "P" if "likely benign" in pathoStr: return "LB" - if "uncertain" in pathoStr: - return "UC" - if "pathogenic" in pathoStr: - return "PG" if "benign" in pathoStr: - return "BN" + return "B" return "OT" def lastMonth(d,x=1): """ returns same day as this month, but last month, e.g. for 2013-01-01 return 2012-12-01 """ days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] newmonth = ((( d.month - 1) - x ) % 12 ) + 1 newyear = d.year + ((( d.month - 1) - x ) // 12 ) if d.day > days_of_month[newmonth-1]: newday = days_of_month[newmonth-1] else: newday = d.day return datetime( newyear, newmonth, newday) @@ -323,51 +440,56 @@ if line.startswith("#"): continue if not foundHead: raise Exception("Invalid Header. Code needs fixing, again.") fields = line.split("\t") varType = fields[4] if "coding" in varType and not "LRG" in fields[6]: allToHgvs[fields[3]]=(fields[6], fields[8]) return allToHgvs def accListToHtml(inStr): """ given a string of a list of db:acc tuples, like "dbVar:nssv578901, omim:12343", convert the accessions - to HTML links to the databases + to HTML links to the databases. Also, pull out the dbVar SSV accession for its own field later. """ + dbVarAcc = "" newParts = [] for part in inStr.split(","): fields = part.split(":") if len(fields)!=2: newParts.append(part) continue db, acc = fields accForUrl = acc + if db=="dbVar" and len(acc)>5 and acc[1:4]=="ssv": + # can be nssv/essv/essv, see https://www.ncbi.nlm.nih.gov/dbvar/content/faq/#nsvnssv + dbVarAcc = acc + if db in dbToUrl: if "OMIM" in db: accForUrl = accForUrl.replace(".", "#") url = (dbToUrl[db] % accForUrl) newPart = "%s:%s" % (url, db, acc) else: newPart = part newParts.append(newPart) - return ", ".join(newParts) + return (", ".join(newParts)), dbVarAcc def reviewStatusToHtmlStars(reviewStatus): """ In email from Donna maglott@ncbi.nlm.nih.gov mapping is like this: 1-criteria provided, conflicting interpretations 2-criteria provided, multiple submitters, no conflicts 1-criteria provided, single submitter 0-no assertion criteria provided 0-no assertion for the individual variant 0-no assertion provided 4-practice guideline 3-reviewed by expert panel given the review status, return an html string with the correct number of black stars """ @@ -395,65 +517,89 @@ emptyStarCount = 4-starCount htmlList = ["★"]*starCount htmlList.extend( ["☆"]*emptyStarCount ) htmlStr = "".join(htmlList) htmlStr += "  based on: %s" % reviewStatus if starCount==1: asciiDesc = "%d star" % starCount else: asciiDesc = "%d stars" % starCount return htmlStr, asciiDesc, starCount +def loadSubmissions(summFname): + " load the submissions into the Mysql table for the lollipop track details page " + # not doing the load+rename strategy yet, I don't think it makes a difference here + logging.info("Loading submissions table %s into mysql, hg19 and also hg38" % summFname) + fh = tempfile.NamedTemporaryFile(prefix="clinVarToBed", suffix=".tsv") + tmpFname = fh.name + cmd= "zgrep -v '^#' %s > %s" % (summFname, tmpFname) + assert(os.system(cmd)==0) + + cmd = "chmod a+r %s" % (tmpFname) # otherwise mysqld can't read the file + assert(os.system(cmd)==0) + + cmd = "hgLoadSqlTab hg19 clinvarSub clinvarSub.sql %s" % (tmpFname) + assert(os.system(cmd)==0) + + cmd = "hgLoadSqlTab hg38 clinvarSub clinvarSub.sql %s" % (tmpFname) + assert(os.system(cmd)==0) + def downloadFromNcbi(skipDownload): " download files from NCBI and return triple of filenames " today = date.today().isoformat() outSuffix = "%s." % today baseUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/" url = baseUrl+"variant_summary.txt.gz" outFname = "archive/variant_summary.%s.txt.gz" % today filename = outFname # get the new variant allele file varUrl = baseUrl+"variation_allele.txt.gz" varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today) # get the new hgvs allele file hgvsUrl = baseUrl+"hgvs4variation.txt.gz" hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today) + # get the summary file + summUrl = baseUrl+"submission_summary.txt.gz" + summFname = join(dirname(filename), "submission_summary.%s.txt.gz" % today) + # get the new VCF file vcfUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/vcf_GRCh38/clinvar.vcf.gz" vcfFname = join(dirname(filename), "GRCh38.clinvar.%s.vcf.gz" % today) if not (isfile(vcfFname)): logging.info("Downlading %s to %s" % (vcfUrl, vcfFname)) open(vcfFname, "wb").write(urllib.request.urlopen(vcfUrl).read()) - if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)): + if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname) and isfile(summFname)): logging.info("Downlading %s to %s" % (url, outFname)) open(outFname, "wb").write(urllib.request.urlopen(url).read()) logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) logging.info("Downlading %s to %s" % (varUrl, varFname)) open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) + logging.info("Downlading %s to %s" % (summUrl, summFname)) + open(summFname, "wb").write(urllib.request.urlopen(summUrl).read()) - return filename, varFname, hgvsFname, vcfFname + return filename, varFname, hgvsFname, vcfFname, summFname def parseVcf(vcfFname): " parse VCF and return as dict alleleId -> (molConseq) " logging.info("Parsing %s" % vcfFname) ret = {} for line in gzip.open(vcfFname, "rt", encoding="utf_8"): # #CHROM POS ID REF ALT QUAL FILTER INFO #1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS= if line.startswith("#"): continue chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t") infoParts = info.split(";") alleleId = None mc = None @@ -469,45 +615,46 @@ sys.exit(1) if mc and alleleId: ret[int(alleleId)] = mc return ret # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: - filename, varFName, hgvsFname, vcfFname = downloadFromNcbi(options.skipDownload) + filename, varFName, hgvsFname, vcfFname, summFname = downloadFromNcbi(options.skipDownload) else: filename = args[0] varFname = args[1] hgvsFname = args[2] vcfFname = args[3] #allToVar = parseVariationAlleleFile(varFname) # the alleles are now in the hgvs file after a Clinvar change allToVcf = parseVcf(vcfFname) allToHgvs = parseHgvsFile(hgvsFname) # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant +# NC_000010.11:g.(?_87925503)_(87965482_?)del cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$') ifh = gzip.open(filename, "rt", encoding="latin_1") # check the header line1 = ifh.readline() @@ -525,31 +672,31 @@ longCount = 0 noAssCount = 0 noMcCount = 0 # convert lines for line in ifh: line = line.replace(", ", ",").replace(";", ",") # replace, bigBed conventions line = line.rstrip("\n") row = line.split("\t") row = [f.replace("\0", "") for f in row ] alleleId, allType, name, geneId, geneSymbol, hgncId, clinSign, clinSignSimple, lastEval, snpAcc, dbVarAcc, \ irvcAcc, phenotypeIds, phenotypeList, origin, originSimple, assembly, chromAcc, chrom, start, end, \ refAll, varAll, cytogenetic, reviewStatus, numberSubmitters, guidelines, inGtr, otherIds, \ - submCategories, varId = row + submCategories, varId, posVcf, refAllVcf, altAllVcf = row # changed in July 2018 when the "varId" was merged into this table. It used to be # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", "")) molConseq = allToVcf.get(int(alleleId)) if molConseq is None: logging.debug("%s has no molConseq" % alleleId) noMcCount += 1 molConseq = "" if chrom=="" or assembly=="" or assembly=="NCBI36": @@ -565,35 +712,35 @@ if assembly=="GRCh37": chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 else: chrom = "chrM" shortName, longName = shortenName(name) if len(shortName)>20: shortName = shortName[:17]+"..." longCount+=1 if len(longName)>60: longName = longName[:60]+"..." if start=="" or end=="": - print(("undefined start or end coordinate. record %s"% irvcAcc)) + print("empty start or end coordinate. record %s, start/end = %s/%s "% (irvcAcc, start, end)) continue if int(start)<0 or int(end)<0: - print(("illegal start or end coordinate. record %s"% irvcAcc)) + print("illegal start or end coordinate. record %s, start/end = %s/%s"% (irvcAcc, start, end)) continue # sometimes they seem to inverse their coordinates if int(start)>int(end): start, end = end, start if chrom=="chr17" and (end=="217748468" or end=="84062405"): print("blacklisted feature") continue # reorder columns for bigBed start = str(int(start)-1) # convert to zero-based, half-open score = "0" strand = "." thickStart = start @@ -602,117 +749,128 @@ blockCount = "1" blockSizes = str(int(end)-int(start)) blockStarts = "0" # used until Oct 2017 # if dbVarAcc.startswith("nsv") or "copy number" in allType: # now using just size on genome, see #20268 # shortening to >= 50bp to bring into line with LOVD and gnomAD if (int(end)-int(start))>=50: isCnv = True else: isCnv = False pathoCode = clinSignToCode(clinSign) originCode = originSimpleToCode(originSimple) - allTypeCode = allTypeToCode(allType) + allTypeCode = allTypeToCode(allType, "|".join(row)) if isCnv: - if "gain" in allType or "duplication" in allType or "insertion" in allType: - itemRgb = "0,0,255" - if "deletion" in allType or "loss" in allType: - itemRgb = "255,0,0" + typeColors = cnvColors[allTypeCode] + #else: + #print("CNV encountered with unknown type: %s / %s"%(allType, allTypeCode)) + #assert(False) + itemRgb = typeColors.get(pathoCode) + if itemRgb is None: + print("CNV encountered with unknown pathoCode: "+pathoCode) + assert(False) + else: - if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic + if pathoCode in ["P", "LP"]: # pathogenic or likely pathogenic itemRgb = "210,0,0" # red - elif pathoCode in ["BN", "LB"]: # benign or likely benign + elif pathoCode in ["B", "LB"]: # benign or likely benign itemRgb = "0,210,0" # green - elif pathoCode in ["UC"]: # uncertain + elif pathoCode in ["VUS", "RF"]: # uncertain itemRgb = "0,0,128" # dark-blue elif pathoCode in ["CF"]: # conflicting itemRgb = "137,121,212" # light-blue elif pathoCode in ["OT"]: # other itemRgb = "128,128,128" # grey else: assert(False)# should never happen varLen = int(end)-int(start) geneStr = geneSymbol if geneId.isdigit() and geneId!="-1": geneStr = geneId+"|"+geneSymbol if len(geneStr)>250: geneStr = "not shown, too many genes" mouseOverName = name - if name=="": + otherIds, dbVarSsvAcc = accListToHtml(otherIds) + + if isCnv and dbVarSsvAcc!="": + shortName = dbVarSsvAcc + elif name=="": name = "No display name was provided by ClinVar for this variant" shortName = "NoName" mouseOverName = "" if len(name)>90: name = name[:90]+"..." name = varId+"|"+name if len(mouseOverName)>80: mouseOverName = mouseOverName[:80]+"..." if len(hgvsProt) > 90: hgvsProt = hgvsProt[:90]+"..." if len(hgvsCod) > 90: hgvsCod = hgvsCod[:90]+"..." - otherIds = accListToHtml(otherIds) - - phenotypeIds = accListToHtml(phenotypeIds) + phenotypeIds, _ = accListToHtml(phenotypeIds) starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus) mouseOverParts = [mouseOverName, "Type: "+allType, "Consequence: "+ molConseq, "Significance: "+clinSign, "Origin: "+origin, "Phenotypes: "+phenotypeList] # removed Apr 2020: numberSubmitters+" submitters", "Level: "+asciiStars mouseOver = ", ".join(mouseOverParts) row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, molConseq, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, # these fields are the for the filters, not for the display - pathoCode, originCode, allTypeCode, str(varLen), str(starCount)] + pathoCode, originCode, allTypeCode, str(varLen), str(starCount), + # the variant ID got forgotten in the original bigBed schema, it was only part of the origName field + varId, dbVarSsvAcc] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38": ofh = hg38Bed if isCnv: ofh = hg38BedCnv else: noAssCount +=1 + #for o in row: + #print(o, type(o)) ofh.write("\t".join(row)) ofh.write("\n") hg19Bed.close() hg38Bed.close() hg19BedCnv.close() hg38BedCnv.close() logging.info("%d lines with feature name that was too long, shortened them" % longCount) logging.info("%d lines without, with old assembly or no chrom, skipped" % noAssCount) fnames = [hg19Bed.name, hg38Bed.name, hg19BedCnv.name, hg38BedCnv.name] # sort the files for fname in fnames: @@ -726,32 +884,34 @@ compareWithLastMonth(fname, options.maxDiff) # convert to bigBed without the date in the fname, like clinvarMain.hg19.bb tmpFnames = [] for fname in fnames: parts = fname.split(".") if len(parts) == 4: # clinvarMain.hg19.2011-11-11.bed base, db, date, ext = parts else: base, db, ext = parts outFname = "%s.%s.bb" % (base, db) tmpFname = "%s.%s.temp.bb" % (base, db) outFname = basename(outFname) # strip the archive/ part, put into current dir - cmd = "bedToBigBed -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) + cmd = "bedToBigBed -extraIndex=_dbVarSsvId -tab -type=bed9+ -as=clinvar.as %s /hive/data/genomes/%s/chrom.sizes %s" % (fname, db, tmpFname) mustRun(cmd) tableName = outFname.split('.')[0] # e.g. clinvarMain tmpFnames.append( (db, tableName, tmpFname, outFname) ) # make it atomic by renaming the temp files to the final file names at the very end for db, tableName, tmpFname, finalFname in tmpFnames: os.rename(tmpFname, finalFname) # make sure that the "Data last updated" in the table browser is correct for db, tableName, tmpFname, finalFname in tmpFnames: cmd = "hgBbiDbLink %s %s /gbdb/%s/bbi/clinvar/%s.bb" % (db, tableName, db, tableName) mustRun(cmd) +loadSubmissions(summFname) + print("ClinVar update end: OK")