410ca755b501b9c84b618ef8e49b9208cff3d145 max Fri Jan 31 05:40:19 2020 -0800 oops, previous commit was into wrong directory diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 891b086..ee04313 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -82,30 +82,82 @@ shortName = "dup" else: posMatch = posRe.match(name) if posMatch!=None: pos, dupDelIns, change = posMatch.groups() if dupDelIns==None: dupDelIns= "" if change==None or change=="": shortName = pos+dupDelIns else: shortName = dupDelIns+change return shortName, longName +def clinSignToPathoCode(pathoStr): + """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes: + - benign = BN + likely benign = LB + conflicting = CF + likely pathogenic= LP + pathogenic = PG + other = OT + uncertain = UC + + examples: + + Benign/Likely benign,other 13 0.002% + Conflicting interpretations of pathogenicity,risk factor 13 0.002% + Likely benign,other 18 0.003% + Pathogenic,drug response 28 0.005% + protective 32 0.005% + Pathogenic,risk factor 33 0.005% + Pathogenic,other 102 0.017% + Affects 120 0.019% + association 202 0.033% + drug response 387 0.063% + risk factor 444 0.072% + no interpretation for the single variant 564 0.092% + other 1687 0.274% + Pathogenic/Likely pathogenic 5209 0.846% + not provided 8860 1.440% + Benign/Likely benign 19489 3.167% + Likely pathogenic 30200 4.907% + Conflicting interpretations of pathogenicity 30910 5.023% + Pathogenic 66679 10.835% + Benign 77551 12.602% + Likely benign 154870 25.166% + Uncertain significance 217838 35.399% + + """ + pathoStr = pathoStr.lower() + # don't change the order of these unless you tested it. The order is crucial. + if "conflicting" in pathoStr: + return "CF" + if "likely pathogenic" in pathoStr: + return "LP" + if "likely benign" in pathoStr: + return "LB" + if "uncertain" in pathoStr: + return "UC" + if "pathogenic" in pathoStr: + return "PG" + if "benign" in pathoStr: + return "BN" + return "OT" + def lastMonth(d,x=1): """ returns same day as this month, but last month, e.g. for 2013-01-01 return 2012-12-01 """ days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31] newmonth = ((( d.month - 1) - x ) % 12 ) + 1 newyear = d.year + ((( d.month - 1) - x ) // 12 ) if d.day > days_of_month[newmonth-1]: newday = days_of_month[newmonth-1] else: newday = d.day return datetime( newyear, newmonth, newday) def compareWithLastMonth(fname, maxDiff): @@ -162,31 +214,31 @@ # # if line.startswith("#"): # continue # if not foundHead: # raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing") # fields = line.split("\t") # allToVar[fields[2]]=fields[0] # return allToVar def parseHgvsFile(hgvsFname): " return dict: allele Id -> (hgvs coding, hgvs protein) " logging.info("Parsing %s" % hgvsFname) expHead = "#Symbol GeneID VariationID AlleleID Type Assembly NucleotideExpression NucleotideChange ProteinExpression ProteinChange UsedForNaming Submitted OnRefSeqGene" allToHgvs = {} foundHead = False - for line in gzip.open(hgvsFname): + for line in gzip.open(hgvsFname, "rt"): line = line.rstrip("\n") if line==expHead: foundHead = True if line.startswith("#Symbol") and "GeneID" in line: if not line == expHead: print ("Header of hgvs file has changed again. Code needs fixing") print(("found:",line)) print(("expected:",expHead)) else: foundHead = True if line.startswith("#"): continue if not foundHead: raise Exception("Invalid Header. Code needs fixing, again.") @@ -276,35 +328,35 @@ url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz" outFname = "archive/variant_summary.%s.txt.gz" % today filename = outFname # get the new variant allele file varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz" varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today) # get the new hgvs allele file hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz" hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today) if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)): logging.info("Downlading %s to %s" % (url, outFname)) - open(outFname, "w").write(urllib.request.urlopen(url).read()) + open(outFname, "wb").write(urllib.request.urlopen(url).read()) logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) - open(hgvsFname, "w").write(urllib.request.urlopen(hgvsUrl).read()) + open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) logging.info("Downlading %s to %s" % (varUrl, varFname)) - open(varFname, "w").write(urllib.request.urlopen(varUrl).read()) + open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) return filename, varFname, hgvsFname # ----------- MAIN -------------- if args==[] and not options.auto: parser.print_help() exit(1) outSuffix = "" if options.auto: filename, varFName, hgvsFname = downloadFromNcbi(options.skipDownload) else: filename = args[0] varFname = args[1] hgvsFname = args[2] @@ -315,31 +367,31 @@ # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445) # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106) # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*') hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)') # e.g. # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA # c.80A>T # c.740+356C>T # c.3839_3855del17 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$') -ifh = gzip.open(filename) +ifh = gzip.open(filename, "rt") # check the header line1 = ifh.readline() if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again logging.error("ClinVar has changed their header line again") logging.error("current header line: %s" % line1.replace("\t","|")) logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|")) raise Exception("code needs fixing") # open out files hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w") hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w") hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w") hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w") @@ -362,31 +414,30 @@ # in allToVar if varId=="": logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc)) hgvsCod, hgvsProt = allToHgvs.get(alleleId, ("", "")) if chrom=="" or assembly=="" or assembly=="NCBI36": noAssCount += 1 continue if chrom=="Un" and assembly=="GRCh38": print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc)) continue chrom = "chr"+chrom - # Code-review/QA: Is it OK to pass through coordinates on chrM for hg19? if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ? if assembly=="GRCh37": chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 else: chrom = "chrM" shortName, longName = shortenName(name) if len(shortName)>20: shortName = shortName[:17]+"..." longCount+=1 if len(longName)>60: longName = longName[:60]+"..." @@ -434,63 +485,66 @@ isPat, isBen = False, False if "pathogenic" in clinSign.lower(): isPat = True if "benign" in clinSign.lower(): isBen = True if (isPat==True and isBen==True) or (isPat==False and isBen==False): itemRgb = "128,128,128" elif isPat==True: itemRgb = "210,0,0" elif isBen==True: itemRgb = "0,210,0" else: assert(False)# should never happen + pathoCode = clinSignToPathoCode(clinSign) + varLen = int(end)-int(start) + geneStr = geneSymbol if geneId.isdigit() and geneId!="-1": geneStr = geneId+"|"+geneSymbol if len(geneStr)>250: geneStr = "not shown, too many genes" if name=="": name = "No display name was provided by ClinVar for this variant" shortName = "NoName" if len(name)>90: name = name[:90]+"..." name = varId+"|"+name if len(hgvsProt) > 90: hgvsProt = hgvsProt[:90]+"..." if len(hgvsCod) > 90: hgvsCod = hgvsCod[:90]+"..." otherIds = accListToHtml(otherIds) phenotypeIds = accListToHtml(phenotypeIds) starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus) - mouseOverParts = [longName, clinSign, asciiStars] + mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"] mouseOver = ", ".join(mouseOverParts) row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \ blockCount, blockSizes, blockStarts, name, clinSign, starRatingHtml, allType, geneStr, snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic, - hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver] + hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)] # replace clinvar's placeholders with real empty fields newRow = [] for x in row: if x in ["-1", "-"]: newRow.append("") else: newRow.append(x) row = newRow if assembly=="GRCh37": ofh = hg19Bed if isCnv: ofh = hg19BedCnv elif assembly=="GRCh38":