a85f07414441c0ad4d8310620fe5c40a33f2f5fd chmalee Tue Apr 21 11:12:07 2020 -0700 Changing lovd and clinvar otto scripts, structural variant cutoff is now >= 50bp, which brings gnomad, dbVar, clinvar, and lovd into harmony diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed index 9586f99..c3b479d 100755 --- src/hg/utils/otto/clinvar/clinVarToBed +++ src/hg/utils/otto/clinvar/clinVarToBed @@ -435,31 +435,31 @@ if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)): logging.info("Downlading %s to %s" % (url, outFname)) open(outFname, "wb").write(urllib.request.urlopen(url).read()) logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname)) open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read()) logging.info("Downlading %s to %s" % (varUrl, varFname)) open(varFname, "wb").write(urllib.request.urlopen(varUrl).read()) return filename, varFname, hgvsFname, vcfFname def parseVcf(vcfFname): " parse VCF and return as dict alleleId -> (molConseq) " logging.info("Parsing %s" % vcfFname) ret = {} - for line in gzip.open(vcfFname, "rt"): + for line in gzip.open(vcfFname, "rt", encoding="utf_8"): # #CHROM POS ID REF ALT QUAL FILTER INFO #1 930248 789256 G A . . ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS= if line.startswith("#"): continue chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t") infoParts = info.split(";") alleleId = None mc = None for info in infoParts: key, val = info.split("=") if key=="ALLELEID": alleleId = val elif key=="MC": mc = val.split("|")[1].split(",")[0].replace("_", " ") @@ -594,31 +594,32 @@ # reorder columns for bigBed start = str(int(start)-1) # convert to zero-based, half-open score = "0" strand = "." thickStart = start thickEnd = end itemRgb = "0" blockCount = "1" blockSizes = str(int(end)-int(start)) blockStarts = "0" # used until Oct 2017 # if dbVarAcc.startswith("nsv") or "copy number" in allType: # now using just size on genome, see #20268 - if (int(end)-int(start))>100: + # shortening to >= 50bp to bring into line with LOVD and gnomAD + if (int(end)-int(start))>=50: isCnv = True else: isCnv = False pathoCode = clinSignToCode(clinSign) originCode = originSimpleToCode(originSimple) allTypeCode = allTypeToCode(allType) if isCnv: if "gain" in allType or "duplication" in allType or "insertion" in allType: itemRgb = "0,0,255" if "deletion" in allType or "loss" in allType: itemRgb = "255,0,0" else: if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic