a85f07414441c0ad4d8310620fe5c40a33f2f5fd
chmalee
  Tue Apr 21 11:12:07 2020 -0700
Changing lovd and clinvar otto scripts, structural variant cutoff is now >= 50bp, which brings gnomad, dbVar, clinvar, and lovd into harmony

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 9586f99..c3b479d 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -435,31 +435,31 @@
 
     if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
         open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
         open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
         open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
 
     return filename, varFname, hgvsFname, vcfFname
 
 def parseVcf(vcfFname):
     " parse VCF and return as dict alleleId -> (molConseq) "
     logging.info("Parsing %s" % vcfFname)
     ret = {}
-    for line in gzip.open(vcfFname, "rt"):
+    for line in gzip.open(vcfFname, "rt", encoding="utf_8"):
         # #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
         #1       930248  789256  G       A       .       .       ALLELEID=707587;CLNDISDB=MedGen:CN517202;CLNDN=not_provided;CLNHGVS=
         if line.startswith("#"):
             continue
         chrom, start, end, ref, alt, qual, filter, info = line.rstrip("\n").split("\t")
         infoParts = info.split(";")
 
         alleleId = None
         mc = None
         for info in infoParts:
             key, val = info.split("=")
             if key=="ALLELEID":
                 alleleId = val
             elif key=="MC":
                 mc = val.split("|")[1].split(",")[0].replace("_", " ")
@@ -594,31 +594,32 @@
 
     # reorder columns for bigBed
     start = str(int(start)-1) # convert to zero-based, half-open
     score = "0"
     strand = "."
     thickStart = start
     thickEnd = end
     itemRgb = "0"
     blockCount = "1"
     blockSizes = str(int(end)-int(start))
     blockStarts = "0"
 
     # used until Oct 2017
     # if dbVarAcc.startswith("nsv") or "copy number" in allType:
     # now using just size on genome, see #20268
-    if (int(end)-int(start))>100:
+    # shortening to >= 50bp to bring into line with LOVD and gnomAD
+    if (int(end)-int(start))>=50:
         isCnv = True
     else:
         isCnv = False
 
     pathoCode = clinSignToCode(clinSign)
     originCode = originSimpleToCode(originSimple)
     allTypeCode = allTypeToCode(allType)
 
     if isCnv:
         if "gain" in allType or "duplication" in allType or "insertion" in allType:
             itemRgb = "0,0,255"
         if "deletion" in allType or "loss" in allType:
             itemRgb = "255,0,0"
     else:
         if pathoCode in ["PG", "LP"]: # pathogenic or likely pathogenic