src/hg/utils/otto/clinvar/clinVarToBed 410ca755b501b9c84b618ef8e49b9208cff3d145

410ca755b501b9c84b618ef8e49b9208cff3d145
max
  Fri Jan 31 05:40:19 2020 -0800
oops, previous commit was into wrong directory

diff --git src/hg/utils/otto/clinvar/clinVarToBed src/hg/utils/otto/clinvar/clinVarToBed
index 891b086..ee04313 100755
--- src/hg/utils/otto/clinvar/clinVarToBed
+++ src/hg/utils/otto/clinvar/clinVarToBed
@@ -82,30 +82,82 @@
             shortName = "dup"
         else:
             posMatch = posRe.match(name)
             if posMatch!=None:
                 pos, dupDelIns, change = posMatch.groups()
                 if dupDelIns==None:
                     dupDelIns= ""
 
                 if change==None or change=="":
                     shortName = pos+dupDelIns
                 else:
                     shortName = dupDelIns+change
 
     return shortName, longName
 
+def clinSignToPathoCode(pathoStr):
+    """ there are hundreds of possible pathogenicity descriptions. flatten into one of seven codes:
+    - benign = BN
+    likely benign = LB
+    conflicting = CF
+    likely pathogenic= LP
+    pathogenic = PG
+    other = OT
+    uncertain = UC
+
+    examples:
+
+    Benign/Likely benign,other      13      0.002%
+    Conflicting interpretations of pathogenicity,risk factor        13      0.002%
+    Likely benign,other     18      0.003%  
+    Pathogenic,drug response        28      0.005%
+    protective      32      0.005%
+    Pathogenic,risk factor  33      0.005%
+    Pathogenic,other        102     0.017%
+    Affects 120     0.019%
+    association     202     0.033%
+    drug response   387     0.063%
+    risk factor     444     0.072%
+    no interpretation for the single variant        564     0.092%
+    other   1687    0.274%
+    Pathogenic/Likely pathogenic    5209    0.846%
+    not provided    8860    1.440%
+    Benign/Likely benign    19489   3.167%
+    Likely pathogenic       30200   4.907%
+    Conflicting interpretations of pathogenicity    30910   5.023%
+    Pathogenic      66679   10.835%
+    Benign  77551   12.602%
+    Likely benign   154870  25.166%
+    Uncertain significance  217838  35.399%
+
+    """
+    pathoStr = pathoStr.lower()
+    # don't change the order of these unless you tested it. The order is crucial.
+    if "conflicting" in pathoStr:
+        return "CF"
+    if "likely pathogenic" in pathoStr:
+        return "LP"
+    if "likely benign" in pathoStr:
+        return "LB"
+    if "uncertain" in pathoStr:
+        return "UC"
+    if "pathogenic" in pathoStr:
+        return "PG"
+    if "benign" in pathoStr:
+        return "BN"
+    return "OT"
+
 def lastMonth(d,x=1):
     """ returns same day as this month, but last month, e.g. for 2013-01-01 
     return 2012-12-01 
     """
 
     days_of_month = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
     newmonth = ((( d.month - 1) - x ) % 12 ) + 1
     newyear  = d.year + ((( d.month - 1) - x ) // 12 ) 
     if d.day > days_of_month[newmonth-1]:
       newday = days_of_month[newmonth-1]
     else:
       newday = d.day
     return datetime( newyear, newmonth, newday)
 
 def compareWithLastMonth(fname, maxDiff):
@@ -162,31 +214,31 @@
 #
 #        if line.startswith("#"):
 #            continue
 #        if not foundHead:
 #            raise Exception("Header of variation_allele.txt.gz has changed again. Code needs fixing")
 #        fields = line.split("\t")
 #        allToVar[fields[2]]=fields[0]
 #    return allToVar
 
 def parseHgvsFile(hgvsFname):
     " return dict: allele Id -> (hgvs coding, hgvs protein) "
     logging.info("Parsing %s" % hgvsFname)
     expHead	= "#Symbol	GeneID	VariationID	AlleleID	Type	Assembly	NucleotideExpression	NucleotideChange	ProteinExpression	ProteinChange	UsedForNaming	Submitted	OnRefSeqGene"
     allToHgvs = {}
     foundHead = False
-    for line in gzip.open(hgvsFname):
+    for line in gzip.open(hgvsFname, "rt"):
         line = line.rstrip("\n")
         if line==expHead:
             foundHead = True
         if line.startswith("#Symbol") and "GeneID" in line:
             if not line == expHead:
                 print ("Header of hgvs file has changed again. Code needs fixing")
                 print(("found:",line))
                 print(("expected:",expHead))
             else:
                 foundHead = True
 
         if line.startswith("#"):
             continue
         if not foundHead:
             raise Exception("Invalid Header. Code needs fixing, again.")
@@ -276,35 +328,35 @@
 
     url = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
     outFname = "archive/variant_summary.%s.txt.gz" % today
     filename = outFname
 
     # get the new variant allele file
     varUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variation_allele.txt.gz"
     varFname = join(dirname(filename), "variation_allele.%s.txt.gz" % today)
 
     # get the new hgvs allele file
     hgvsUrl = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/hgvs4variation.txt.gz"
     hgvsFname = join(dirname(filename), "hgvs4variation.%s.txt.gz" % today)
 
     if not (skipDownload and isfile(outFname) and isfile(varFname) and isfile(hgvsFname)):
         logging.info("Downlading %s to %s" % (url, outFname))
-        open(outFname, "w").write(urllib.request.urlopen(url).read())
+        open(outFname, "wb").write(urllib.request.urlopen(url).read())
         logging.info("Downlading %s to %s" % (hgvsUrl, hgvsFname))
-        open(hgvsFname, "w").write(urllib.request.urlopen(hgvsUrl).read())
+        open(hgvsFname, "wb").write(urllib.request.urlopen(hgvsUrl).read())
         logging.info("Downlading %s to %s" % (varUrl, varFname))
-        open(varFname, "w").write(urllib.request.urlopen(varUrl).read())
+        open(varFname, "wb").write(urllib.request.urlopen(varUrl).read())
     return filename, varFname, hgvsFname
 
 # ----------- MAIN --------------
 if args==[] and not options.auto:
     parser.print_help()
     exit(1)
 
 outSuffix = ""
 
 if options.auto:
     filename, varFName, hgvsFname = downloadFromNcbi(options.skipDownload)
 else:
     filename = args[0]
     varFname = args[1]
     hgvsFname = args[2]
@@ -315,31 +367,31 @@
 # e.g. GRCh38/hg38 1p36.33-36.31(chr1:1181847-5507243)x1
 # GRCh38/hg38 9p24.3-q34.3(chr9:204193-138179445)
 # GRCh37/hg19 21q22.13-22.2(chr21:38741104..40274106)
 # GRCh37/hg19 15q14-15.1 chr15:34638237..42057083 complex variant
 cnvRe = re.compile(r'GRCh[0-9]*/hg[0-9]* ([XY0-9pq.-]+) ?\(?[XYa-z_:0-9.-]+?\)?(x[0-9]+)?.*')
 
 hgvsRe = re.compile(r'(N[RPCGM]_[0-9.]+)(\([a-zA-Z0-9_-]+\))?:([+0-9c.ACTG>a-z_()-]+)')
 
 # e.g.
 # c.80_83delGGATinsTGCTGTAAACTGTAACTGTAAA
 # c.80A>T
 # c.740+356C>T
 # c.3839_3855del17
 posRe = re.compile(r'^([mgcpdrn]\.[*0-9_?+-]+)(delins|dup|del|ins|inv)?([>~ACTG0-9]*)$')
 
-ifh = gzip.open(filename)
+ifh = gzip.open(filename, "rt")
 
 # check the header
 line1 = ifh.readline()
 if (line1 != clinvarExpectHeaders): # test if clinvar has changed their header line again
     logging.error("ClinVar has changed their header line again")
     logging.error("current header line: %s" % line1.replace("\t","|"))
     logging.error("expected header line: %s" % clinvarExpectHeaders.replace("\t","|"))
     raise Exception("code needs fixing")
 
 # open out files
 hg19Bed = open("archive/clinvarMain.hg19.%sbed" % outSuffix, "w")
 hg38Bed = open("archive/clinvarMain.hg38.%sbed" % outSuffix, "w")
 hg19BedCnv = open("archive/clinvarCnv.hg19.%sbed" % outSuffix, "w")
 hg38BedCnv = open("archive/clinvarCnv.hg38.%sbed" % outSuffix, "w")
 
@@ -362,31 +414,30 @@
     # in allToVar
     if varId=="":
         logging.warn("empty variantID for alleleId %s, %s" % (alleleId, irvcAcc))
 
     hgvsCod, hgvsProt =  allToHgvs.get(alleleId, ("", ""))
 
     if chrom=="" or assembly=="" or assembly=="NCBI36":
         noAssCount += 1
         continue
 
     if chrom=="Un" and assembly=="GRCh38":
         print(("wrong chrUn chrom on assembly hg38. Skipping %s" % irvcAcc))
         continue
     chrom = "chr"+chrom
 
-    # Code-review/QA: Is it OK to pass through coordinates on chrM for hg19?
     if chrom=="chrMT": # why does NCBI use chrMT but we use chrM ?
         if assembly=="GRCh37":
             chrom = "chrMT" # our chrM is different from NCBI's MT, but chrMT got added hg19 in 2020 
         else:
             chrom = "chrM"
 
     shortName, longName = shortenName(name)
 
     if len(shortName)>20:
         shortName = shortName[:17]+"..."
         longCount+=1
 
     if len(longName)>60:
         longName = longName[:60]+"..."
 
@@ -434,63 +485,66 @@
         isPat, isBen = False, False
         if "pathogenic" in clinSign.lower():
             isPat = True
         if "benign" in clinSign.lower():
             isBen = True
 
         if (isPat==True and isBen==True) or (isPat==False and isBen==False):
             itemRgb = "128,128,128"
         elif isPat==True:
             itemRgb = "210,0,0"
         elif isBen==True:
             itemRgb = "0,210,0"
         else:
             assert(False)# should never happen
 
+    pathoCode = clinSignToPathoCode(clinSign)
+    varLen = int(end)-int(start)
+
     geneStr = geneSymbol
     if geneId.isdigit() and geneId!="-1":
         geneStr = geneId+"|"+geneSymbol
     if len(geneStr)>250:
         geneStr = "not shown, too many genes"
 
     if name=="":
         name = "No display name was provided by ClinVar for this variant"
         shortName = "NoName"
 
     if len(name)>90:
         name = name[:90]+"..."
     name = varId+"|"+name
 
     if len(hgvsProt) > 90:
         hgvsProt = hgvsProt[:90]+"..."
     if len(hgvsCod) > 90:
         hgvsCod = hgvsCod[:90]+"..."
 
     otherIds = accListToHtml(otherIds)
 
     phenotypeIds = accListToHtml(phenotypeIds)
 
     starRatingHtml, asciiStars, starCount = reviewStatusToHtmlStars(reviewStatus)
 
-    mouseOverParts = [longName, clinSign, asciiStars]
+    mouseOverParts = [longName, clinSign, asciiStars, "Phenotypes: "+phenotypeList, "Origin: "+origin, numberSubmitters+" submitters"]
     mouseOver = ", ".join(mouseOverParts)
 
     row = [chrom, start, end, shortName, str(starCount), strand, thickStart, thickEnd, itemRgb, \
         blockCount, blockSizes, blockStarts,
         name, clinSign, starRatingHtml, allType, geneStr,
         snpAcc, dbVarAcc, irvcAcc, inGtr, phenotypeList, phenotypeIds, origin, assembly, cytogenetic,
-        hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver]
+        hgvsCod, hgvsProt, numberSubmitters, lastEval, guidelines, otherIds, mouseOver, pathoCode, str(varLen)]
 
     # replace clinvar's placeholders with real empty fields
     newRow = []
     for x in row:
         if x in ["-1", "-"]:
             newRow.append("")
         else:
              newRow.append(x)
     row = newRow
 
     if assembly=="GRCh37":
         ofh = hg19Bed
         if isCnv:
             ofh = hg19BedCnv
     elif assembly=="GRCh38":