4bd3913b145415c4d24d5c6b2fa9ee5b966257c5
chmalee
  Fri Jul 31 11:44:52 2020 -0700
More clinGen dosage changes after extrernal feedback, refs #24818

diff --git src/hg/utils/otto/clinGen/processClinGenDosage.py src/hg/utils/otto/clinGen/processClinGenDosage.py
index acd48c7..c40a161 100755
--- src/hg/utils/otto/clinGen/processClinGenDosage.py
+++ src/hg/utils/otto/clinGen/processClinGenDosage.py
@@ -1,180 +1,217 @@
 #!/usr/bin/env python3
 
 """
-Read GVF lines from stdin and write bed9+ lines to stdout
+Turn ClinGen gene_curation_list and region_curation_list into
+two bed files, one for haploinsufficient and one for triplosensitive
 """
 
-import os,sys,re
+import os,sys
 import argparse
 
-bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart",
-    "thickEnd", "itemRgb"]
+bed8Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart",
+    "thickEnd"]
 
 # we only use some of these fields depending on whether we are working with haplo or triplo scores
-extraFieldList = ["Gene Symbol", "Gene ID", "cytoBand", "Genomic Location",
-    "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1",
-    "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score",
-    "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2",
-    "Triplosensitivity PMID3", "Date Last Evaluated", "Loss phenotype OMIM ID",
-    "Triplosensitive phenotype OMIM ID"]
+commonFields = ["cytoBand", "Genomic Location", "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score", "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", "Triplosensitivity PMID3", "Date Last Evaluated", "OMIM ID", "OMIM Description"]
+haploFields = ["Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", "Haploinsufficiency PMID2", "Haploinsufficiency PMID3"]
+triploFields = ["Triplosensitivity Score", "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", "Triplosensitivity PMID3"]
+geneListFields = ["Gene Symbol", "Gene ID"]
+regionListFields = ["ISCA ID", "ISCA Region Name"]
+
+# the gene and region files get merged:
+combinedFields = ["Gene Symbol/ISCA ID", "Gene ID/ISCA Region Name"]
+
+clinGenGeneUrl = "https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen/clingen_gene.cgi?sym="
+clinGenIscaUrl = "https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen/clingen_region.cgi?id="
 
 scoreExplanation = {
     -1: "not yet evaluated",
     0: "no evidence for dosage pathogenicity",
     1: "little evidence for dosage pathogenicity",
     2: "some evidence for dosage pathogenicity",
     3: "sufficient evidence for dosage pathogenicity",
     30: "gene associated with autosomal recessive phenotype",
     40: "haploinsufficiency unlikely"
     }
 
-# hold all the bedlines in memory until the end as different lines have 
-# different numbers of extra fields
-bedLines = {}
-
-# the phenotypes and other stuff associated with a particular gene
-extraInfo = {}
+# the input data keyed by "Gene Symbol" or "ISCA ID", and further keyed by each field name
+inData = {}
 
 def setupCommandLine():
-    parser = argparse.ArgumentParser(description="Read bed lines from infile, join with extraFile and transform to bed9+",
+    parser = argparse.ArgumentParser(description="Transform ClinGen region and gene lists into a bed file",
         add_help=True, usage = "%(prog)s [options]")
-    parser.add_argument("infile", action="store", default=None, help="Input bed file from which to read input, use 'stdin' to read from default stdin")
-    parser.add_argument("extraFile", action="store", default=None, help="a ClinGen_gene_curation_list file with extra info for the records in the infile")
-    parser.add_argument("dosageType", action="store", default=None, help="haplo or triplo, which extraFields to use for this bed record")
+    parser.add_argument("regionFile", action="store", default=None, help="a ClinGen_region_curation_list file")
+    parser.add_argument("geneFile", action="store", default=None, help="a ClinGen_gene_curation_list file")
+    parser.add_argument("outFileBase", action="store", default=None, help="the base file name for the two output files: outFileBase.haplo.bed and outFileBase.triplo.bed will be created.")
     args = parser.parse_args()
-    if args.infile == "stdin" and args.extraFile == "stdin":
-        print("Only one of infile or extraFile can be stdin")
+    if args.regionFile == "stdin" and args.geneFile == "stdin":
+        print("Only one of regionFile or geneFile can be stdin")
         sys.exit(1)
     return args
 
-def parseExtraFile(fname, dosageType):
-    global extraInfo
-    oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss"
-    fieldMatch = re.compile(oppMatchStr,re.I)
-    with open(fname) as f:
-        for line in f:
-            if line.startswith("#"):
-                continue
-            l = line.strip("\n").split("\t")
-            geneSymbol = l[0]
-            extra = l[1:]
-            try:
-                if extraInfo[geneSymbol]:   
-                    print("duplicate info for gene symbol: '%s'" % (geneSymbol))
-                    sys.exit(1)
-            except KeyError:
-                extraInfo[geneSymbol] = {}
-                for keyName,ix in zip(extraFieldList[1:],range(len(extraFieldList[1:]))):
+def getColor(bed):
+    """Return the shade of the item according to ClinGen rules. Red or blue for score 3, everything
+        else gray."""
+    retColors = {"haplo": "128,128,128", "triplo": "128,128,128"}
+    if bed["Haploinsufficiency Score"] == "3": retColors["haplo"] = "255,0,0"
+    if bed["Triplosensitivity Score"] == "3": retColors["triplo"] = "0,255,0"
+    return retColors
+
+def getMouseover(bed):
+    """Return the mouseOver strings for this bed record."""
+    hapScore = bed["Haploinsufficiency Score"]
+    tripScore = bed["Triplosensitivity Score"]
+    ret = {"haplo": "", "triplo": ""}
+    mouseOver = "Gene/ISCA ID: %s" % bed["name"]
+    ret["triplo"] = mouseOver + ", Triplosensitivity: "
+    ret["triplo"] += "%s - %s" % (tripScore, scoreExplanation[int(tripScore)] if tripScore and tripScore != "Not yet evaluated" else -1)
+    ret["haplo"] = mouseOver + ", Haploinsufficiency: "
+    ret["haplo"] += "%s - %s" % (hapScore, scoreExplanation[int(hapScore)] if hapScore and hapScore != "Not yet evaluated" else -1)
+    return ret
+
+def parsePosition(posStr):
+    """Turn a chr:start-stop string into bed chrom, chromStart, chromEnd"""
     try:
-                        if fieldMatch.match(keyName):
-                            continue
-                        extraInfo[geneSymbol][keyName] = extra[ix]
+        colonDelim = posStr.split(":")
+        chrom = colonDelim[0]
+        dashDelim = colonDelim[1].split("-")
+        start = int(dashDelim[0]) - 1
+        end = int(dashDelim[1])
     except IndexError:
+        sys.stderr.write("bad position string: %s\n" % posStr)
         sys.exit(1)
+    return (chrom, start, end)
 
-def getColor(bed, dosageType):
-    """Return the shade of the item according to it's variant type."""
-    if dosageType == "haplo":
-        if bed["score"] == -1: return "218,215,215"
-        if bed["score"] == 0: return "130,128,128"
-        if bed["score"] == 1: return "232,104,111"
-        if bed["score"] == 2: return "218,44,55"
-        if bed["score"] == 3: return  "180,3,16"
-        if bed["score"] == 30: return "137,93,189"
-        if bed["score"] == 40: return "238,146,148"
-    if dosageType == "triplo":
-        if bed["score"] == -1: return "218,215,215"
-        if bed["score"] == 0: return "130,128,128"
-        if bed["score"] == 1: return "88,131,211"
-        if bed["score"] == 2: return "41,78,174"
-        if bed["score"] == 3: return "17,44,138"
-        if bed["score"] == 30: return "137,93,189"
-        if bed["score"] == 40: return "122,165,211"
-
-def getMouseover(bed, dosageType):
-    """Return the mouseOver string for this bed record."""
-    ret = "Gene: %s" % bed["name"]
-    if dosageType == "triplo":
-        ret += ", Triplosensitivity: "
+def makeBedLines():
+    """Turn inData into bed 9+"""
+    bedList = []
+    for key in inData:
+        data = inData[key]
+        bed = {}
+        chrom, start, end = parsePosition(data["Genomic Location"])
+        bed["chrom"] = chrom
+        bed["chromStart"] = start
+        bed["chromEnd"] = end
+        bed["name"] = key
+        bed["score"] = 0
+        bed["strand"] = "."
+        bed["thickStart"] = bed["chromStart"]
+        bed["thickEnd"] = bed["chromEnd"]
+        bed["Size"] = bed["chromEnd"] - bed["chromStart"]
+        bed["Gene Symbol/ISCA ID"] = key
+        extra = inData[key]
+        bed.update(extra)
+        bed["itemRgb"] = getColor(bed) # dict of colors
+        bed["_mouseOver"] = getMouseover(bed) # dict of mouseovers
+        bedList.append(bed)
+    return bedList
+
+def dumpBedLines(ofBase):
+    """For each line in makeBedLines(), write to two files: ofBase.haplo.bed and ofBase.triplo.bed"""
+    bedLines = makeBedLines()
+    combined = combinedFields + commonFields
+    extraFields = ["clinGen URL"] + [x for x in combined if x != "Genomic Location"]
+
+    # some fields differ depending on which file we are writing to, so remove them:
+    for field in haploFields + triploFields:
+        extraFields.remove(field)
+
+    if not ofBase.endswith("."):
+        ofBase += "."
+    haploOfh = ofBase + "haplo.bed"
+    triploOfh = ofBase + "triplo.bed"
+    haploFile = open(haploOfh, "w")
+    triploFile = open(triploOfh, "w")
+    print("#%s" % ("\t".join(bed8Fields + ["itemRgb"] + extraFields + haploFields + ["_mouseOver"])), file=haploFile)
+    print("#%s" % ("\t".join(bed8Fields + ["itemRgb"] + extraFields + triploFields + ["_mouseOver"])), file=triploFile)
+    for bed in bedLines:
+        finalBed = []
+        haploBed = []
+        triploBed = []
+
+        # first the standard bed fields
+        for field in bed8Fields:
+            if type(bed[field]) is list:
+                haploBed.append(bed[field])
+                triploBed.append(", ".join(bed[field]))
             else:
-        ret += ", Haplosensitivity: "
-    ret += "%s - %s" % (bed["score"], scoreExplanation[bed["score"]])
-    return ret
+                haploBed.append(str(bed[field]))
+                triploBed.append(str(bed[field]))
 
-def dumpBedLines(dosageType):
-    """Write out the bed lines, with the same number of fields in each line."""
-    oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss"
-    extraFields = [x for x in extraFieldList if not re.match(oppMatchStr, x, re.I) and x != "Gene Symbol" and x != "Genomic Location"]
-    fields = bed9Fields + extraFields+ ["_mouseOver"]
-    print("#%s" % ("\t".join(fields)))
-    for b in bedLines:
-        bed = bedLines[b]
-        finalBed = []
-        for field in fields:
+        # then the common extra fields:
+        haploBed.append(str(bed["itemRgb"]["haplo"]))
+        triploBed.append(str(bed["itemRgb"]["triplo"]))
+        for field in extraFields:
             try:
+                if field == "Gene ID/ISCA Region Name":
+                    if "Gene ID" in bed:
+                        haploBed.append(str(bed["Gene ID"]))
+                        triploBed.append(str(bed["Gene ID"]))
+                    else:
+                        haploBed.append(str(bed["ISCA Region Name"]))
+                        triploBed.append(str(bed["ISCA Region Name"]))
+                    continue
                 if type(bed[field]) is list:
-                    finalBed.append(", ".join(bed[field]))
+                    haploBed.append(", ".join(bed[field]))
+                    triploBed.append(", ".join(bed[field]))
                 else:
-                    finalBed.append(str(bed[field]))
-            except KeyError: # some of the extra fields won't exist for every record
-                finalBed.append("")
-        print("\t".join(finalBed))
+                    haploBed.append(str(bed[field]))
+                    triploBed.append(str(bed[field]))
+            except KeyError:
+                sys.stderr.write("ill formed bed:\n%s\n" % (bed))
+                sys.exit(1)
 
-def makeBedLine(chrom, chromStart, chromEnd, name, score, dosageType):
-    """Turn a single gvf line into a bed 9+ line."""
-    bed = {}
-    bed["chrom"] = chrom
-    bed["chromStart"] = chromStart
-    bed["chromEnd"] = chromEnd
-    bed["name"] = name
-    bed["score"] = score
-    bed["strand"] = "."
-    bed["thickStart"] = chromStart
-    bed["thickEnd"] = chromEnd
-    bed["itemRgb"] = getColor(bed, dosageType)
-    bed["Size"] = chromEnd - chromStart
-    extra = extraInfo[name]
-    bed.update(extra)
-    bed["_mouseOver"] = getMouseover(bed,dosageType)
-    return bed
-
-def processClinGenDosage(inf, dosageType):
-    global bedLines
-    lineCount = 1
-    for line in inf:
-        if line.startswith('#') or line.startswith('track') or line.startswith('browser'):
-            lineCount += 1
-            continue
-        trimmed = line.strip()
-        try:
-            chrom, chromStart, chromEnd, name, score = trimmed.split("\t")
-        except ValueError:
-            sys.stderr.write("Error: ignoring ill formatted bed line %s:%d\n" % (inf.name, lineCount))
-            lineCount += 1
+        # then the differing extra fields:
+        haploBed += [bed[field] for field in haploFields] + [bed["_mouseOver"]["haplo"]]
+        triploBed += [bed[field] for field in triploFields] + [bed ["_mouseOver"]["triplo"]]
+        print("\t".join(haploBed), file=haploFile)
+        print("\t".join(triploBed), file=triploFile)
+    haploFile.close()
+    triploFile.close()
+
+def parseInFile(fname, fileType="gene"):
+    if fname != "stdin":
+        f = open(fname)
+    lineNumber = 1
+    for line in f:
+        lineNumber += 1
+        bizarreRecord = False
+        if line.startswith("#"):
             continue
-        try:
-            bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, int(score), dosageType)
-        except:
-            # error here comes from something to do with the associated gene_curation_list and not
-            # the dosage file itself
-            if score.startswith("Not"):
-                bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, -1, dosageType)
+        l = line.strip("\n").split("\t")
+        key = l[0] # geneSymbol or ISCA id
+        extra = l[1:]
+        if key in inData:   
+            print("duplicate info for gene symbol: '%s'" % (geneSymbol))
+            sys.exit(1)
         else:
-                print(sys.exc_info()[0])
-                sys.stderr.write("bad input line:\n%s\n" % line)
+            temp = {}
+            fileFields = []
+            if fileType == "gene":
+                fileFields = geneListFields + commonFields
+            else:
+                fileFields = regionListFields + commonFields
+            for keyName,ix in zip(fileFields[1:],range(len(fileFields[1:]))):
+                try:
+                    if keyName == "Genomic Location" and extra[ix] == "tbd":
+                        bizarreRecord = True
+                    temp[keyName] = extra[ix]
+                except IndexError:
+                    sys.stderr.write("error line %d or file: %s\n" % (lineNumber, fname))
                     sys.exit(1)
-        lineCount += 1
-    dumpBedLines(dosageType)
+            if not bizarreRecord:
+                if fileType == "gene":
+                    temp["clinGen URL"] = clinGenGeneUrl + key
+                else:
+                    temp["clinGen URL"] = clinGenIscaUrl + key
+                inData[key] = temp
+    f.close()
 
 def main():
     args = setupCommandLine()
-    if args.extraFile:
-        parseExtraFile(args.extraFile, args.dosageType)
-    if args.infile == "stdin":
-        processClinGenDosage(sys.stdin,args.dosageType)
-    else:
-        with open(args.infile) as inf:
-            processClinGenDosage(inf, args.dosageType)
+    parseInFile(args.geneFile, "gene")
+    parseInFile(args.regionFile, "region")
+    dumpBedLines(args.outFileBase)
 
 if __name__=="__main__":
     main()