37befdf6676de0022c60966291e21e5977dba086
chmalee
  Fri Jun 19 13:46:31 2020 -0700
Adding clingen cnv and dosage otto job to source tree now that it's been running for a while, refs #25562

diff --git src/hg/utils/otto/clinGen/processClinGenDosage.py src/hg/utils/otto/clinGen/processClinGenDosage.py
new file mode 100755
index 0000000..feb63e5
--- /dev/null
+++ src/hg/utils/otto/clinGen/processClinGenDosage.py
@@ -0,0 +1,170 @@
+#!/usr/bin/env python3
+
+"""
+Read GVF lines from stdin and write bed9+ lines to stdout
+"""
+
+import os,sys,re
+import argparse
+
+bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart",
+    "thickEnd", "itemRgb"]
+
+# we only use some of these fields depending on whether we are working with haplo or triplo scores
+extraFieldList = ["Gene Symbol", "Gene ID", "cytoBand", "Genomic Location",
+    "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1",
+    "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score",
+    "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2",
+    "Triplosensitivity PMID3", "Date Last Evaluated", "Loss phenotype OMIM ID",
+    "Triplosensitive phenotype OMIM ID"]
+
+scoreExplanation = {
+    -1: "not yet evaluated",
+    0: "no evidence for dosage pathogenicity",
+    1: "little evidence for dosage pathogenicity",
+    2: "some evidence for dosage pathogenicity",
+    3: "sufficient evidence for dosage pathogenicity",
+    30: "gene associated with autosomal recessive phenotype",
+    40: "haploinsufficiency unlikely"
+    }
+
+# hold all the bedlines in memory until the end as different lines have 
+# different numbers of extra fields
+bedLines = {}
+
+# the phenotypes and other stuff associated with a particular gene
+extraInfo = {}
+
+def setupCommandLine():
+    parser = argparse.ArgumentParser(description="Read bed lines from infile, join with extraFile and transform to bed9+",
+        add_help=True, usage = "%(prog)s [options]")
+    parser.add_argument("infile", action="store", default=None, help="Input bed file from which to read input, use 'stdin' to read from default stdin")
+    parser.add_argument("extraFile", action="store", default=None, help="a ClinGen_gene_curation_list file with extra info for the records in the infile")
+    parser.add_argument("dosageType", action="store", default=None, help="haplo or triplo, which extraFields to use for this bed record")
+    args = parser.parse_args()
+    if args.infile == "stdin" and args.extraFile == "stdin":
+        print("Only one of infile or extraFile can be stdin")
+        sys.exit(1)
+    return args
+
+def parseExtraFile(fname, dosageType):
+    global extraInfo
+    oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss"
+    fieldMatch = re.compile(oppMatchStr,re.I)
+    with open(fname) as f:
+        for line in f:
+            if line.startswith("#"):
+                continue
+            l = line.strip("\n").split("\t")
+            geneSymbol = l[0]
+            extra = l[1:]
+            try:
+                if extraInfo[geneSymbol]:   
+                    print("duplicate info for gene symbol: '%s'" % (geneSymbol))
+                    sys.exit(1)
+            except KeyError:
+                extraInfo[geneSymbol] = {}
+                for keyName,ix in zip(extraFieldList[1:],range(len(extraFieldList[1:]))):
+                    try:
+                        if fieldMatch.match(keyName):
+                            continue
+                        extraInfo[geneSymbol][keyName] = extra[ix]
+                    except IndexError:
+                        sys.exit(1)
+
+def getColor(bed, dosageType):
+    """Return the shade of the item according to it's variant type."""
+    if dosageType == "haplo":
+        if bed["score"] == -1: return "128,128,128"
+        if bed["score"] == 0: return "191,191,191"
+        if bed["score"] == 1: return "191,115,115"
+        if bed["score"] == 2: return "191,93,96"
+        if bed["score"] == 3: return  "255,51,0"
+        if bed["score"] == 30: return "160,136,246"
+        if bed["score"] == 40: return "255,204,204"
+    if dosageType == "triplo":
+        if bed["score"] == -1: return "128,128,128"
+        if bed["score"] == 0: return "191,191,191"
+        if bed["score"] == 1: return "180,198,231"
+        if bed["score"] == 2: return "142,169,219"
+        if bed["score"] == 3: return "48,84,150"
+        if bed["score"] == 30: return "160,136,246"
+        if bed["score"] == 40: return "217,255,242"
+
+def getMouseover(bed, dosageType):
+    """Return the mouseOver string for this bed record."""
+    ret = "Gene: %s" % bed["name"]
+    if dosageType == "triplo":
+        ret += ", Triplosensitivity: "
+    else:
+        ret += ", Haplosensitivity: "
+    ret += "%s - %s" % (bed["score"], scoreExplanation[bed["score"]])
+    return ret
+
+def dumpBedLines(dosageType):
+    """Write out the bed lines, with the same number of fields in each line."""
+    oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss"
+    extraFields = [x for x in extraFieldList if not re.match(oppMatchStr, x, re.I) and x != "Gene Symbol" and x != "Genomic Location"]
+    fields = bed9Fields + extraFields+ ["_mouseOver"]
+    print("#%s" % ("\t".join(fields)))
+    for b in bedLines:
+        bed = bedLines[b]
+        finalBed = []
+        for field in fields:
+            try:
+                if type(bed[field]) is list:
+                    finalBed.append(", ".join(bed[field]))
+                else:
+                    finalBed.append(str(bed[field]))
+            except KeyError: # some of the extra fields won't exist for every record
+                finalBed.append("")
+        print("\t".join(finalBed))
+
+def makeBedLine(chrom, chromStart, chromEnd, name, score, dosageType):
+    """Turn a single gvf line into a bed 9+ line."""
+    bed = {}
+    bed["chrom"] = chrom
+    bed["chromStart"] = chromStart
+    bed["chromEnd"] = chromEnd
+    bed["name"] = name
+    bed["score"] = score
+    bed["strand"] = "."
+    bed["thickStart"] = chromStart
+    bed["thickEnd"] = chromEnd
+    bed["itemRgb"] = getColor(bed, dosageType)
+    bed["Size"] = chromEnd - chromStart
+    extra = extraInfo[name]
+    bed.update(extra)
+    bed["_mouseOver"] = getMouseover(bed,dosageType)
+    return bed
+
+def processClinGenDosage(inf, dosageType):
+    global bedLines
+    for line in inf:
+        if line.startswith('#') or line.startswith('track') or line.startswith('browser'):
+            continue
+        trimmed = line.strip()
+        chrom, chromStart, chromEnd, name, score = trimmed.split("\t")
+        try:
+            bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, int(score), dosageType)
+        except:
+            if score.startswith("Not"):
+                bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, -1, dosageType)
+            else:
+                print(sys.exc_info()[0])
+                sys.stderr.write("bad input line:\n%s\n" % line)
+                sys.exit(1)
+    dumpBedLines(dosageType)
+
+def main():
+    args = setupCommandLine()
+    if args.extraFile:
+        parseExtraFile(args.extraFile, args.dosageType)
+    if args.infile == "stdin":
+        processClinGenDosage(sys.stdin,args.dosageType)
+    else:
+        with open(args.infile) as inf:
+            processClinGenDosage(inf, args.dosageType)
+
+if __name__=="__main__":
+    main()