162bc86b24eb5186560aabb1c2eedf919602f89b
chmalee
  Tue Aug 18 12:29:16 2020 -0700
Forgot to commit helper script for turning decipher into bigBed, refs #25841

diff --git src/hg/utils/otto/decipher/processDecipher.py src/hg/utils/otto/decipher/processDecipher.py
new file mode 100755
index 0000000..df0f1af7
--- /dev/null
+++ src/hg/utils/otto/decipher/processDecipher.py
@@ -0,0 +1,153 @@
+#!/usr/bin/env python3
+
+import sys,argparse
+from collections import namedtuple,defaultdict
+
+bedFields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart",
+ "thickEnd", "itemRgb", "_size", "mean_ratio", "genotype", "variant_class", "inheritance", "pathogenicity", "contribution", "phenotypes"]
+geneFields = namedtuple("geneFields", bedFields[:4])
+
+def setupCommandLine():
+    parser = argparse.ArgumentParser(description="Read GVF lines from infile and transform to bed9+",
+        add_help=True, usage = "%(prog)s [options]")
+    parser.add_argument("bedFile", action="store", default=None, help="bed 16 file")
+    parser.add_argument("geneList", action="store", default=None, help="knownCanonical gene list")
+    args = parser.parse_args()
+    if args.bedFile is "stdin" and args.geneList is stdin:
+        sys.stderr.write("Error: only one of bedFile or geneList can be stdin.\n")
+        parser.print_help()
+        sys.exit(1)
+    return args
+
+def dumpBed(bed):
+    print("#" + "\t".join(bedFields + ["Affected Genes", "_mouseOver"]))
+    for patient in bed:
+        for cnv in bed[patient]:
+            ret = []
+            for field in cnv:
+                if "|" in field:
+                    ret.append(", ".join(field.split("|")))
+                else:
+                    ret.append(field)
+            print("\t".join(ret))
+
+def getParentTerm(childTerm):
+    lossTerms = ["Deletion"]
+    structVarTerms = ["Amplification"]
+    gainTerms = ["Copy-Number Gain", "Duplication", "Duplication/Triplication", "Triplication"]
+    if childTerm in lossTerms:
+        return "Loss"
+    elif childTerm in structVarTerms:
+        return "Structural alteration"
+    elif childTerm in gainTerms:
+        return "Gain"
+    else:
+        return "Unknown"
+
+pathSteps = {
+    1: ["unknown", "not provided", "others", "drug response"],
+    2: ["benign", "likely benign"],
+    3: ["protective", "conflicting", "affects"],
+    4: ["uncertain", "association", "risk factor"],
+    5: ["likely pathogenic", "pathogenic"]
+}
+lossShades = {1: "247,188,187", 2: "238,146,148", 3: "232,104,111", 4: "218,44,55", 5: "180,3,16"}
+gainShades = {1: "161,208,232", 2: "122,165,211", 3: "88,131,211", 4: "41,78,174", 5: "17,44,138"}
+structVarShades = {1: "166,235,182", 2: "96,208,121", 3: "47,172,76", 4: "6,104,28", 5: "1,69,17"}
+
+def getPathStep(pathogenicity):
+    """Return how many levels we need to increase darkness"""
+    for step in pathSteps:
+        if pathogenicity.lower() in pathSteps[step]:
+            return step
+    return -1
+
+def getColor(bed):
+    varClass = getParentTerm(bed[12])
+    pathogenicity = bed[14]
+    pathStep = getPathStep(pathogenicity)
+    if pathStep == -1:
+        sys.stderr.write("ERROR: unsupported pathogenicty: '%s'\n" % pathogenicity)
+        sys.exit(1)
+    ret = ""
+    if varClass == "Loss":
+        ret = lossShades[pathStep]
+    elif varClass == "Gain":
+        ret = gainShades[pathStep]
+    elif varClass == "Structural alteration":
+        ret = structVarShades[pathStep]
+    return ret
+    
+def getMouseOver(bed):
+    ret = "Position: %s:%s-%s, Size: %s, Type: %s, " % (bed[0], int(bed[1])+1, bed[2], bed[9], bed[12])
+    ret += "Significance: %s" % bed[14]
+    phenList = bed[16].split("|")
+    if phenList:
+        ret += ", Phenotypes: %s" % (", ".join(phenList[:2]))
+        if len(phenList) > 2:
+            ret += ", others - click to see full list."
+        else:
+            ret += "."
+    else:
+        ret += ", Phenotypes: none."
+    geneList = bed[17].split(", ")
+    if geneList:
+        ret += " Affected genes: %s" % (", ".join(geneList[:2]))
+        if len(geneList) > 2:
+            ret += ", others - click to see full list."
+        else:
+            ret += "."
+    else:
+        ret += ". Affected genes: none"
+    return ret
+
+def findOverlappingGenes(geneList, chrom, chromStart, chromEnd):
+    ret = []
+    for gene in geneList:
+        if chrom == gene.chrom and \
+            ((gene.chromStart <= chromStart and gene.chromEnd >= chromStart) or \
+            (gene.chromStart >= chromStart and gene.chromStart <= chromEnd) or \
+            (gene.chromStart >= chromStart and gene.chromEnd <= chromEnd)):
+            ret.append(gene.name)
+    ret = sorted(set(ret))
+    return "%s" % (", ".join(ret))
+
+def addGenesToBed(infile, geneList):
+    inBed = defaultdict(list)
+    infh = ""
+    if infile == "stdin":
+        infh = sys.stdin
+    else:
+        infh = open(infile)
+    for line in infh:
+        fields = line.strip('\n').split('\t')
+        patientId = fields[3]
+        overlapGenes = findOverlappingGenes(geneList, fields[0],int(fields[1]),int(fields[2]))
+        bedPartial = fields[:8] + [getColor(fields)] + fields[9:] + [overlapGenes]
+        inBed[patientId].append(bedPartial + [getMouseOver(bedPartial)])
+    infh.close()
+    return inBed
+
+def readGeneList(infile):
+    geneList = []
+    if infile == "stdin":
+        for line in sys.stdin:
+            fields = line.strip('\n').split('\t')
+            geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3]))
+    else:
+        with open(infile) as f:
+            for line in f:
+                fields = line.strip().split('\t')
+                geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3]))
+    return geneList
+
+def main():
+    args = setupCommandLine()
+    inBed = args.bedFile
+    inGenes = args.geneList
+    geneList = readGeneList(inGenes)
+    newBed = addGenesToBed(inBed, geneList)
+    dumpBed(newBed)
+
+if __name__ == "__main__":
+    main()