src/hg/utils/otto/decipher/processDecipher.py d6d6967d21a36badceed77f26525ec10c8540ebf

d6d6967d21a36badceed77f26525ec10c8540ebf
jcasper
  Tue Apr 5 17:40:55 2022 -0700
Updates to the DECIPHER pipeline, first for a manual build, refs #29130

diff --git src/hg/utils/otto/decipher/processDecipher.py src/hg/utils/otto/decipher/processDecipher.py
index cda0929..9bf61ca 100755
--- src/hg/utils/otto/decipher/processDecipher.py
+++ src/hg/utils/otto/decipher/processDecipher.py
@@ -1,151 +1,171 @@
-#!/usr/bin/env python3
-
-import sys,argparse
-from collections import namedtuple,defaultdict
-
-bedFields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart",
- "thickEnd", "itemRgb", "_size", "mean_ratio", "genotype", "variant_class", "inheritance", "pathogenicity", "contribution", "phenotypes"]
-geneFields = namedtuple("geneFields", bedFields[:4])
-
-def setupCommandLine():
-    parser = argparse.ArgumentParser(description="Read GVF lines from infile and transform to bed9+",
-        add_help=True, usage = "%(prog)s [options]")
-    parser.add_argument("bedFile", action="store", default=None, help="bed 16 file")
-    parser.add_argument("geneList", action="store", default=None, help="knownCanonical gene list")
-    args = parser.parse_args()
-    if args.bedFile is "stdin" and args.geneList is stdin:
-        sys.stderr.write("Error: only one of bedFile or geneList can be stdin.\n")
-        parser.print_help()
-        sys.exit(1)
-    return args
+#!/bin/env python3
+import sys
+import json
+from collections import namedtuple
+
+# Take a decipher variants file and parse it into separate CNV and SNV track beds, while amending
+# some of the field values to match our display uses.  For one thing, we change the colors.
+
 
-def dumpBed(bed):
-    print("#" + "\t".join(bedFields + ["Affected Genes", "_mouseOver"]))
-    for patient in bed:
-        for cnv in bed[patient]:
+def findOverlappingGenes(geneList, chrom, chromStart, chromEnd):
     ret = []
-            for field in cnv:
-                if "|" in field:
-                    ret.append(", ".join(field.split("|")))
-                else:
-                    ret.append(field)
-            print("\t".join(ret))
+    for gene in geneList:
+        if chrom == gene.chrom and \
+            (gene.chromStart <= chromEnd and gene.chromEnd >= chromStart):
+            ret.append(gene.name)
+    ret = sorted(set(ret))
+    return "%s" % (", ".join(ret))
+
+
+# functions and variables to support working out which color to assign to each CNV variant
 
 def getParentTerm(childTerm):
     lossTerms = ["Deletion"]
     structVarTerms = ["Amplification"]
     gainTerms = ["Copy-Number Gain", "Duplication", "Duplication/Triplication", "Triplication"]
     if childTerm in lossTerms:
         return "Loss"
     elif childTerm in structVarTerms:
         return "Structural alteration"
     elif childTerm in gainTerms:
         return "Gain"
     else:
         return "Unknown"
 
 pathSteps = {
     1: ["benign", "likely benign"],
     2: ["unknown", "uncertain", "not provided", "others", "drug response"],
     3: ["likely pathogenic", "pathogenic"]
 }
 
 lossShades = {1: "255,128,128", 2: "255,0,0", 3: "153,0,0"}
 gainShades = {1: "133,177,255", 2: "0,0,255", 3: "0,0,128"}
 structVarShades = {1: "190,190,190", 2: "128,128,128", 3: "38,38,38"}
 
 def getPathStep(pathogenicity):
     """Return how many levels we need to increase darkness"""
     for step in pathSteps:
         if pathogenicity.lower() in pathSteps[step]:
             return step
     return -1
 
-def getColor(bed):
-    varClass = getParentTerm(bed[12])
-    pathogenicity = bed[14]
+def getColor(varClass, pathogenicity):
+    parentClass = getParentTerm(varClass)
     pathStep = getPathStep(pathogenicity)
     if pathStep == -1:
         return "0,0,0"
     ret = ""
-    if varClass == "Loss":
+    if parentClass == "Loss":
         ret = lossShades[pathStep]
-    elif varClass == "Gain":
+    elif parentClass == "Gain":
         ret = gainShades[pathStep]
-    elif varClass == "Structural alteration":
+    elif parentClass == "Structural alteration":
         ret = structVarShades[pathStep]
     return ret
 
-def getMouseOver(bed):
-    ret = "Position: %s:%s-%s, Size: %s, Type: %s, " % (bed[0], int(bed[1])+1, bed[2], bed[9], bed[12])
-    ret += "Significance: %s" % bed[14]
-    phenList = bed[16].split("|")
-    if phenList:
-        ret += ", Phenotypes: %s" % (", ".join(phenList[:2]))
-        if len(phenList) > 2:
-            ret += ", others - click to see full list."
-        else:
-            ret += "."
-    else:
-        ret += ", Phenotypes: none."
-    geneList = bed[17].split(", ")
-    if geneList:
-        ret += " Affected genes: %s" % (", ".join(geneList[:2]))
-        if len(geneList) > 2:
-            ret += ", others - click to see full list."
-        else:
-            ret += "."
-    else:
-        ret += ". Affected genes: none"
-    return ret
 
-def findOverlappingGenes(geneList, chrom, chromStart, chromEnd):
-    ret = []
-    for gene in geneList:
-        if chrom == gene.chrom and \
-            ((gene.chromStart <= chromStart and gene.chromEnd >= chromStart) or \
-            (gene.chromStart >= chromStart and gene.chromStart <= chromEnd) or \
-            (gene.chromStart >= chromStart and gene.chromEnd <= chromEnd)):
-            ret.append(gene.name)
-    ret = sorted(set(ret))
-    return "%s" % (", ".join(ret))
+#############################################
 
-def addGenesToBed(infile, geneList):
-    inBed = defaultdict(list)
-    infh = ""
-    if infile == "stdin":
-        infh = sys.stdin
-    else:
-        infh = open(infile)
-    for line in infh:
-        fields = line.strip('\n').split('\t')
-        patientId = fields[3]
-        overlapGenes = findOverlappingGenes(geneList, fields[0],int(fields[1]),int(fields[2]))
-        bedPartial = fields[:8] + [getColor(fields)] + fields[9:] + [overlapGenes]
-        inBed[patientId].append(bedPartial + [getMouseOver(bedPartial)])
-    infh.close()
-    return inBed
-
-def readGeneList(infile):
+def main():
+    if len(sys.argv) != 6:
+        print ("Usage: makeUcscBed.py <input file> <genefile> <cnv output file> <snv output file> <snv raw output file>")
+        sys.exit(0)
+
+    cnvOutput = open(sys.argv[3], "w")
+    snvOutput = open(sys.argv[4], "w")
+    snvRawOutput = open(sys.argv[5], "w")
+
+    geneFields = namedtuple("geneFields", ["chrom", "chromStart", "chromEnd", "name"])
     geneList = []
-    if infile == "stdin":
-        for line in sys.stdin:
+    with open(sys.argv[2]) as geneFile:
+        for line in geneFile:
             fields = line.strip('\n').split('\t')
             geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3]))
+
+    with open(sys.argv[1], "r") as inputFile:
+        header = inputFile.readline()
+        for line in inputFile:
+            fields = line.strip('\n').split('\t')
+            parsedJson = json.loads(fields[13])
+
+            varType = parsedJson["variant_type"];
+            if (varType == "SNV") :
+                # Make the basic table file
+                result = "\t".join(fields[:4])
+                print (result, file=snvOutput)
+
+                # Make the "raw" data file with more fields
+                # format: id, chrom, start, end, ref, alt, transcript, gene, genotype, inheritance, pathgenicity,
+                    # contribution, phenotypes
+                fields[0] = fields[0].replace("chr", "")  # strip off initial "chr" for these names
+                # turn a 0-based start into 1-based for this specific output file for legacy reasons
+                if (fields[1] != fields[2]) :
+                    fields[1] = str(int(fields[1])+1)  
+                result = fields[3] + "\t" + "\t".join(fields[:3])
+                # phenotypes is itself a dictionary that we need to unpack; the current internal format
+                # expectation is that the phenotypes are separated by | in the snvRaw file
+                phenotypes = "|".join(sorted(x["name"] for x in parsedJson["phenotypes"]))
+                transcript = parsedJson.get("transcript", "") # default to the empty string
+                gene = parsedJson.get("gene", "") # default to the empty string
+                result += "\t" + "\t".join( (parsedJson["ref_allele"], parsedJson["alt_allele"], transcript, \
+                    gene, parsedJson["genotype"], parsedJson["inheritance"], parsedJson["pathogenicity"], \
+                    parsedJson["contribution"], phenotypes) )
+                print (result, file=snvRawOutput)
             else:
-        with open(infile) as f:
-            for line in f:
-                fields = line.strip().split('\t')
-                geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3]))
-    return geneList
+                # just sanity check here to ensure varType hasn't gone awry
+                if (varType != "CNV"):
+                    print ("Error: unexpected variant_type {} in input file\n".format(varType))
+                    sys.exit(1)
+                fields[5] = "."  # Input uses +, but these aren't directional items
+                result = "\t".join(fields[:8])  # everything through thickEnd is fine
+                # color is determined by variant_class
+                varClass = parsedJson["variant_class"]
+                pathogenicity = parsedJson["pathogenicity"]
+                color = getColor(varClass, pathogenicity)
+                result += "\t" + color
+                # remaining fields to make: extent, mean ratio, genotype, variant_class, inheritance, pathogenicity,
+                    # contribution, phenotypes, Affected Genes, mouseover text
+                phenotypeString = ", ".join(sorted(x["name"] for x in parsedJson["phenotypes"]))
+                meanRatio = parsedJson["mean_ratio"]
+                if meanRatio is None:
+                    meanRatio = "0.0"  # Not sure this is the best way to handle this case, but it's a start
+                meanRatioString = "%0.2f" % float(meanRatio)
+                result += "\t" + "\t".join( (fields[10], meanRatioString, parsedJson["genotype"], \
+                    parsedJson["variant_class"], parsedJson["inheritance"], parsedJson["pathogenicity"], \
+                    parsedJson["contribution"], phenotypeString) )
 
-def main():
-    args = setupCommandLine()
-    inBed = args.bedFile
-    inGenes = args.geneList
-    geneList = readGeneList(inGenes)
-    newBed = addGenesToBed(inBed, geneList)
-    dumpBed(newBed)
+                # still have to do Affected Genes and the mouseover text
+                geneString = findOverlappingGenes(geneList, fields[0], int(fields[1]), int(fields[2]))
+                result += "\t" + geneString
+
+                mouseOver = "Position: %s:%s-%s, Size: %s, Type: %s, " % (fields[0], int(fields[1])+1, \
+                        fields[2], fields[10], varClass)
+                mouseOver += "Significance: %s" % pathogenicity
+                phenotypes = sorted(x["name"] for x in parsedJson["phenotypes"])
+                if len(phenotypes) > 0:
+                    mouseOver += ", Phenotypes: %s" % (", ".join(phenotypes[:2]))
+                    if len(phenotypes) > 2:
+                        mouseOver += ", others - click to see full list."
+                    else:
+                        mouseOver += "."
+                else:
+                    mouseOver += ", Phenotypes: none."
+                thisGeneList = geneString.split(", ")
+                if thisGeneList:
+                    mouseOver += " Affected genes: %s" % (", ".join(thisGeneList[:2]))
+                    if len(thisGeneList) > 2:
+                        mouseOver += ", others - click to see full list."
+                    else:
+                        mouseOver += "."
+                else:
+                    mouseOver += ". Affected genes: none"
+
+                result += "\t" + mouseOver
+                print (result, file=cnvOutput)
+
+    cnvOutput.close()
+    snvOutput.close()
+    snvRawOutput.close()
 
 if __name__ == "__main__":
     main()