src/hg/utils/otto/decipher/processDecipher.py f5a1b930828e2737ee09d3dc71b3a0c9c2068452

f5a1b930828e2737ee09d3dc71b3a0c9c2068452
max
  Wed Mar 27 08:21:03 2024 -0700
otto job for decipher failed, otto email

diff --git src/hg/utils/otto/decipher/processDecipher.py src/hg/utils/otto/decipher/processDecipher.py
index 9bf61ca..7479478 100755
--- src/hg/utils/otto/decipher/processDecipher.py
+++ src/hg/utils/otto/decipher/processDecipher.py
@@ -1,171 +1,171 @@
 #!/bin/env python3
 import sys
 import json
 from collections import namedtuple
 
 # Take a decipher variants file and parse it into separate CNV and SNV track beds, while amending
 # some of the field values to match our display uses.  For one thing, we change the colors.
 
 
 def findOverlappingGenes(geneList, chrom, chromStart, chromEnd):
     ret = []
     for gene in geneList:
         if chrom == gene.chrom and \
             (gene.chromStart <= chromEnd and gene.chromEnd >= chromStart):
             ret.append(gene.name)
     ret = sorted(set(ret))
     return "%s" % (", ".join(ret))
 
 
 # functions and variables to support working out which color to assign to each CNV variant
 
 def getParentTerm(childTerm):
     lossTerms = ["Deletion"]
     structVarTerms = ["Amplification"]
     gainTerms = ["Copy-Number Gain", "Duplication", "Duplication/Triplication", "Triplication"]
     if childTerm in lossTerms:
         return "Loss"
     elif childTerm in structVarTerms:
         return "Structural alteration"
     elif childTerm in gainTerms:
         return "Gain"
     else:
         return "Unknown"
 
 pathSteps = {
     1: ["benign", "likely benign"],
     2: ["unknown", "uncertain", "not provided", "others", "drug response"],
     3: ["likely pathogenic", "pathogenic"]
 }
 
 lossShades = {1: "255,128,128", 2: "255,0,0", 3: "153,0,0"}
 gainShades = {1: "133,177,255", 2: "0,0,255", 3: "0,0,128"}
 structVarShades = {1: "190,190,190", 2: "128,128,128", 3: "38,38,38"}
 
 def getPathStep(pathogenicity):
     """Return how many levels we need to increase darkness"""
     for step in pathSteps:
         if pathogenicity.lower() in pathSteps[step]:
             return step
     return -1
 
 def getColor(varClass, pathogenicity):
     parentClass = getParentTerm(varClass)
     pathStep = getPathStep(pathogenicity)
     if pathStep == -1:
         return "0,0,0"
     ret = ""
     if parentClass == "Loss":
         ret = lossShades[pathStep]
     elif parentClass == "Gain":
         ret = gainShades[pathStep]
     elif parentClass == "Structural alteration":
         ret = structVarShades[pathStep]
     return ret
 
 
 #############################################
 
 def main():
     if len(sys.argv) != 6:
         print ("Usage: makeUcscBed.py <input file> <genefile> <cnv output file> <snv output file> <snv raw output file>")
         sys.exit(0)
 
     cnvOutput = open(sys.argv[3], "w")
     snvOutput = open(sys.argv[4], "w")
     snvRawOutput = open(sys.argv[5], "w")
 
     geneFields = namedtuple("geneFields", ["chrom", "chromStart", "chromEnd", "name"])
     geneList = []
     with open(sys.argv[2]) as geneFile:
         for line in geneFile:
             fields = line.strip('\n').split('\t')
             geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3]))
 
-    with open(sys.argv[1], "r") as inputFile:
+    with open(sys.argv[1], "r", encoding="utf8") as inputFile:
         header = inputFile.readline()
         for line in inputFile:
             fields = line.strip('\n').split('\t')
             parsedJson = json.loads(fields[13])
 
             varType = parsedJson["variant_type"];
             if (varType == "SNV") :
                 # Make the basic table file
                 result = "\t".join(fields[:4])
                 print (result, file=snvOutput)
 
                 # Make the "raw" data file with more fields
                 # format: id, chrom, start, end, ref, alt, transcript, gene, genotype, inheritance, pathgenicity,
                     # contribution, phenotypes
                 fields[0] = fields[0].replace("chr", "")  # strip off initial "chr" for these names
                 # turn a 0-based start into 1-based for this specific output file for legacy reasons
                 if (fields[1] != fields[2]) :
                     fields[1] = str(int(fields[1])+1)  
                 result = fields[3] + "\t" + "\t".join(fields[:3])
                 # phenotypes is itself a dictionary that we need to unpack; the current internal format
                 # expectation is that the phenotypes are separated by | in the snvRaw file
                 phenotypes = "|".join(sorted(x["name"] for x in parsedJson["phenotypes"]))
                 transcript = parsedJson.get("transcript", "") # default to the empty string
                 gene = parsedJson.get("gene", "") # default to the empty string
                 result += "\t" + "\t".join( (parsedJson["ref_allele"], parsedJson["alt_allele"], transcript, \
                     gene, parsedJson["genotype"], parsedJson["inheritance"], parsedJson["pathogenicity"], \
                     parsedJson["contribution"], phenotypes) )
                 print (result, file=snvRawOutput)
             else:
                 # just sanity check here to ensure varType hasn't gone awry
                 if (varType != "CNV"):
                     print ("Error: unexpected variant_type {} in input file\n".format(varType))
                     sys.exit(1)
                 fields[5] = "."  # Input uses +, but these aren't directional items
                 result = "\t".join(fields[:8])  # everything through thickEnd is fine
                 # color is determined by variant_class
                 varClass = parsedJson["variant_class"]
                 pathogenicity = parsedJson["pathogenicity"]
                 color = getColor(varClass, pathogenicity)
                 result += "\t" + color
                 # remaining fields to make: extent, mean ratio, genotype, variant_class, inheritance, pathogenicity,
                     # contribution, phenotypes, Affected Genes, mouseover text
                 phenotypeString = ", ".join(sorted(x["name"] for x in parsedJson["phenotypes"]))
                 meanRatio = parsedJson["mean_ratio"]
                 if meanRatio is None:
                     meanRatio = "0.0"  # Not sure this is the best way to handle this case, but it's a start
                 meanRatioString = "%0.2f" % float(meanRatio)
                 result += "\t" + "\t".join( (fields[10], meanRatioString, parsedJson["genotype"], \
                     parsedJson["variant_class"], parsedJson["inheritance"], parsedJson["pathogenicity"], \
                     parsedJson["contribution"], phenotypeString) )
 
                 # still have to do Affected Genes and the mouseover text
                 geneString = findOverlappingGenes(geneList, fields[0], int(fields[1]), int(fields[2]))
                 result += "\t" + geneString
 
                 mouseOver = "Position: %s:%s-%s, Size: %s, Type: %s, " % (fields[0], int(fields[1])+1, \
                         fields[2], fields[10], varClass)
                 mouseOver += "Significance: %s" % pathogenicity
                 phenotypes = sorted(x["name"] for x in parsedJson["phenotypes"])
                 if len(phenotypes) > 0:
                     mouseOver += ", Phenotypes: %s" % (", ".join(phenotypes[:2]))
                     if len(phenotypes) > 2:
                         mouseOver += ", others - click to see full list."
                     else:
                         mouseOver += "."
                 else:
                     mouseOver += ", Phenotypes: none."
                 thisGeneList = geneString.split(", ")
                 if thisGeneList:
                     mouseOver += " Affected genes: %s" % (", ".join(thisGeneList[:2]))
                     if len(thisGeneList) > 2:
                         mouseOver += ", others - click to see full list."
                     else:
                         mouseOver += "."
                 else:
                     mouseOver += ". Affected genes: none"
 
                 result += "\t" + mouseOver
                 print (result, file=cnvOutput)
 
     cnvOutput.close()
     snvOutput.close()
     snvRawOutput.close()
 
 if __name__ == "__main__":
     main()