d6d6967d21a36badceed77f26525ec10c8540ebf jcasper Tue Apr 5 17:40:55 2022 -0700 Updates to the DECIPHER pipeline, first for a manual build, refs #29130 diff --git src/hg/utils/otto/decipher/processDecipher.py src/hg/utils/otto/decipher/processDecipher.py index cda0929..9bf61ca 100755 --- src/hg/utils/otto/decipher/processDecipher.py +++ src/hg/utils/otto/decipher/processDecipher.py @@ -1,151 +1,171 @@ -#!/usr/bin/env python3 - -import sys,argparse -from collections import namedtuple,defaultdict - -bedFields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", - "thickEnd", "itemRgb", "_size", "mean_ratio", "genotype", "variant_class", "inheritance", "pathogenicity", "contribution", "phenotypes"] -geneFields = namedtuple("geneFields", bedFields[:4]) - -def setupCommandLine(): - parser = argparse.ArgumentParser(description="Read GVF lines from infile and transform to bed9+", - add_help=True, usage = "%(prog)s [options]") - parser.add_argument("bedFile", action="store", default=None, help="bed 16 file") - parser.add_argument("geneList", action="store", default=None, help="knownCanonical gene list") - args = parser.parse_args() - if args.bedFile is "stdin" and args.geneList is stdin: - sys.stderr.write("Error: only one of bedFile or geneList can be stdin.\n") - parser.print_help() - sys.exit(1) - return args +#!/bin/env python3 +import sys +import json +from collections import namedtuple + +# Take a decipher variants file and parse it into separate CNV and SNV track beds, while amending +# some of the field values to match our display uses. For one thing, we change the colors. + -def dumpBed(bed): - print("#" + "\t".join(bedFields + ["Affected Genes", "_mouseOver"])) - for patient in bed: - for cnv in bed[patient]: +def findOverlappingGenes(geneList, chrom, chromStart, chromEnd): ret = [] - for field in cnv: - if "|" in field: - ret.append(", ".join(field.split("|"))) - else: - ret.append(field) - print("\t".join(ret)) + for gene in geneList: + if chrom == gene.chrom and \ + (gene.chromStart <= chromEnd and gene.chromEnd >= chromStart): + ret.append(gene.name) + ret = sorted(set(ret)) + return "%s" % (", ".join(ret)) + + +# functions and variables to support working out which color to assign to each CNV variant def getParentTerm(childTerm): lossTerms = ["Deletion"] structVarTerms = ["Amplification"] gainTerms = ["Copy-Number Gain", "Duplication", "Duplication/Triplication", "Triplication"] if childTerm in lossTerms: return "Loss" elif childTerm in structVarTerms: return "Structural alteration" elif childTerm in gainTerms: return "Gain" else: return "Unknown" pathSteps = { 1: ["benign", "likely benign"], 2: ["unknown", "uncertain", "not provided", "others", "drug response"], 3: ["likely pathogenic", "pathogenic"] } lossShades = {1: "255,128,128", 2: "255,0,0", 3: "153,0,0"} gainShades = {1: "133,177,255", 2: "0,0,255", 3: "0,0,128"} structVarShades = {1: "190,190,190", 2: "128,128,128", 3: "38,38,38"} def getPathStep(pathogenicity): """Return how many levels we need to increase darkness""" for step in pathSteps: if pathogenicity.lower() in pathSteps[step]: return step return -1 -def getColor(bed): - varClass = getParentTerm(bed[12]) - pathogenicity = bed[14] +def getColor(varClass, pathogenicity): + parentClass = getParentTerm(varClass) pathStep = getPathStep(pathogenicity) if pathStep == -1: return "0,0,0" ret = "" - if varClass == "Loss": + if parentClass == "Loss": ret = lossShades[pathStep] - elif varClass == "Gain": + elif parentClass == "Gain": ret = gainShades[pathStep] - elif varClass == "Structural alteration": + elif parentClass == "Structural alteration": ret = structVarShades[pathStep] return ret -def getMouseOver(bed): - ret = "Position: %s:%s-%s, Size: %s, Type: %s, " % (bed[0], int(bed[1])+1, bed[2], bed[9], bed[12]) - ret += "Significance: %s" % bed[14] - phenList = bed[16].split("|") - if phenList: - ret += ", Phenotypes: %s" % (", ".join(phenList[:2])) - if len(phenList) > 2: - ret += ", others - click to see full list." - else: - ret += "." - else: - ret += ", Phenotypes: none." - geneList = bed[17].split(", ") - if geneList: - ret += " Affected genes: %s" % (", ".join(geneList[:2])) - if len(geneList) > 2: - ret += ", others - click to see full list." - else: - ret += "." - else: - ret += ". Affected genes: none" - return ret -def findOverlappingGenes(geneList, chrom, chromStart, chromEnd): - ret = [] - for gene in geneList: - if chrom == gene.chrom and \ - ((gene.chromStart <= chromStart and gene.chromEnd >= chromStart) or \ - (gene.chromStart >= chromStart and gene.chromStart <= chromEnd) or \ - (gene.chromStart >= chromStart and gene.chromEnd <= chromEnd)): - ret.append(gene.name) - ret = sorted(set(ret)) - return "%s" % (", ".join(ret)) +############################################# -def addGenesToBed(infile, geneList): - inBed = defaultdict(list) - infh = "" - if infile == "stdin": - infh = sys.stdin - else: - infh = open(infile) - for line in infh: - fields = line.strip('\n').split('\t') - patientId = fields[3] - overlapGenes = findOverlappingGenes(geneList, fields[0],int(fields[1]),int(fields[2])) - bedPartial = fields[:8] + [getColor(fields)] + fields[9:] + [overlapGenes] - inBed[patientId].append(bedPartial + [getMouseOver(bedPartial)]) - infh.close() - return inBed - -def readGeneList(infile): +def main(): + if len(sys.argv) != 6: + print ("Usage: makeUcscBed.py <input file> <genefile> <cnv output file> <snv output file> <snv raw output file>") + sys.exit(0) + + cnvOutput = open(sys.argv[3], "w") + snvOutput = open(sys.argv[4], "w") + snvRawOutput = open(sys.argv[5], "w") + + geneFields = namedtuple("geneFields", ["chrom", "chromStart", "chromEnd", "name"]) geneList = [] - if infile == "stdin": - for line in sys.stdin: + with open(sys.argv[2]) as geneFile: + for line in geneFile: fields = line.strip('\n').split('\t') geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3])) + + with open(sys.argv[1], "r") as inputFile: + header = inputFile.readline() + for line in inputFile: + fields = line.strip('\n').split('\t') + parsedJson = json.loads(fields[13]) + + varType = parsedJson["variant_type"]; + if (varType == "SNV") : + # Make the basic table file + result = "\t".join(fields[:4]) + print (result, file=snvOutput) + + # Make the "raw" data file with more fields + # format: id, chrom, start, end, ref, alt, transcript, gene, genotype, inheritance, pathgenicity, + # contribution, phenotypes + fields[0] = fields[0].replace("chr", "") # strip off initial "chr" for these names + # turn a 0-based start into 1-based for this specific output file for legacy reasons + if (fields[1] != fields[2]) : + fields[1] = str(int(fields[1])+1) + result = fields[3] + "\t" + "\t".join(fields[:3]) + # phenotypes is itself a dictionary that we need to unpack; the current internal format + # expectation is that the phenotypes are separated by | in the snvRaw file + phenotypes = "|".join(sorted(x["name"] for x in parsedJson["phenotypes"])) + transcript = parsedJson.get("transcript", "") # default to the empty string + gene = parsedJson.get("gene", "") # default to the empty string + result += "\t" + "\t".join( (parsedJson["ref_allele"], parsedJson["alt_allele"], transcript, \ + gene, parsedJson["genotype"], parsedJson["inheritance"], parsedJson["pathogenicity"], \ + parsedJson["contribution"], phenotypes) ) + print (result, file=snvRawOutput) else: - with open(infile) as f: - for line in f: - fields = line.strip().split('\t') - geneList.append(geneFields(chrom=fields[0],chromStart=int(fields[1]),chromEnd=int(fields[2]),name=fields[3])) - return geneList + # just sanity check here to ensure varType hasn't gone awry + if (varType != "CNV"): + print ("Error: unexpected variant_type {} in input file\n".format(varType)) + sys.exit(1) + fields[5] = "." # Input uses +, but these aren't directional items + result = "\t".join(fields[:8]) # everything through thickEnd is fine + # color is determined by variant_class + varClass = parsedJson["variant_class"] + pathogenicity = parsedJson["pathogenicity"] + color = getColor(varClass, pathogenicity) + result += "\t" + color + # remaining fields to make: extent, mean ratio, genotype, variant_class, inheritance, pathogenicity, + # contribution, phenotypes, Affected Genes, mouseover text + phenotypeString = ", ".join(sorted(x["name"] for x in parsedJson["phenotypes"])) + meanRatio = parsedJson["mean_ratio"] + if meanRatio is None: + meanRatio = "0.0" # Not sure this is the best way to handle this case, but it's a start + meanRatioString = "%0.2f" % float(meanRatio) + result += "\t" + "\t".join( (fields[10], meanRatioString, parsedJson["genotype"], \ + parsedJson["variant_class"], parsedJson["inheritance"], parsedJson["pathogenicity"], \ + parsedJson["contribution"], phenotypeString) ) -def main(): - args = setupCommandLine() - inBed = args.bedFile - inGenes = args.geneList - geneList = readGeneList(inGenes) - newBed = addGenesToBed(inBed, geneList) - dumpBed(newBed) + # still have to do Affected Genes and the mouseover text + geneString = findOverlappingGenes(geneList, fields[0], int(fields[1]), int(fields[2])) + result += "\t" + geneString + + mouseOver = "Position: %s:%s-%s, Size: %s, Type: %s, " % (fields[0], int(fields[1])+1, \ + fields[2], fields[10], varClass) + mouseOver += "Significance: %s" % pathogenicity + phenotypes = sorted(x["name"] for x in parsedJson["phenotypes"]) + if len(phenotypes) > 0: + mouseOver += ", Phenotypes: %s" % (", ".join(phenotypes[:2])) + if len(phenotypes) > 2: + mouseOver += ", others - click to see full list." + else: + mouseOver += "." + else: + mouseOver += ", Phenotypes: none." + thisGeneList = geneString.split(", ") + if thisGeneList: + mouseOver += " Affected genes: %s" % (", ".join(thisGeneList[:2])) + if len(thisGeneList) > 2: + mouseOver += ", others - click to see full list." + else: + mouseOver += "." + else: + mouseOver += ". Affected genes: none" + + result += "\t" + mouseOver + print (result, file=cnvOutput) + + cnvOutput.close() + snvOutput.close() + snvRawOutput.close() if __name__ == "__main__": main()