34b96568359c6125704a1eb7455dd39e99744a72 chmalee Thu Jul 2 10:36:03 2020 -0700 Fix up the ClinGen Dosage and CNV colors and mouseovers, refs #24818,#25562 diff --git src/hg/utils/otto/clinGen/processClinGenCnv.py src/hg/utils/otto/clinGen/processClinGenCnv.py index b867e9f..afd2120 100755 --- src/hg/utils/otto/clinGen/processClinGenCnv.py +++ src/hg/utils/otto/clinGen/processClinGenCnv.py @@ -1,197 +1,197 @@ #!/usr/bin/env python3 """ Read GVF lines from stdin and write bed9+ lines to stdout """ import os,sys,re import argparse bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb"] # the following list of fields may not exist for every record in the input file extraFieldList= ["Size", "Alias", "Variant Region", "Start Range", "End Range", "Copy Number", "Variant Type", "OMIM", "ClinGen", "Phenotype", "Phenotype Id", "PubMed", "Clinical Interpretation", "Clinical Source", "Sample Name", "Sampleset Name"] #, "Variant Seq", "Reference Seq", "RemapScore"] #extraFieldsSet = set() # hold all the bedlines in memory until the end as different lines have # different numbers of extra fields bedLines = {} chromLift = {} def setupCommandLine(): parser = argparse.ArgumentParser(description="Read GVF lines from infile and transform to bed9+", add_help=True, usage = "%(prog)s [options]") parser.add_argument("infile", action="store", default=None, help="Input GVF file from which to read input, use 'stdin' to read from default stdin") parser.add_argument("liftFile", action="store", default=None, help="liftUp file for converting chrom names to UCSC style names") args = parser.parse_args() return args def parseLiftFile(fname): global chromLift with open(fname) as f: for line in f: l = line.strip().split() ncbiChrom = l[1] ucscChrom = l[3] chromLift[ncbiChrom] = ucscChrom def getColor(cnvtype, pathOrBen): """Return the shade of the item according to it's variant type.""" if cnvtype not in ["copy_number_loss","copy_number_gain"] or pathOrBen not in ["Benign", "Pathogenic"]: return "0,0,0" if cnvtype == "copy_number_loss": if pathOrBen == "Pathogenic": - return "255,104,104" + return "180,3,16" else: - return "231,154,144" + return "238,146,148" if cnvtype == "copy_number_gain": if pathOrBen == "Pathogenic": - return "103,104,255" + return "17,44,138" else: - return "143,184,214" + return "122,165,211" def getMouseover(bed): """Return the mouseOver string for this bed record.""" ret = "" - ret += "%s:%s-%s" % (bed["chrom"], int(bed["chromStart"])+1, bed["chromEnd"]) + ret += "Gene(s) affected: %s" % (", ".join(bed["ClinGen"])) + ret += ", Position: %s:%s-%s" % (bed["chrom"], int(bed["chromStart"])+1, bed["chromEnd"]) ret += ", Size: %d" % (int(bed["chromEnd"]) - int(bed["chromStart"])) - ret += ", Gene(s) affected: %s" % (", ".join(bed["ClinGen"])) if bed["Variant Type"] == "copy_number_loss": ret += ", Type: loss" else: ret += ", Type: gain" ret += ", Significance: %s" % bed["Clinical Interpretation"] ret += ", Phenotype: %s" % bed["Phenotype"] return ret def dumpBedLines(): """Write out the bed lines, with the same number of fields in each line.""" fields = bed9Fields + extraFieldList + ["_mouseOver"] print("#%s" % ("\t".join(fields))) for b in bedLines: bed = bedLines[b] finalBed = [] for field in fields: try: if type(bed[field]) is list: if field == "ClinGen": urls = "" for ix in range(len(bed[field])): item = bed[field][ix] if item.startswith("ISCA"): urls += "https://dosage.clinicalgenome.org/clingen_region.cgi?id=" + item else: urls += "https://dosage.clinicalgenome.org/clingen_gene.cgi?sym=" + item if (ix + 1) < len(bed[field]): urls += ", " finalBed.append(urls) else: finalBed.append(", ".join(bed[field])) else: finalBed.append(str(bed[field])) except KeyError: # some of the extra fields won't exist for every record finalBed.append("") print("\t".join(finalBed)) def processExtraFields(extraFields): """Special processing of the GVF extra fields""" ret = {} for key in extraFields: val = extraFields[key] if key == "ID" or key == "Name": continue elif key == "Start Range" or key == "End Range": r = val.split(",") if len(r) != 2: ret[key] = val else: rangeStart = r[0] rangeEnd = r[1] if rangeStart == ".": rangeStart = "unknown" if rangeEnd == ".": rangeEnd = "unknown" if key == "Start Range": ret[key] = "outer start %s, inner start %s" % (rangeStart, rangeEnd) else: ret[key] = "inner end %s, outer end %s" % (rangeStart, rangeEnd) elif key == "Dbxref": splitxrefs = val.split(',') for xref in splitxrefs: db,v = xref.split(':') if db in ret: ret[db].append(v) else: ret[db] = [v] elif key == "Parent": ret["Variant Region"] = val else: ret[key] = val return ret def makeBedLine(chrom, chromStart, chromEnd, bedId, extraHash): """Turn a single gvf line into a bed 9+ line.""" #global extraFieldsSet bed = {} bed["chrom"] = chromLift[chrom] bed["chromStart"] = chromStart bed["chromEnd"] = chromEnd bed["name"] = extraHash["Name"] bed["score"] = 0 bed["strand"] = "." bed["thickStart"] = chromStart bed["thickEnd"] = chromEnd extra = processExtraFields(extraHash) bed.update(extra) #extraFieldsSet |= extra.keys() bed["itemRgb"] = getColor(bed["Variant Type"], bed["Clinical Interpretation"]) bed["_mouseOver"] = getMouseover(bed) bed["Size"] = chromEnd - chromStart return bed def fixupFieldName(key): """Capitalize field names and turn '_' to space.""" if key == "clinical_int": return "Clinical Interpretation" return " ".join(s[0].upper() + s[1:] for s in key.replace('_', ' ').split()) def processClinGenCnv(inf): global bedLines extraHash = {} # only keep "ssv" records, which effectively ignores the variant_region file nameMatch = re.compile("^[ne]sv\d") for line in inf: if line.startswith('#') or line.startswith('track') or line.startswith('browser'): continue trimmed = line.strip() fields = trimmed.split(maxsplit=8) extraFields = fields[-1].split(';') itemName = extraFields[1].split('=')[1] if nameMatch.match(itemName): continue for f in extraFields: k,v = f.strip().split('=') fixedName = fixupFieldName(k) #if fixedName not in extraFieldsSet: # extraFieldsSet.add(fixedName) extraHash[fixedName] = v bedId = itemName extraHash["Variant Type"] = fields[2] bedLines[bedId] = makeBedLine(fields[0], int(fields[3]) - 1, int(fields[4]), bedId, extraHash) dumpBedLines() def main(): args = setupCommandLine() if args.liftFile: parseLiftFile(args.liftFile) if args.infile == "stdin": processClinGenCnv(sys.stdin) else: with open(args.infile) as inf: processClinGenCnv(inf) if __name__=="__main__": main()