37befdf6676de0022c60966291e21e5977dba086 chmalee Fri Jun 19 13:46:31 2020 -0700 Adding clingen cnv and dosage otto job to source tree now that it's been running for a while, refs #25562 diff --git src/hg/utils/otto/clinGen/processClinGenCnv.py src/hg/utils/otto/clinGen/processClinGenCnv.py new file mode 100755 index 0000000..b867e9f --- /dev/null +++ src/hg/utils/otto/clinGen/processClinGenCnv.py @@ -0,0 +1,197 @@ +#!/usr/bin/env python3 + +""" +Read GVF lines from stdin and write bed9+ lines to stdout +""" + +import os,sys,re +import argparse + +bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", + "thickEnd", "itemRgb"] + +# the following list of fields may not exist for every record in the input file +extraFieldList= ["Size", "Alias", "Variant Region", "Start Range", "End Range", "Copy Number", "Variant Type", + "OMIM", "ClinGen", "Phenotype", "Phenotype Id", "PubMed", "Clinical Interpretation", + "Clinical Source", "Sample Name", "Sampleset Name"] #, "Variant Seq", "Reference Seq", "RemapScore"] +#extraFieldsSet = set() + +# hold all the bedlines in memory until the end as different lines have +# different numbers of extra fields +bedLines = {} +chromLift = {} + +def setupCommandLine(): + parser = argparse.ArgumentParser(description="Read GVF lines from infile and transform to bed9+", + add_help=True, usage = "%(prog)s [options]") + parser.add_argument("infile", action="store", default=None, help="Input GVF file from which to read input, use 'stdin' to read from default stdin") + parser.add_argument("liftFile", action="store", default=None, help="liftUp file for converting chrom names to UCSC style names") + args = parser.parse_args() + return args + +def parseLiftFile(fname): + global chromLift + with open(fname) as f: + for line in f: + l = line.strip().split() + ncbiChrom = l[1] + ucscChrom = l[3] + chromLift[ncbiChrom] = ucscChrom + +def getColor(cnvtype, pathOrBen): + """Return the shade of the item according to it's variant type.""" + if cnvtype not in ["copy_number_loss","copy_number_gain"] or pathOrBen not in ["Benign", "Pathogenic"]: + return "0,0,0" + if cnvtype == "copy_number_loss": + if pathOrBen == "Pathogenic": + return "255,104,104" + else: + return "231,154,144" + if cnvtype == "copy_number_gain": + if pathOrBen == "Pathogenic": + return "103,104,255" + else: + return "143,184,214" + +def getMouseover(bed): + """Return the mouseOver string for this bed record.""" + ret = "" + ret += "%s:%s-%s" % (bed["chrom"], int(bed["chromStart"])+1, bed["chromEnd"]) + ret += ", Size: %d" % (int(bed["chromEnd"]) - int(bed["chromStart"])) + ret += ", Gene(s) affected: %s" % (", ".join(bed["ClinGen"])) + if bed["Variant Type"] == "copy_number_loss": + ret += ", Type: loss" + else: + ret += ", Type: gain" + ret += ", Significance: %s" % bed["Clinical Interpretation"] + ret += ", Phenotype: %s" % bed["Phenotype"] + return ret + +def dumpBedLines(): + """Write out the bed lines, with the same number of fields in each line.""" + fields = bed9Fields + extraFieldList + ["_mouseOver"] + print("#%s" % ("\t".join(fields))) + for b in bedLines: + bed = bedLines[b] + finalBed = [] + for field in fields: + try: + if type(bed[field]) is list: + if field == "ClinGen": + urls = "" + for ix in range(len(bed[field])): + item = bed[field][ix] + if item.startswith("ISCA"): + urls += "https://dosage.clinicalgenome.org/clingen_region.cgi?id=" + item + else: + urls += "https://dosage.clinicalgenome.org/clingen_gene.cgi?sym=" + item + if (ix + 1) < len(bed[field]): + urls += ", " + finalBed.append(urls) + else: + finalBed.append(", ".join(bed[field])) + else: + finalBed.append(str(bed[field])) + except KeyError: # some of the extra fields won't exist for every record + finalBed.append("") + print("\t".join(finalBed)) + +def processExtraFields(extraFields): + """Special processing of the GVF extra fields""" + ret = {} + for key in extraFields: + val = extraFields[key] + if key == "ID" or key == "Name": + continue + elif key == "Start Range" or key == "End Range": + r = val.split(",") + if len(r) != 2: + ret[key] = val + else: + rangeStart = r[0] + rangeEnd = r[1] + if rangeStart == ".": + rangeStart = "unknown" + if rangeEnd == ".": + rangeEnd = "unknown" + if key == "Start Range": + ret[key] = "outer start %s, inner start %s" % (rangeStart, rangeEnd) + else: + ret[key] = "inner end %s, outer end %s" % (rangeStart, rangeEnd) + elif key == "Dbxref": + splitxrefs = val.split(',') + for xref in splitxrefs: + db,v = xref.split(':') + if db in ret: + ret[db].append(v) + else: + ret[db] = [v] + elif key == "Parent": + ret["Variant Region"] = val + else: + ret[key] = val + return ret + +def makeBedLine(chrom, chromStart, chromEnd, bedId, extraHash): + """Turn a single gvf line into a bed 9+ line.""" + #global extraFieldsSet + bed = {} + bed["chrom"] = chromLift[chrom] + bed["chromStart"] = chromStart + bed["chromEnd"] = chromEnd + bed["name"] = extraHash["Name"] + bed["score"] = 0 + bed["strand"] = "." + bed["thickStart"] = chromStart + bed["thickEnd"] = chromEnd + extra = processExtraFields(extraHash) + bed.update(extra) + #extraFieldsSet |= extra.keys() + bed["itemRgb"] = getColor(bed["Variant Type"], bed["Clinical Interpretation"]) + bed["_mouseOver"] = getMouseover(bed) + bed["Size"] = chromEnd - chromStart + return bed + +def fixupFieldName(key): + """Capitalize field names and turn '_' to space.""" + if key == "clinical_int": + return "Clinical Interpretation" + return " ".join(s[0].upper() + s[1:] for s in key.replace('_', ' ').split()) + +def processClinGenCnv(inf): + global bedLines + extraHash = {} + # only keep "ssv" records, which effectively ignores the variant_region file + nameMatch = re.compile("^[ne]sv\d") + for line in inf: + if line.startswith('#') or line.startswith('track') or line.startswith('browser'): + continue + trimmed = line.strip() + fields = trimmed.split(maxsplit=8) + extraFields = fields[-1].split(';') + itemName = extraFields[1].split('=')[1] + if nameMatch.match(itemName): + continue + for f in extraFields: + k,v = f.strip().split('=') + fixedName = fixupFieldName(k) + #if fixedName not in extraFieldsSet: + # extraFieldsSet.add(fixedName) + extraHash[fixedName] = v + bedId = itemName + extraHash["Variant Type"] = fields[2] + bedLines[bedId] = makeBedLine(fields[0], int(fields[3]) - 1, int(fields[4]), bedId, extraHash) + dumpBedLines() + +def main(): + args = setupCommandLine() + if args.liftFile: + parseLiftFile(args.liftFile) + if args.infile == "stdin": + processClinGenCnv(sys.stdin) + else: + with open(args.infile) as inf: + processClinGenCnv(inf) + +if __name__=="__main__": + main()