37befdf6676de0022c60966291e21e5977dba086 chmalee Fri Jun 19 13:46:31 2020 -0700 Adding clingen cnv and dosage otto job to source tree now that it's been running for a while, refs #25562 diff --git src/hg/utils/otto/clinGen/processClinGenDosage.py src/hg/utils/otto/clinGen/processClinGenDosage.py new file mode 100755 index 0000000..feb63e5 --- /dev/null +++ src/hg/utils/otto/clinGen/processClinGenDosage.py @@ -0,0 +1,170 @@ +#!/usr/bin/env python3 + +""" +Read GVF lines from stdin and write bed9+ lines to stdout +""" + +import os,sys,re +import argparse + +bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", + "thickEnd", "itemRgb"] + +# we only use some of these fields depending on whether we are working with haplo or triplo scores +extraFieldList = ["Gene Symbol", "Gene ID", "cytoBand", "Genomic Location", + "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", + "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score", + "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", + "Triplosensitivity PMID3", "Date Last Evaluated", "Loss phenotype OMIM ID", + "Triplosensitive phenotype OMIM ID"] + +scoreExplanation = { + -1: "not yet evaluated", + 0: "no evidence for dosage pathogenicity", + 1: "little evidence for dosage pathogenicity", + 2: "some evidence for dosage pathogenicity", + 3: "sufficient evidence for dosage pathogenicity", + 30: "gene associated with autosomal recessive phenotype", + 40: "haploinsufficiency unlikely" + } + +# hold all the bedlines in memory until the end as different lines have +# different numbers of extra fields +bedLines = {} + +# the phenotypes and other stuff associated with a particular gene +extraInfo = {} + +def setupCommandLine(): + parser = argparse.ArgumentParser(description="Read bed lines from infile, join with extraFile and transform to bed9+", + add_help=True, usage = "%(prog)s [options]") + parser.add_argument("infile", action="store", default=None, help="Input bed file from which to read input, use 'stdin' to read from default stdin") + parser.add_argument("extraFile", action="store", default=None, help="a ClinGen_gene_curation_list file with extra info for the records in the infile") + parser.add_argument("dosageType", action="store", default=None, help="haplo or triplo, which extraFields to use for this bed record") + args = parser.parse_args() + if args.infile == "stdin" and args.extraFile == "stdin": + print("Only one of infile or extraFile can be stdin") + sys.exit(1) + return args + +def parseExtraFile(fname, dosageType): + global extraInfo + oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss" + fieldMatch = re.compile(oppMatchStr,re.I) + with open(fname) as f: + for line in f: + if line.startswith("#"): + continue + l = line.strip("\n").split("\t") + geneSymbol = l[0] + extra = l[1:] + try: + if extraInfo[geneSymbol]: + print("duplicate info for gene symbol: '%s'" % (geneSymbol)) + sys.exit(1) + except KeyError: + extraInfo[geneSymbol] = {} + for keyName,ix in zip(extraFieldList[1:],range(len(extraFieldList[1:]))): + try: + if fieldMatch.match(keyName): + continue + extraInfo[geneSymbol][keyName] = extra[ix] + except IndexError: + sys.exit(1) + +def getColor(bed, dosageType): + """Return the shade of the item according to it's variant type.""" + if dosageType == "haplo": + if bed["score"] == -1: return "128,128,128" + if bed["score"] == 0: return "191,191,191" + if bed["score"] == 1: return "191,115,115" + if bed["score"] == 2: return "191,93,96" + if bed["score"] == 3: return "255,51,0" + if bed["score"] == 30: return "160,136,246" + if bed["score"] == 40: return "255,204,204" + if dosageType == "triplo": + if bed["score"] == -1: return "128,128,128" + if bed["score"] == 0: return "191,191,191" + if bed["score"] == 1: return "180,198,231" + if bed["score"] == 2: return "142,169,219" + if bed["score"] == 3: return "48,84,150" + if bed["score"] == 30: return "160,136,246" + if bed["score"] == 40: return "217,255,242" + +def getMouseover(bed, dosageType): + """Return the mouseOver string for this bed record.""" + ret = "Gene: %s" % bed["name"] + if dosageType == "triplo": + ret += ", Triplosensitivity: " + else: + ret += ", Haplosensitivity: " + ret += "%s - %s" % (bed["score"], scoreExplanation[bed["score"]]) + return ret + +def dumpBedLines(dosageType): + """Write out the bed lines, with the same number of fields in each line.""" + oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss" + extraFields = [x for x in extraFieldList if not re.match(oppMatchStr, x, re.I) and x != "Gene Symbol" and x != "Genomic Location"] + fields = bed9Fields + extraFields+ ["_mouseOver"] + print("#%s" % ("\t".join(fields))) + for b in bedLines: + bed = bedLines[b] + finalBed = [] + for field in fields: + try: + if type(bed[field]) is list: + finalBed.append(", ".join(bed[field])) + else: + finalBed.append(str(bed[field])) + except KeyError: # some of the extra fields won't exist for every record + finalBed.append("") + print("\t".join(finalBed)) + +def makeBedLine(chrom, chromStart, chromEnd, name, score, dosageType): + """Turn a single gvf line into a bed 9+ line.""" + bed = {} + bed["chrom"] = chrom + bed["chromStart"] = chromStart + bed["chromEnd"] = chromEnd + bed["name"] = name + bed["score"] = score + bed["strand"] = "." + bed["thickStart"] = chromStart + bed["thickEnd"] = chromEnd + bed["itemRgb"] = getColor(bed, dosageType) + bed["Size"] = chromEnd - chromStart + extra = extraInfo[name] + bed.update(extra) + bed["_mouseOver"] = getMouseover(bed,dosageType) + return bed + +def processClinGenDosage(inf, dosageType): + global bedLines + for line in inf: + if line.startswith('#') or line.startswith('track') or line.startswith('browser'): + continue + trimmed = line.strip() + chrom, chromStart, chromEnd, name, score = trimmed.split("\t") + try: + bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, int(score), dosageType) + except: + if score.startswith("Not"): + bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, -1, dosageType) + else: + print(sys.exc_info()[0]) + sys.stderr.write("bad input line:\n%s\n" % line) + sys.exit(1) + dumpBedLines(dosageType) + +def main(): + args = setupCommandLine() + if args.extraFile: + parseExtraFile(args.extraFile, args.dosageType) + if args.infile == "stdin": + processClinGenDosage(sys.stdin,args.dosageType) + else: + with open(args.infile) as inf: + processClinGenDosage(inf, args.dosageType) + +if __name__=="__main__": + main()