4bd3913b145415c4d24d5c6b2fa9ee5b966257c5 chmalee Fri Jul 31 11:44:52 2020 -0700 More clinGen dosage changes after extrernal feedback, refs #24818 diff --git src/hg/utils/otto/clinGen/processClinGenDosage.py src/hg/utils/otto/clinGen/processClinGenDosage.py index acd48c7..c40a161 100755 --- src/hg/utils/otto/clinGen/processClinGenDosage.py +++ src/hg/utils/otto/clinGen/processClinGenDosage.py @@ -1,180 +1,217 @@ #!/usr/bin/env python3 """ -Read GVF lines from stdin and write bed9+ lines to stdout +Turn ClinGen gene_curation_list and region_curation_list into +two bed files, one for haploinsufficient and one for triplosensitive """ -import os,sys,re +import os,sys import argparse -bed9Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", - "thickEnd", "itemRgb"] +bed8Fields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", + "thickEnd"] # we only use some of these fields depending on whether we are working with haplo or triplo scores -extraFieldList = ["Gene Symbol", "Gene ID", "cytoBand", "Genomic Location", - "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", - "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score", - "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", - "Triplosensitivity PMID3", "Date Last Evaluated", "Loss phenotype OMIM ID", - "Triplosensitive phenotype OMIM ID"] +commonFields = ["cytoBand", "Genomic Location", "Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", "Haploinsufficiency PMID2", "Haploinsufficiency PMID3", "Triplosensitivity Score", "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", "Triplosensitivity PMID3", "Date Last Evaluated", "OMIM ID", "OMIM Description"] +haploFields = ["Haploinsufficiency Score", "Haploinsufficiency Description", "Haploinsufficiency PMID1", "Haploinsufficiency PMID2", "Haploinsufficiency PMID3"] +triploFields = ["Triplosensitivity Score", "Triplosensitivity Description", "Triplosensitivity PMID1", "Triplosensitivity PMID2", "Triplosensitivity PMID3"] +geneListFields = ["Gene Symbol", "Gene ID"] +regionListFields = ["ISCA ID", "ISCA Region Name"] + +# the gene and region files get merged: +combinedFields = ["Gene Symbol/ISCA ID", "Gene ID/ISCA Region Name"] + +clinGenGeneUrl = "https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen/clingen_gene.cgi?sym=" +clinGenIscaUrl = "https://www.ncbi.nlm.nih.gov/projects/dbvar/clingen/clingen_region.cgi?id=" scoreExplanation = { -1: "not yet evaluated", 0: "no evidence for dosage pathogenicity", 1: "little evidence for dosage pathogenicity", 2: "some evidence for dosage pathogenicity", 3: "sufficient evidence for dosage pathogenicity", 30: "gene associated with autosomal recessive phenotype", 40: "haploinsufficiency unlikely" } -# hold all the bedlines in memory until the end as different lines have -# different numbers of extra fields -bedLines = {} - -# the phenotypes and other stuff associated with a particular gene -extraInfo = {} +# the input data keyed by "Gene Symbol" or "ISCA ID", and further keyed by each field name +inData = {} def setupCommandLine(): - parser = argparse.ArgumentParser(description="Read bed lines from infile, join with extraFile and transform to bed9+", + parser = argparse.ArgumentParser(description="Transform ClinGen region and gene lists into a bed file", add_help=True, usage = "%(prog)s [options]") - parser.add_argument("infile", action="store", default=None, help="Input bed file from which to read input, use 'stdin' to read from default stdin") - parser.add_argument("extraFile", action="store", default=None, help="a ClinGen_gene_curation_list file with extra info for the records in the infile") - parser.add_argument("dosageType", action="store", default=None, help="haplo or triplo, which extraFields to use for this bed record") + parser.add_argument("regionFile", action="store", default=None, help="a ClinGen_region_curation_list file") + parser.add_argument("geneFile", action="store", default=None, help="a ClinGen_gene_curation_list file") + parser.add_argument("outFileBase", action="store", default=None, help="the base file name for the two output files: outFileBase.haplo.bed and outFileBase.triplo.bed will be created.") args = parser.parse_args() - if args.infile == "stdin" and args.extraFile == "stdin": - print("Only one of infile or extraFile can be stdin") + if args.regionFile == "stdin" and args.geneFile == "stdin": + print("Only one of regionFile or geneFile can be stdin") sys.exit(1) return args -def parseExtraFile(fname, dosageType): - global extraInfo - oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss" - fieldMatch = re.compile(oppMatchStr,re.I) - with open(fname) as f: - for line in f: - if line.startswith("#"): - continue - l = line.strip("\n").split("\t") - geneSymbol = l[0] - extra = l[1:] - try: - if extraInfo[geneSymbol]: - print("duplicate info for gene symbol: '%s'" % (geneSymbol)) - sys.exit(1) - except KeyError: - extraInfo[geneSymbol] = {} - for keyName,ix in zip(extraFieldList[1:],range(len(extraFieldList[1:]))): +def getColor(bed): + """Return the shade of the item according to ClinGen rules. Red or blue for score 3, everything + else gray.""" + retColors = {"haplo": "128,128,128", "triplo": "128,128,128"} + if bed["Haploinsufficiency Score"] == "3": retColors["haplo"] = "255,0,0" + if bed["Triplosensitivity Score"] == "3": retColors["triplo"] = "0,255,0" + return retColors + +def getMouseover(bed): + """Return the mouseOver strings for this bed record.""" + hapScore = bed["Haploinsufficiency Score"] + tripScore = bed["Triplosensitivity Score"] + ret = {"haplo": "", "triplo": ""} + mouseOver = "Gene/ISCA ID: %s" % bed["name"] + ret["triplo"] = mouseOver + ", Triplosensitivity: " + ret["triplo"] += "%s - %s" % (tripScore, scoreExplanation[int(tripScore)] if tripScore and tripScore != "Not yet evaluated" else -1) + ret["haplo"] = mouseOver + ", Haploinsufficiency: " + ret["haplo"] += "%s - %s" % (hapScore, scoreExplanation[int(hapScore)] if hapScore and hapScore != "Not yet evaluated" else -1) + return ret + +def parsePosition(posStr): + """Turn a chr:start-stop string into bed chrom, chromStart, chromEnd""" try: - if fieldMatch.match(keyName): - continue - extraInfo[geneSymbol][keyName] = extra[ix] + colonDelim = posStr.split(":") + chrom = colonDelim[0] + dashDelim = colonDelim[1].split("-") + start = int(dashDelim[0]) - 1 + end = int(dashDelim[1]) except IndexError: + sys.stderr.write("bad position string: %s\n" % posStr) sys.exit(1) + return (chrom, start, end) -def getColor(bed, dosageType): - """Return the shade of the item according to it's variant type.""" - if dosageType == "haplo": - if bed["score"] == -1: return "218,215,215" - if bed["score"] == 0: return "130,128,128" - if bed["score"] == 1: return "232,104,111" - if bed["score"] == 2: return "218,44,55" - if bed["score"] == 3: return "180,3,16" - if bed["score"] == 30: return "137,93,189" - if bed["score"] == 40: return "238,146,148" - if dosageType == "triplo": - if bed["score"] == -1: return "218,215,215" - if bed["score"] == 0: return "130,128,128" - if bed["score"] == 1: return "88,131,211" - if bed["score"] == 2: return "41,78,174" - if bed["score"] == 3: return "17,44,138" - if bed["score"] == 30: return "137,93,189" - if bed["score"] == 40: return "122,165,211" - -def getMouseover(bed, dosageType): - """Return the mouseOver string for this bed record.""" - ret = "Gene: %s" % bed["name"] - if dosageType == "triplo": - ret += ", Triplosensitivity: " +def makeBedLines(): + """Turn inData into bed 9+""" + bedList = [] + for key in inData: + data = inData[key] + bed = {} + chrom, start, end = parsePosition(data["Genomic Location"]) + bed["chrom"] = chrom + bed["chromStart"] = start + bed["chromEnd"] = end + bed["name"] = key + bed["score"] = 0 + bed["strand"] = "." + bed["thickStart"] = bed["chromStart"] + bed["thickEnd"] = bed["chromEnd"] + bed["Size"] = bed["chromEnd"] - bed["chromStart"] + bed["Gene Symbol/ISCA ID"] = key + extra = inData[key] + bed.update(extra) + bed["itemRgb"] = getColor(bed) # dict of colors + bed["_mouseOver"] = getMouseover(bed) # dict of mouseovers + bedList.append(bed) + return bedList + +def dumpBedLines(ofBase): + """For each line in makeBedLines(), write to two files: ofBase.haplo.bed and ofBase.triplo.bed""" + bedLines = makeBedLines() + combined = combinedFields + commonFields + extraFields = ["clinGen URL"] + [x for x in combined if x != "Genomic Location"] + + # some fields differ depending on which file we are writing to, so remove them: + for field in haploFields + triploFields: + extraFields.remove(field) + + if not ofBase.endswith("."): + ofBase += "." + haploOfh = ofBase + "haplo.bed" + triploOfh = ofBase + "triplo.bed" + haploFile = open(haploOfh, "w") + triploFile = open(triploOfh, "w") + print("#%s" % ("\t".join(bed8Fields + ["itemRgb"] + extraFields + haploFields + ["_mouseOver"])), file=haploFile) + print("#%s" % ("\t".join(bed8Fields + ["itemRgb"] + extraFields + triploFields + ["_mouseOver"])), file=triploFile) + for bed in bedLines: + finalBed = [] + haploBed = [] + triploBed = [] + + # first the standard bed fields + for field in bed8Fields: + if type(bed[field]) is list: + haploBed.append(bed[field]) + triploBed.append(", ".join(bed[field])) else: - ret += ", Haplosensitivity: " - ret += "%s - %s" % (bed["score"], scoreExplanation[bed["score"]]) - return ret + haploBed.append(str(bed[field])) + triploBed.append(str(bed[field])) -def dumpBedLines(dosageType): - """Write out the bed lines, with the same number of fields in each line.""" - oppMatchStr = "triplo" if dosageType == "haplo" else "haplo|loss" - extraFields = [x for x in extraFieldList if not re.match(oppMatchStr, x, re.I) and x != "Gene Symbol" and x != "Genomic Location"] - fields = bed9Fields + extraFields+ ["_mouseOver"] - print("#%s" % ("\t".join(fields))) - for b in bedLines: - bed = bedLines[b] - finalBed = [] - for field in fields: + # then the common extra fields: + haploBed.append(str(bed["itemRgb"]["haplo"])) + triploBed.append(str(bed["itemRgb"]["triplo"])) + for field in extraFields: try: + if field == "Gene ID/ISCA Region Name": + if "Gene ID" in bed: + haploBed.append(str(bed["Gene ID"])) + triploBed.append(str(bed["Gene ID"])) + else: + haploBed.append(str(bed["ISCA Region Name"])) + triploBed.append(str(bed["ISCA Region Name"])) + continue if type(bed[field]) is list: - finalBed.append(", ".join(bed[field])) + haploBed.append(", ".join(bed[field])) + triploBed.append(", ".join(bed[field])) else: - finalBed.append(str(bed[field])) - except KeyError: # some of the extra fields won't exist for every record - finalBed.append("") - print("\t".join(finalBed)) + haploBed.append(str(bed[field])) + triploBed.append(str(bed[field])) + except KeyError: + sys.stderr.write("ill formed bed:\n%s\n" % (bed)) + sys.exit(1) -def makeBedLine(chrom, chromStart, chromEnd, name, score, dosageType): - """Turn a single gvf line into a bed 9+ line.""" - bed = {} - bed["chrom"] = chrom - bed["chromStart"] = chromStart - bed["chromEnd"] = chromEnd - bed["name"] = name - bed["score"] = score - bed["strand"] = "." - bed["thickStart"] = chromStart - bed["thickEnd"] = chromEnd - bed["itemRgb"] = getColor(bed, dosageType) - bed["Size"] = chromEnd - chromStart - extra = extraInfo[name] - bed.update(extra) - bed["_mouseOver"] = getMouseover(bed,dosageType) - return bed - -def processClinGenDosage(inf, dosageType): - global bedLines - lineCount = 1 - for line in inf: - if line.startswith('#') or line.startswith('track') or line.startswith('browser'): - lineCount += 1 - continue - trimmed = line.strip() - try: - chrom, chromStart, chromEnd, name, score = trimmed.split("\t") - except ValueError: - sys.stderr.write("Error: ignoring ill formatted bed line %s:%d\n" % (inf.name, lineCount)) - lineCount += 1 + # then the differing extra fields: + haploBed += [bed[field] for field in haploFields] + [bed["_mouseOver"]["haplo"]] + triploBed += [bed[field] for field in triploFields] + [bed ["_mouseOver"]["triplo"]] + print("\t".join(haploBed), file=haploFile) + print("\t".join(triploBed), file=triploFile) + haploFile.close() + triploFile.close() + +def parseInFile(fname, fileType="gene"): + if fname != "stdin": + f = open(fname) + lineNumber = 1 + for line in f: + lineNumber += 1 + bizarreRecord = False + if line.startswith("#"): continue - try: - bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, int(score), dosageType) - except: - # error here comes from something to do with the associated gene_curation_list and not - # the dosage file itself - if score.startswith("Not"): - bedLines[name] = makeBedLine(chrom, int(chromStart), int(chromEnd), name, -1, dosageType) + l = line.strip("\n").split("\t") + key = l[0] # geneSymbol or ISCA id + extra = l[1:] + if key in inData: + print("duplicate info for gene symbol: '%s'" % (geneSymbol)) + sys.exit(1) else: - print(sys.exc_info()[0]) - sys.stderr.write("bad input line:\n%s\n" % line) + temp = {} + fileFields = [] + if fileType == "gene": + fileFields = geneListFields + commonFields + else: + fileFields = regionListFields + commonFields + for keyName,ix in zip(fileFields[1:],range(len(fileFields[1:]))): + try: + if keyName == "Genomic Location" and extra[ix] == "tbd": + bizarreRecord = True + temp[keyName] = extra[ix] + except IndexError: + sys.stderr.write("error line %d or file: %s\n" % (lineNumber, fname)) sys.exit(1) - lineCount += 1 - dumpBedLines(dosageType) + if not bizarreRecord: + if fileType == "gene": + temp["clinGen URL"] = clinGenGeneUrl + key + else: + temp["clinGen URL"] = clinGenIscaUrl + key + inData[key] = temp + f.close() def main(): args = setupCommandLine() - if args.extraFile: - parseExtraFile(args.extraFile, args.dosageType) - if args.infile == "stdin": - processClinGenDosage(sys.stdin,args.dosageType) - else: - with open(args.infile) as inf: - processClinGenDosage(inf, args.dosageType) + parseInFile(args.geneFile, "gene") + parseInFile(args.regionFile, "region") + dumpBedLines(args.outFileBase) if __name__=="__main__": main()