89422546b1bfadf27a115ae774571a5f57a916d8 chmalee Tue Jan 18 14:37:33 2022 -0800 Re-do the v2.1.1 (hg19) and v3.1.1 (hg38) versions of the gnomad tracks after a user found we were incorrectly pre-filtering intergenic variants from the whole genome data sets, refs #28666 diff --git src/hg/makeDb/gnomad/gnomadVcfBedToBigBed src/hg/makeDb/gnomad/gnomadVcfBedToBigBed index 21b8544..30026e5 100755 --- src/hg/makeDb/gnomad/gnomadVcfBedToBigBed +++ src/hg/makeDb/gnomad/gnomadVcfBedToBigBed @@ -99,46 +99,46 @@ "v3.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", "_startPos", "displayName"], "v3.1_chrM": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", "_startPos", "displayName"], "v3.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", "_startPos", "displayName"], "v2.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", - "pLoF", "lofFlags", "lofCuration", "pLoFCurationFlags", "_startPos", "displayName"] + "_startPos", "displayName", "pLoF", "lofFlags", "lofCuration", "pLoFCurationFlags"] } # the fields in the extra tab file, specify order here so we have the same order # across different runs for different chromosomes versionExtraFields = { "v3.1.1": ["_key", "_jsonVep", "_jsonPopTable", "cadd_phred", "revel_score", "splice_ai_max_ds", "splice_ai_consequence", "primate_ai_score"], "v3.1_chrM": ["_key", "_jsonVep", "_jsonPopTable", "_jsonHapTable"], "v3.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "ami", "amr", "asj", "eas", "fin", "mid", "nfe", "sas", "oth", "XX", "XY"] for x in ["AC", "AN", "AF", "nhomalt"]], "cadd_phred", "revel_score", "splice_ai_max_ds", "splice_ai_consequence", "primate_ai_score"], - "v2.1.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "amr", - "asj", "eas", "fin", "nfe", "oth", "female", "male"] for x in ["AC", "AN", "AF", - "nhomalt"]]] + "v2.1.1": ["hgvsc", "hgvsp", "popmax", "AC_popmax", "AN_popmax", "AF_popmax", + *[x + "_" + y for y in ["afr", "amr", "asj", "eas", "fin", "nfe", "oth", + "female", "male"] for x in ["AC", "AN", "AF", "nhomalt"]]] } # determines the color of the variant in the browser plofTypes = ["frameshift", "stop gained", "splice donor", "splice acceptor"] missenseTypes = ["missense", "inframe deletion", "inframe insertion", "start lost", "stop list"] synTypes = ["synonymous"] # for printing the name of the popmax: popAbbr = {"afr": "African/African American", "amr": "Latino/Admixed American", "asj": "Ashkenazi Jewish", "eas": "East Asian", "fin": "Finnish", "mid": "Middle Eastern", "ami": "Amish", "nfe": "Non-Finnish European", "sas": "South Asian", "oth": "Other (population not assigned)"} lofDesc = {"HC": "High-confidence", "OS": "Other Splice (beta)", "LC": "Low-confidence"} @@ -167,31 +167,31 @@ if args.extra_output_file and args.extra_output_file == "stdout" and args.extra_output_file == args.infile: sys.stderr.write("ERROR: Only one of infile and --extra-output-file can be stdout.") sys.exit(255) return args def writeHeaders(version, outfh, extrafh): """Write out the field names""" outfh.write("#" + "\t".join(x for x in versionAutoSql[version])) if extrafh: if version == "v3.1.1" or version == "v3.1_chrM": extrafh.write("#" + "\t".join(versionExtraFields["v3.1.1"] + ["_jsonHapTable"])) else: extrafh.write("#" + "\t".join(versionExtraFields[version])) extrafh.write("\n") else: - outfh.write("\t" + "#" + "\t".join(versionExtraFields[version])) + outfh.write("\t" + "\t".join(versionExtraFields[version])) outfh.write("\n") def parseHeader(headerStr, infh, outfh, extraFh): """Parse the field names out of a header string""" global header headerStr = headerStr[1:] # chop off '#' fields = headerStr.strip('\n').split('\t') if not header: header = fields outfh.write("#%s\n" % ("\t".join(header[:17] + ["rsID","genes","annot","variation_type","_startPos"] + ["hgvsc", "hgvsp"]+ header[18:]))) elif fields != header: sys.stderr.write("Header differs from others: '%s'\n" % infh.name) sys.exit(1) @@ -347,38 +347,38 @@ #assert(len(thisElem) == numFields) ret.append("|".join(thisElem)) return ret def parseVep(vep, version): """Return a more compacted version of the VEP annotations for a variant, organized by the affected genes""" genes = defaultdict(dict) annotList = splitVepField(vep, version) for annot in annotList: group = annot.split('|') if "&" in group[1]: consList = group[1].split('&') else: consList = [group[1]] - if "regulatory_region_variant" in consList \ - or "downstream_gene_variant" in consList \ - or "upstream_gene_variant" in consList \ - or "TF_binding_site_variant" in consList \ - or "intergenic_variant" in consList: - continue + #if "regulatory_region_variant" in consList \ + # or "downstream_gene_variant" in consList \ + # or "upstream_gene_variant" in consList \ + # or "TF_binding_site_variant" in consList \ + # or "intergenic_variant" in consList: + # continue vepFields = dict(zip(versionVepFields[version],group)) - gene = vepFields["SYMBOL"] + gene = vepFields["SYMBOL"] if vepFields["SYMBOL"] != "" else "N/A" hgvsc = [vepFields["HGVSc"]] if vepFields["HGVSc"] != "" else None hgvsp = [vepFields["HGVSp"]] if vepFields["HGVSp"] != "" else None lof = [vepFields["LoF"]] if vepFields["LoF"] != "" else None lofFlags = [vepFields["LoF_flags"]] if vepFields["LoF_flags"] != "" else None lofFilter = [vepFields["LoF_filter"]] if vepFields["LoF_filter"] != "" else None if lof: try: for i,x in enumerate(lof): if x == "LC": lof[i] = "Low Confidence (%s)" % lofFilter[i] else: lof[i] = lofDesc[x] except KeyError: sys.stderr.write("ERROR parsing lof information:\n") sys.stderr.write("lof: %s\n" % lof) @@ -431,31 +431,31 @@ maybeExtraFields = [x if x else "N/A" for x in fixedLine[18:]] else: vep = fixedLine[12] # ac and af are really AC_hom/AC_het and AF_hom/AF_het ac = fixedLine[14] + "/" + fixedLine[15] af = fixedLine[18] + "/" + fixedLine[19] an = fixedLine[13] faf95 = "" nhomalt = "" maybeExtraFields = fixedLine[16:18] + fixedLine[20:] if vep != "": genes = parseVep(vep, version) # Now that we've parsed all the incoming fields from vcfToBed, write # output to final bed, and maybe to extra tab file: - if genes: + #if genes: consList = [] for gene in genes: consList += list(genes[gene]["cons"]) annot,color = pickColor(consList) rsId = "" if bedName.startswith("rs"): rsId = bedName unshiftedStart = unshiftLeftPad(int(chromStart), ref, alt) displayName = chrom + "-" + str(unshiftedStart) + "-" displayName += ref[:10]+"..." if len(ref) > 13 else ref displayName += "-" displayName += alt[:10]+"..." if len(alt) > 13 else alt name = hashlib.md5(str.encode(chrom+chromStart+chromEnd+ref+alt)).hexdigest() if version == "v3.1.1" or version == "v3.1_chrM": outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, @@ -474,41 +474,51 @@ elif version == "v3.1": hgvscList = [", ".join(list(genes[gene]["hgvsc"]))] hgvspList = [", ".join(list(genes[gene]["hgvsp"]))] pLoFList = [", ".join(list(genes[gene]["pLoF"]))] pLoFFlags = [", ".join(list(genes[gene]["Flag"]))] outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt, rsId, gene, annot, *consList, str(unshiftedStart), displayName])) if extraFh: extraFh.write("\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) extraFh.write("\n") else: outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) outfh.write("\n") else: - pLoFCuration = getLofCuration(lofDict, version, bed8Fields[0], + pLoFCuration = getLofCuration(lofDict, version, chrom, str(unshiftedStart), ref, alt) + hgvscList = [x for x in genes[gene]["hgvsc"] for gene in genes] + hgvspList = [x for x in genes[gene]["hgvsp"] for gene in genes] + pLoFList = [x for x in genes[gene]["pLoF"] for gene in genes] + pLoFFlags = [x for x in genes[gene]["Flag"] for gene in genes] outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt, - rsId, gene, annot, *consList, displayName])) - outfh.write("\t" + "\t".join(pLoFCuration, [str(unshiftedStart)])) + rsId, ", ".join(genes.keys()), annot, ", ".join(consList)])) + outfh.write("\t" + str(unshiftedStart)) + outfh.write("\t" + displayName) + outfh.write("\t" + ", ".join(pLoFList)) + outfh.write("\t" + ", ".join(pLoFFlags)) + outfh.write("\t" + "\t".join(pLoFCuration)) if extraFh: - extraFh.write("\t".join(name + hgvscList + hgvspList + maybeExtraFields)) + extraFh.write(", ".join(hgvscList) + "\t" + ", ".join(hgvspList) + "\t" + "\t".join(maybeExtraFields)) extraFh.write("\n") else: - outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) + outfh.write("\t" + ", ".join(hgvscList)) + outfh.write("\t" + ", ".join(hgvspList)) + outfh.write("\t" + "\t".join(maybeExtraFields)) outfh.write("\n") def parseLofFile(fpath): """Make a struct of the different loss of function flags for a curated variant.""" gotHeader = False lofHeader = [] ret = {} with open(fpath) as fh: for line in fh: if not gotHeader: lofHeader = line.strip().split("\t") gotHeader = True else: lofDetails = line.strip().split("\t") ret[lofDetails[0]] = {lofHeader[x]: lofDetails[x] for x in range(len(lofHeader))}