89422546b1bfadf27a115ae774571a5f57a916d8 chmalee Tue Jan 18 14:37:33 2022 -0800 Re-do the v2.1.1 (hg19) and v3.1.1 (hg38) versions of the gnomad tracks after a user found we were incorrectly pre-filtering intergenic variants from the whole genome data sets, refs #28666 diff --git src/hg/makeDb/gnomad/gnomadVcfBedToBigBed src/hg/makeDb/gnomad/gnomadVcfBedToBigBed index 21b8544..30026e5 100755 --- src/hg/makeDb/gnomad/gnomadVcfBedToBigBed +++ src/hg/makeDb/gnomad/gnomadVcfBedToBigBed @@ -1,542 +1,552 @@ #!/cluster/software/bin/python3 """ Helper script to do some gnomAD specific stuff to the vcfToBed output NOTE: This script is dependent on the format of the VEP INFO field in a particular VCF file. Use the -v option to pass the right version options to this script, and if necessary, add a new one. Example format for the v2.1.1 version of gnomAD: ##INFO= """ import sys, argparse, hashlib,json from collections import defaultdict,namedtuple,OrderedDict # which version of gnomAD for parsing VEP string versions = ["v2.1.1", "v3.1", "v3.1_chrM", "v3.1.1"] # the number of fields in the VEP string (depends on version): # how to count: # bcftools view -h in.vcf.gz | grep "^##INFO= 13 else ref displayName += "-" displayName += alt[:10]+"..." if len(alt) > 13 else alt name = hashlib.md5(str.encode(chrom+chromStart+chromEnd+ref+alt)).hexdigest() if version == "v3.1.1" or version == "v3.1_chrM": outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt, rsId, ", ".join(genes.keys()), annot, ",".join(consList), str(unshiftedStart), displayName])) popTable, otherFields = convertExtraFields(maybeExtraFields + [ac, an, af, nhomalt], version) if version == "v3.1_chrM": # otherFields is the haplotype table plus empties for the v3.1.1 fields: otherFields = ["" for i in range(5)] + [json.dumps(otherFields)] if extraFh: extraFh.write("\t".join([name, json.dumps(genes), json.dumps(popTable)] + otherFields)) extraFh.write("\n") else: outfh.write("\t" + "\t".join([name] + [json.dumps(genes), json.dumps(popTable)] + otherFields)) outfh.write("\n") elif version == "v3.1": hgvscList = [", ".join(list(genes[gene]["hgvsc"]))] hgvspList = [", ".join(list(genes[gene]["hgvsp"]))] pLoFList = [", ".join(list(genes[gene]["pLoF"]))] pLoFFlags = [", ".join(list(genes[gene]["Flag"]))] outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt, rsId, gene, annot, *consList, str(unshiftedStart), displayName])) if extraFh: extraFh.write("\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) extraFh.write("\n") else: outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) outfh.write("\n") else: - pLoFCuration = getLofCuration(lofDict, version, bed8Fields[0], + pLoFCuration = getLofCuration(lofDict, version, chrom, str(unshiftedStart), ref, alt) + hgvscList = [x for x in genes[gene]["hgvsc"] for gene in genes] + hgvspList = [x for x in genes[gene]["hgvsp"] for gene in genes] + pLoFList = [x for x in genes[gene]["pLoF"] for gene in genes] + pLoFFlags = [x for x in genes[gene]["Flag"] for gene in genes] outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt, - rsId, gene, annot, *consList, displayName])) - outfh.write("\t" + "\t".join(pLoFCuration, [str(unshiftedStart)])) + rsId, ", ".join(genes.keys()), annot, ", ".join(consList)])) + outfh.write("\t" + str(unshiftedStart)) + outfh.write("\t" + displayName) + outfh.write("\t" + ", ".join(pLoFList)) + outfh.write("\t" + ", ".join(pLoFFlags)) + outfh.write("\t" + "\t".join(pLoFCuration)) if extraFh: - extraFh.write("\t".join(name + hgvscList + hgvspList + maybeExtraFields)) + extraFh.write(", ".join(hgvscList) + "\t" + ", ".join(hgvspList) + "\t" + "\t".join(maybeExtraFields)) extraFh.write("\n") else: - outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields)) + outfh.write("\t" + ", ".join(hgvscList)) + outfh.write("\t" + ", ".join(hgvspList)) + outfh.write("\t" + "\t".join(maybeExtraFields)) outfh.write("\n") def parseLofFile(fpath): """Make a struct of the different loss of function flags for a curated variant.""" gotHeader = False lofHeader = [] ret = {} with open(fpath) as fh: for line in fh: if not gotHeader: lofHeader = line.strip().split("\t") gotHeader = True else: lofDetails = line.strip().split("\t") ret[lofDetails[0]] = {lofHeader[x]: lofDetails[x] for x in range(len(lofHeader))} return ret def main(): args = parseCommandLine() lofDict = {} lofFile = args.lofFilePath extraFh = None if lofFile: lofDict = parseLofFile(lofFile) if args.infile == "stdin": infh = sys.stdin else: infh = open(args.infile) if args.outfile == "stdout": outfh = sys.stdout else: outfh = open(args.outfile, "w") if args.extra_output_file: if args.extra_output_file == "stdout": extraFh = sys.stdout else: extraFh = open(args.extra_output_file, "w") gnomadVcfBedToBigBed(infh, outfh, extraFh, args.version, lofDict) infh.close() outfh.close() if __name__ == "__main__": main()