src/hg/makeDb/gnomad/gnomadVcfBedToBigBed 89422546b1bfadf27a115ae774571a5f57a916d8

89422546b1bfadf27a115ae774571a5f57a916d8
chmalee
  Tue Jan 18 14:37:33 2022 -0800
Re-do the v2.1.1 (hg19) and v3.1.1 (hg38) versions of the gnomad tracks after a user found we were incorrectly pre-filtering intergenic variants from the whole genome data sets, refs #28666

diff --git src/hg/makeDb/gnomad/gnomadVcfBedToBigBed src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
index 21b8544..30026e5 100755
--- src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
+++ src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
@@ -99,46 +99,46 @@
     "v3.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
         "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC",
         "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type",
         "_startPos", "displayName"],
     "v3.1_chrM": ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
         "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC",
         "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type",
         "_startPos", "displayName"],
     "v3.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
         "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC",
         "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type",
         "_startPos", "displayName"],
     "v2.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
         "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC",
         "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type",
-        "pLoF", "lofFlags", "lofCuration", "pLoFCurationFlags", "_startPos", "displayName"]
+        "_startPos", "displayName", "pLoF", "lofFlags", "lofCuration", "pLoFCurationFlags"]
 }
 
 # the fields in the extra tab file, specify order here so we have the same order
 # across different runs for different chromosomes
 versionExtraFields = {
     "v3.1.1": ["_key", "_jsonVep", "_jsonPopTable", "cadd_phred", "revel_score", "splice_ai_max_ds",
         "splice_ai_consequence", "primate_ai_score"],
     "v3.1_chrM": ["_key", "_jsonVep", "_jsonPopTable", "_jsonHapTable"],
     "v3.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "ami", "amr",
         "asj", "eas", "fin", "mid", "nfe", "sas", "oth", "XX", "XY"] for x in ["AC", "AN",
         "AF", "nhomalt"]], "cadd_phred", "revel_score", "splice_ai_max_ds",
         "splice_ai_consequence", "primate_ai_score"],
-    "v2.1.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "amr",
-        "asj", "eas", "fin", "nfe", "oth", "female", "male"] for x in ["AC", "AN", "AF",
-        "nhomalt"]]]
+    "v2.1.1": ["hgvsc", "hgvsp", "popmax", "AC_popmax", "AN_popmax", "AF_popmax",
+        *[x + "_" + y for y in ["afr", "amr", "asj", "eas", "fin", "nfe", "oth",
+        "female", "male"] for x in ["AC", "AN", "AF", "nhomalt"]]]
 }
 
 # determines the color of the variant in the browser
 plofTypes = ["frameshift", "stop gained", "splice donor", "splice acceptor"]
 missenseTypes = ["missense", "inframe deletion", "inframe insertion", "start lost", "stop list"]
 synTypes = ["synonymous"]
 
 # for printing the name of the popmax:
 popAbbr = {"afr": "African/African American", "amr": "Latino/Admixed American",
     "asj": "Ashkenazi Jewish", "eas": "East Asian", "fin": "Finnish",
     "mid": "Middle Eastern", "ami": "Amish",
     "nfe": "Non-Finnish European", "sas": "South Asian", "oth": "Other (population not assigned)"}
 
 lofDesc = {"HC": "High-confidence", "OS": "Other Splice (beta)", "LC": "Low-confidence"}
 
@@ -167,31 +167,31 @@
     if args.extra_output_file and args.extra_output_file == "stdout" and args.extra_output_file == args.infile:
         sys.stderr.write("ERROR: Only one of infile and --extra-output-file can be stdout.")
         sys.exit(255)
     return args
 
 def writeHeaders(version, outfh, extrafh):
     """Write out the field names"""
     outfh.write("#" + "\t".join(x for x in versionAutoSql[version]))
     if extrafh:
         if version == "v3.1.1" or version == "v3.1_chrM":
             extrafh.write("#" + "\t".join(versionExtraFields["v3.1.1"] + ["_jsonHapTable"]))
         else:
             extrafh.write("#" + "\t".join(versionExtraFields[version]))
         extrafh.write("\n")
     else:
-        outfh.write("\t" + "#" + "\t".join(versionExtraFields[version]))
+        outfh.write("\t" + "\t".join(versionExtraFields[version]))
     outfh.write("\n")
 
 def parseHeader(headerStr, infh, outfh, extraFh):
     """Parse the field names out of a header string"""
     global header
     headerStr = headerStr[1:] # chop off '#'
     fields = headerStr.strip('\n').split('\t')
     if not header:
         header = fields
         outfh.write("#%s\n" % ("\t".join(header[:17] + 
             ["rsID","genes","annot","variation_type","_startPos"] +
             ["hgvsc", "hgvsp"]+ header[18:])))
     elif fields != header:
         sys.stderr.write("Header differs from others: '%s'\n" % infh.name)
         sys.exit(1)
@@ -347,38 +347,38 @@
             #assert(len(thisElem) == numFields)
             ret.append("|".join(thisElem))
     return ret
 
 def parseVep(vep, version):
     """Return a more compacted version of the VEP annotations for a variant,
         organized by the affected genes"""
     genes = defaultdict(dict)
     annotList = splitVepField(vep, version)
     for annot in annotList:
         group = annot.split('|')
         if "&" in group[1]:
             consList = group[1].split('&')
         else:
             consList = [group[1]]
-        if "regulatory_region_variant" in consList \
-                or "downstream_gene_variant" in consList \
-                or "upstream_gene_variant" in consList \
-                or "TF_binding_site_variant" in consList \
-                or "intergenic_variant" in consList:
-            continue
+        #if "regulatory_region_variant" in consList \
+        #        or "downstream_gene_variant" in consList \
+        #        or "upstream_gene_variant" in consList \
+        #        or "TF_binding_site_variant" in consList \
+        #        or "intergenic_variant" in consList:
+        #    continue
         vepFields = dict(zip(versionVepFields[version],group)) 
-        gene = vepFields["SYMBOL"]
+        gene = vepFields["SYMBOL"] if vepFields["SYMBOL"] != "" else "N/A"
         hgvsc = [vepFields["HGVSc"]] if vepFields["HGVSc"] != "" else None
         hgvsp = [vepFields["HGVSp"]] if vepFields["HGVSp"] != "" else None
         lof = [vepFields["LoF"]] if vepFields["LoF"] != "" else None
         lofFlags = [vepFields["LoF_flags"]] if vepFields["LoF_flags"] != "" else None
         lofFilter = [vepFields["LoF_filter"]] if vepFields["LoF_filter"] != "" else None
         if lof:
             try:
                 for i,x in enumerate(lof):
                     if x == "LC":
                         lof[i] = "Low Confidence (%s)" % lofFilter[i]
                     else:
                         lof[i] = lofDesc[x]
             except KeyError:
                 sys.stderr.write("ERROR parsing lof information:\n")
                 sys.stderr.write("lof: %s\n" % lof)
@@ -431,31 +431,31 @@
             maybeExtraFields = [x if x else "N/A" for x in fixedLine[18:]]
         else:
             vep = fixedLine[12]
             # ac and af are really AC_hom/AC_het and AF_hom/AF_het
             ac = fixedLine[14] + "/" + fixedLine[15]
             af = fixedLine[18] + "/" + fixedLine[19]
             an = fixedLine[13]
             faf95 = ""
             nhomalt = ""
             maybeExtraFields = fixedLine[16:18] + fixedLine[20:]
         if vep != "":
             genes = parseVep(vep, version)
 
         # Now that we've parsed all the incoming fields from vcfToBed, write
         # output to final bed, and maybe to extra tab file:
-        if genes:
+        #if genes:
         consList = []
         for gene in genes:
             consList += list(genes[gene]["cons"])
         annot,color = pickColor(consList)
         rsId = ""
         if bedName.startswith("rs"):
             rsId = bedName
         unshiftedStart = unshiftLeftPad(int(chromStart), ref, alt)
         displayName = chrom + "-" + str(unshiftedStart) + "-"
         displayName += ref[:10]+"..." if len(ref) > 13 else ref
         displayName += "-"
         displayName += alt[:10]+"..." if len(alt) > 13 else alt
         name = hashlib.md5(str.encode(chrom+chromStart+chromEnd+ref+alt)).hexdigest()
         if version == "v3.1.1" or version == "v3.1_chrM":
             outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
@@ -474,41 +474,51 @@
         elif version == "v3.1":
             hgvscList = [", ".join(list(genes[gene]["hgvsc"]))]
             hgvspList = [", ".join(list(genes[gene]["hgvsp"]))]
             pLoFList = [", ".join(list(genes[gene]["pLoF"]))]
             pLoFFlags = [", ".join(list(genes[gene]["Flag"]))]
             outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
                 thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt,
                 rsId, gene, annot, *consList, str(unshiftedStart), displayName]))
             if extraFh:
                 extraFh.write("\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
                 extraFh.write("\n")
             else:
                 outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
             outfh.write("\n")
         else:
-                pLoFCuration = getLofCuration(lofDict, version, bed8Fields[0],
+            pLoFCuration = getLofCuration(lofDict, version, chrom,
                 str(unshiftedStart), ref, alt)
+            hgvscList = [x for x in genes[gene]["hgvsc"] for gene in genes]
+            hgvspList = [x for x in genes[gene]["hgvsp"] for gene in genes]
+            pLoFList = [x for x in genes[gene]["pLoF"] for gene in genes]
+            pLoFFlags = [x for x in genes[gene]["Flag"] for gene in genes]
             outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
                 thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt,
-                    rsId, gene, annot, *consList, displayName]))
-                outfh.write("\t" + "\t".join(pLoFCuration, [str(unshiftedStart)]))
+                rsId, ", ".join(genes.keys()), annot, ", ".join(consList)]))
+            outfh.write("\t" + str(unshiftedStart))
+            outfh.write("\t" + displayName)
+            outfh.write("\t" + ", ".join(pLoFList))
+            outfh.write("\t" + ", ".join(pLoFFlags))
+            outfh.write("\t" + "\t".join(pLoFCuration))
             if extraFh:
-                    extraFh.write("\t".join(name + hgvscList + hgvspList + maybeExtraFields))
+                extraFh.write(", ".join(hgvscList) + "\t" +  ", ".join(hgvspList) + "\t" + "\t".join(maybeExtraFields))
                 extraFh.write("\n")
             else:
-                    outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
+                outfh.write("\t" + ", ".join(hgvscList))
+                outfh.write("\t" + ", ".join(hgvspList))
+                outfh.write("\t" + "\t".join(maybeExtraFields))
                 outfh.write("\n")
 
 def parseLofFile(fpath):
     """Make a struct of the different loss of function flags for a curated variant."""
     gotHeader = False
     lofHeader = []
     ret = {}
     with open(fpath) as fh:
         for line in fh:
             if not gotHeader:
                 lofHeader = line.strip().split("\t")
                 gotHeader = True
             else:
                 lofDetails = line.strip().split("\t")
                 ret[lofDetails[0]] = {lofHeader[x]: lofDetails[x] for x in range(len(lofHeader))}