6d039ce16bfa7c8c38372807bf21155c86819dc2 chmalee Tue Jun 29 09:41:09 2021 -0700 Committing script and makedoc for gnomAD v3.1.1 track

diff --git src/hg/makeDb/gnomad/gnomadVcfBedToBigBed src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
index 39e34b7..21b8544 100755
--- src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
+++ src/hg/makeDb/gnomad/gnomadVcfBedToBigBed
@@ -8,118 +8,202 @@ options to this script, and if necessary, add a new one.
 Example format for the v2.1.1 version of gnomAD:
 ##INFO=<ID=vep,Number=.,Type=String,Description="Consequence annotations from Ensembl VEP. \
 Format: \
 Allele|Consequence|IMPACT|SYMBOL|Gene|Feature_type|Feature|BIOTYPE|EXON|INTRON|HGVSc\
 |HGVSp|cDNA _position|CDS_position|Protein_position|Amino_acids|Codons\
 |Existing_variation|ALLELE_NUM|DISTANCE|STRAND|FLAGS|VARIANT_CLASS|MINIMISED\
 |SYMBOL_SOURCE|HGNC_ID|CANONICAL|TSL|APPRIS|CCDS|ENSP|SWISSPROT|TREMBL|UNIPARC\
 |GENE_PHENO|SIFT|PolyPhen|DOMAINS|HGVS_OFFSET|GMAF|AFR_MAF|AMR_MAF|EAS_MAF|EUR_MAF\
 |SAS_MAF|AA_MAF|EA_MAF|ExAC_MAF|ExAC_Adj_MAF|ExAC_AFR_MAF|ExAC_AMR_MAF|ExAC_EAS_MAF\
 |ExAC_FIN_MAF|ExAC_NFE_MAF|ExAC_OTH_MAF|ExAC_SAS_MAF|CLIN_SIG|SOMATIC|PHENO|PUBMED\
 |MOTIF_NAME|MOTIF_POS|HIGH_INF_POS|MOTIF_SCORE_CHANGE|LoF|LoF_filter|LoF_flags|LoF_info">
 """
-import sys, argparse
-from collections import defaultdict,namedtuple
+import sys, argparse, hashlib, json
+from collections import defaultdict,namedtuple,OrderedDict
 
 # which version of gnomAD for parsing VEP string
-versions = ["v2.1.1", "v3.1"]
+versions = ["v2.1.1", "v3.1", "v3.1_chrM", "v3.1.1"]
 
 # the number of fields in the VEP string (depends on version):
 # how to count:
 # bcftools view -h in.vcf.gz | grep "^##INFO=<ID=vep" | grep -o "Format:.*" \
 #     | tr '|' '\t' | tl0 | wc -l
-numVepFields = {"v2.1.1" : 68, "v3.1": 45}
+numVepFields = {"v2.1.1" : 68, "v3.1": 45, "v3.1_chrM": 45, "v3.1.1": 45}
 
 # the different pipe separated fields in the VEP struct
 # how to get:
 # bcftools view -h in.vcf.gz | grep "^##INFO=<ID=vep" | grep -o "Format: .*" \
 #     | cut -d' ' -f2- | cut -d'"' -f1 | sed -e 's/^/"/' -e 's/$/"/' -e 's/|/", "/g'
 versionVepFields = {
+    "v3.1.1": [
+        "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type", "Feature",
+        "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position", "CDS_position",
+        "Protein_position", "Amino_acids", "Codons", "ALLELE_NUM", "DISTANCE", "STRAND",
+        "VARIANT_CLASS", "MINIMISED", "SYMBOL_SOURCE", "HGNC_ID", "CANONICAL", "TSL",
+        "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "GENE_PHENO",
+        "SIFT", "PolyPhen", "DOMAINS", "HGVS_OFFSET", "MOTIF_NAME", "MOTIF_POS",
+        "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "LoF", "LoF_filter", "LoF_flags",
+        "LoF_info"
+        ],
+    "v3.1_chrM": [
+        "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type", "Feature",
+        "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position", "CDS_position",
+        "Protein_position", "Amino_acids", "Codons", "ALLELE_NUM", "DISTANCE", "STRAND",
+        "VARIANT_CLASS", "MINIMISED", "SYMBOL_SOURCE", "HGNC_ID", "CANONICAL", "TSL",
+        "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "GENE_PHENO",
+        "SIFT", "PolyPhen", "DOMAINS", "HGVS_OFFSET", "MOTIF_NAME", "MOTIF_POS",
+        "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "LoF", "LoF_filter", "LoF_flags",
+        "LoF_info"
+        ],
+    "v3.1": [
+        "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type", "Feature",
+        "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position", "CDS_position",
+        "Protein_position", "Amino_acids", "Codons", "ALLELE_NUM", "DISTANCE", "STRAND",
"MINIMISED", "SYMBOL_SOURCE", "HGNC_ID", "CANONICAL", "TSL", + "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "GENE_PHENO", + "SIFT", "PolyPhen", "DOMAINS", "HGVS_OFFSET", "MOTIF_NAME", "MOTIF_POS", + "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "LoF", "LoF_filter", "LoF_flags", + "LoF_info" + ], "v2.1.1": [ "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type", "Feature", "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position", "CDS_position", "Protein_position", "Amino_acids", "Codons", "Existing_variation", "ALLELE_NUM", "DISTANCE", "STRAND", "FLAGS", "VARIANT_CLASS", "MINIMISED", "SYMBOL_SOURCE", "HGNC_ID", "CANONICAL", "TSL", "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "GENE_PHENO", "SIFT", "PolyPhen", "DOMAINS", "HGVS_OFFSET", "GMAF", "AFR_MAF", "AMR_MAF", "EAS_MAF", "EUR_MAF", "SAS_MAF", "AA_MAF", "EA_MAF", "ExAC_MAF", "ExAC_Adj_MAF", "ExAC_AFR_MAF", "ExAC_AMR_MAF", "ExAC_EAS_MAF", "ExAC_FIN_MAF", "ExAC_NFE_MAF", "ExAC_OTH_MAF", "ExAC_SAS_MAF", "CLIN_SIG", "SOMATIC", "PHENO", "PUBMED", "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "LoF", "LoF_filter", "LoF_flags", "LoF_info" - ], - "v3.1": [ - "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type", "Feature", - "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp", "cDNA_position", "CDS_position", - "Protein_position", "Amino_acids", "Codons", "ALLELE_NUM", "DISTANCE", "STRAND", - "VARIANT_CLASS", "MINIMISED", "SYMBOL_SOURCE", "HGNC_ID", "CANONICAL", "TSL", - "APPRIS", "CCDS", "ENSP", "SWISSPROT", "TREMBL", "UNIPARC", "GENE_PHENO", - "SIFT", "PolyPhen", "DOMAINS", "HGVS_OFFSET", "MOTIF_NAME", "MOTIF_POS", - "HIGH_INF_POS", "MOTIF_SCORE_CHANGE", "LoF", "LoF_filter", "LoF_flags", - "LoF_info" ] } +versionPops = { + "v3.1.1": ["afr", "ami", "amr", "asj", "eas", "fin", "mid", "nfe", "sas", "oth", "XX", "XY"], + "v3.1_chrM": ['afr', 'ami', 'amr', 'asj', 'eas', 'fin', 'nfe', 'oth', 'sas', 'mid'], + "v3.1": ["afr", "amr", "asj", "eas", "fin", "mid", "ami", "nfe", "sas", "oth", "XX", "XY"], + "v2.1.1":["afr", "amr", "asj", "eas", "fin", "nfe", "sas", "oth"] +} + +chrM_haplo_groups = [ + 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'HV', 'I', 'J', 'K', 'L0', 'L1', 'L2', + 'L3', 'L4', 'L5', 'M', 'N', 'P', 'R', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z' +] + +# the fields that MUST be in the bigBed (for filters, etc) +versionAutoSql = { + "v3.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", + "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", + "_startPos", "displayName"], + "v3.1_chrM": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", + "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", + "_startPos", "displayName"], + "v3.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", + "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", + "_startPos", "displayName"], + "v2.1.1": ["chrom", "chromStart", "chromEnd", "name", "score", "strand", + "thickStart", "thickEnd", "itemRgb", "ref", "alt", "FILTER", "AC", + "AN", "AF", "faf95", "nhomalt", "rsId", "genes", "annot", "variation_type", + "pLoF", "lofFlags", "lofCuration", "pLoFCurationFlags", "_startPos", "displayName"] +} + +# the fields in the extra tab file, specify order here so we have the same order +# across different 
+# across different runs for different chromosomes
+versionExtraFields = {
+    "v3.1.1": ["_key", "_jsonVep", "_jsonPopTable", "cadd_phred", "revel_score", "splice_ai_max_ds",
+        "splice_ai_consequence", "primate_ai_score"],
+    "v3.1_chrM": ["_key", "_jsonVep", "_jsonPopTable", "_jsonHapTable"],
+    "v3.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "ami", "amr",
+        "asj", "eas", "fin", "mid", "nfe", "sas", "oth", "XX", "XY"] for x in ["AC", "AN",
+        "AF", "nhomalt"]], "cadd_phred", "revel_score", "splice_ai_max_ds",
+        "splice_ai_consequence", "primate_ai_score"],
+    "v2.1.1": ["hgvsc", "hgvsp", "popmax", *[x + "_" + y for y in ["popmax", "afr", "amr",
+        "asj", "eas", "fin", "nfe", "oth", "female", "male"] for x in ["AC", "AN", "AF",
+        "nhomalt"]]]
+}
+
 # determines the color of the variant in the browser
 plofTypes = ["frameshift", "stop gained", "splice donor", "splice acceptor"]
 missenseTypes = ["missense", "inframe deletion", "inframe insertion", "start lost", "stop list"]
 synTypes = ["synonymous"]
 
 # for printing the name of the popmax:
 popAbbr = {"afr": "African/African American", "amr": "Latino/Admixed American",
     "asj": "Ashkenazi Jewish", "eas": "East Asian", "fin": "Finnish", "mid": "Middle Eastern",
     "ami": "Amish", "nfe": "Non-Finnish European", "sas": "South Asian",
     "oth": "Other (population not assigned)"}
 
 lofDesc = {"HC": "High-confidence", "OS": "Other Splice (beta)", "LC": "Low-confidence"}
 
 header = None
 
 def parseCommandLine():
     parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter,
         description="""Transform gnomAD VCF to bigBed. This looks something like:
     vcfToBed -fields="list,of,info,fields" in.vcf.gz firstOut.bed
     gnomadVcfBedToBigBed -v v2.1.1 -lof lofFile.txt firstOut.bed finalOut.bed""",
         )
     parser.add_argument("infile", help="Input bed file with a vep field, use 'stdin' to read \
 from stdin")
     parser.add_argument("-lof", "--lofFilePath", action="store", help="Path to tab separated \
 loss-of-function curation file, where the first line is a header describing each \
 of the fields")
     parser.add_argument("-v", "--version", action="store", help="Version of gnomAD. Each \
 version of gnomAD tends to have slightly different VEP annotations or populations, \
 so you can specify what you are working with here. Current supported versions: %s" % ", ".join(versions))
+    parser.add_argument("-e", "--extra-output-file", action="store", help="Output fields not necessary for display to this extra file instead")
     parser.add_argument("outfile", help="Output bed file name, use 'stdout' to write to stdout")
     args = parser.parse_args()
     if not args.version or args.version not in versions:
         sys.stderr.write("ERROR: missing version or wrong version. Please supply -v VERSION \
 where VERSION is one of: [%s]\n" % ", ".join(versions))
         sys.exit(255)
+    if args.extra_output_file and args.extra_output_file == "stdout" and args.outfile == "stdout":
+        sys.stderr.write("ERROR: Only one of outfile and --extra-output-file can be stdout.\n")
+        sys.exit(255)
     return args
 
-def parseHeader(headerStr, infh, outfh):
+def writeHeaders(version, outfh, extrafh):
+    """Write out the field names"""
+    outfh.write("#" + "\t".join(x for x in versionAutoSql[version]))
+    if extrafh:
+        if version == "v3.1.1" or version == "v3.1_chrM":
+            extrafh.write("#" + "\t".join(versionExtraFields["v3.1.1"] + ["_jsonHapTable"]))
+        else:
+            extrafh.write("#" + "\t".join(versionExtraFields[version]))
+        extrafh.write("\n")
+    else:
+        outfh.write("\t" + "#" + "\t".join(versionExtraFields[version]))
+    outfh.write("\n")
+
+def parseHeader(headerStr, infh, outfh, extraFh):
     """Parse the field names out of a header string"""
     global header
     headerStr = headerStr[1:] # chop off '#'
     fields = headerStr.strip('\n').split('\t')
     if not header:
         header = fields
         outfh.write("#%s\n" % ("\t".join(header[:17] +
-            ["rsID","genes","annot","variation_type","hgvs_c","hgvs_p", "pLoF"]
-            + header[18:] + ["_pos"])))
+            ["rsID","genes","annot","variation_type","_startPos"]
+            + ["hgvsc", "hgvsp"] + header[18:])))
     elif fields != header:
         sys.stderr.write("Header differs from others: '%s'\n" % infh.name)
         sys.exit(1)
 
 def pickColor(consequenceList):
     """Assign a color to this item based on it's predicted consequences"""
     cleanedList = [x.replace("variant","").lower().replace("_", " ").strip() for x in consequenceList]
     if any(item in plofTypes for item in cleanedList):
         return "pLoF", "255,32,0"
     elif any(item in missenseTypes for item in cleanedList):
         return "missense", "247,189,0"
     elif any(item in synTypes for item in cleanedList):
         return "synonymous", "4,255,0"
     else:
         return "other", "95,95,95"
@@ -150,74 +234,148 @@ lofCurationStrs = {
     "likely_lof": "Curated as Likely LoF",
     "likely_not_lof": "Curated as Likely Not LoF",
     "lof": "Curated as LoF",
     "not_lof": "Curated as Not LoF",
     "uncertain": "Curated as Uncertain"
 }
 
 def getLofCuration(lofDict, version, ucscChrom, unshiftedStart, ref, alt):
     """Return the lof curation info for a variant, or an empty list"""
     key = getLofCurationKey(version, ucscChrom, unshiftedStart, ref, alt)
     curation = ""
     flags = ""
     if key in lofDict:
         curation = lofDict[key]["Verdict"]
         if curation != "lof":
             flags = flagsForVar(lofDict, key)
         curation = lofCurationStrs[curation]
     return [curation, flags]
 
+def buildMitoStructs(versionPopNames, extraFieldList):
+    """Turn the chrM specific fields into the data structures we want to show"""
+    hapDict = OrderedDict()
+    popDict = OrderedDict()
+    i = 0
+    hapDict["Haplogroup"] = ["Allele Number", "Homoplasmic AC", "Homoplasmic AF",
+        "Heteroplasmic AC", "Heteroplasmic AF"]
+    hap_an = extraFieldList[0].split("|")
+    hap_ac_het = extraFieldList[1].split("|")
+    hap_ac_hom = extraFieldList[2].split("|")
+    hap_af_hom = extraFieldList[3].split("|")
+    hap_af_het = extraFieldList[4].split("|")
+    for ix, hapName in enumerate(chrM_haplo_groups):
+        hapDict[hapName] = [hap_an[ix], hap_ac_hom[ix], hap_af_hom[ix], hap_ac_het[ix], hap_af_het[ix]]
+    hap_an = sum([int(x) for x in hap_an])
+    total_hap_ac_hom = sum([int(x) for x in hap_ac_hom])
+    total_hap_ac_het = sum([int(x) for x in hap_ac_het])
+    hapDict["Total"] = [hap_an, total_hap_ac_hom, total_hap_ac_hom/hap_an, total_hap_ac_het, total_hap_ac_het/hap_an]
+    pop_an = extraFieldList[9].split("|")
+    pop_ac_het = extraFieldList[10].split("|")
+    pop_ac_hom = extraFieldList[11].split("|")
+    pop_af_hom = extraFieldList[12].split("|")
+    pop_af_het = extraFieldList[13].split("|")
+    popDict["Populations"] = ["Allele Number", "Homoplasmic AC", "Homoplasmic AF",
+        "Heteroplasmic AC", "Heteroplasmic AF"]
+    for ix, popName in enumerate(versionPopNames):
+        popDict[popName] = [pop_an[ix], pop_ac_hom[ix], pop_af_hom[ix], pop_ac_het[ix], pop_af_het[ix]]
+    hom_af, het_af = extraFieldList[16].split("/")
+    hom_ac, het_ac = extraFieldList[14].split("/")
+    popDict["Total"] = [extraFieldList[15], hom_ac, hom_af, het_ac, het_af]
+    other = hapDict
+    return popDict, other
+
+def convertExtraFields(extraFieldList, version):
+    """Split the population data from the other extra fields and return the population
+    data as a json blob and the other fields as a list"""
+    versionPopNames = []
+    for x in versionPops[version]:
+        if x in popAbbr:
+            versionPopNames.append(popAbbr[x])
+        else:
+            versionPopNames.append(x)
+    if version == "v3.1.1":
+        versionPopNames.append("Total")
+        popData = extraFieldList[4:-9] + extraFieldList[-4:]
+        popDict = OrderedDict() # for keeping the header line first and total line last
+                                # in the final bigBed
+        popDict["Populations"] = ["Allele Count", "Allele Number", "Allele Frequency",
+            "Number of Homozygotes"]
+        i = 0
+        for popName in versionPopNames:
+            val = popData[i:i+4]
+            key = popAbbr[popName] if popName in popAbbr else popName
+            popDict[key] = val
+            i += 4
+        # Add an extra field for the chrM only haplogroup table
+        other = extraFieldList[-9:-4] + [""]
+    else:
+        popDict, other = buildMitoStructs(versionPopNames, extraFieldList)
+    return popDict,other
 
 def splitVepField(vep, version):
-    """Commas both delimit the multiple annotations of a single variant,
+    """Split the VEP string into a list of strings for each annotation.
+    Because commas both delimit the multiple annotations of a single variant,
     and can be part of the pipe separated strings that make up a
-    single annotation, so can't just split on ',' like normal"""
+    single annotation, we can't just split on ',' like normal"""
     ret = []
-    # at the 46th '|', go back until we find a comma and replace that comma
-    # with a tab and then split on tab:
+    # at the versionSpecific index '|', go back until we find a comma or | and split there
     ix = 1
     copy = vep.split('|')
     try:
         numFields = numVepFields[version]
     except ValueError:
         sys.stderr.write("ERROR: version '%s' not a supported version. Currently \
 supported versions: [%s]\n" % ", ".join(versions))
         sys.exit(255)
     for i in range(round(len(copy) / numFields)):
-        start = (i*numFields) - (i*1)
+        start = 0
+        # I think there is a bug in the version of VEP used to annotate chrM where
+        # the comma separating annotations is stripped, leaving the indexing off by one
+        if version == "v3.1_chrM":
+            start = i*numFields
+        else:
+            start = (i*numFields) - (i*1)
         end = numFields + (numFields*i) - i
+        #print("start = %d, end = %d" % (start,end))
         thisElem = copy[start:end]
         if thisElem:
-            # fix up the first element which will contain part of the previous annotation
+            # fix up the first element which may contain part of the previous annotation
+            #print("1. thisElem[0]: %s, thisElem[-1]: %s" % (thisElem[0], thisElem[-1]))
             if i != 0 and ',' in thisElem[0]:
                 thisElem[0] = thisElem[0].split(',')[-1]
-            # the last element will always contain part of the next annotation
-            thisElem[-1] = ",".join(thisElem[-1].split(',')[:-1])
+            #print("2. thisElem[0]: %s, thisElem[-1]: %s" % (thisElem[0], thisElem[-1]))
+            # NON-chrM: the last element will always contain part of the next annotation
+            # if the last elem was empty then it is just the start of the next annotation
+            if thisElem[-1]:
+                thisElem[-1] = ",".join(thisElem[-1].split(',')[:-1])
+            #print("3. thisElem[0]: %s, thisElem[-1]: %s" % (thisElem[0], thisElem[-1]))
+            #assert(len(thisElem) == numFields)
             ret.append("|".join(thisElem))
     return ret
 
 def parseVep(vep, version):
     """Return a more compacted version of the VEP annotations for a variant,
         organized by the affected genes"""
     genes = defaultdict(dict)
     annotList = splitVepField(vep, version)
     for annot in annotList:
         group = annot.split('|')
         if "&" in group[1]:
             consList = group[1].split('&')
         else:
             consList = [group[1]]
         if "regulatory_region_variant" in consList \
             or "downstream_gene_variant" in consList \
             or "upstream_gene_variant" in consList \
-            or "TF_binding_site_variant" in consList:
+            or "TF_binding_site_variant" in consList \
+            or "intergenic_variant" in consList:
             continue
         vepFields = dict(zip(versionVepFields[version],group))
         gene = vepFields["SYMBOL"]
         hgvsc = [vepFields["HGVSc"]] if vepFields["HGVSc"] != "" else None
         hgvsp = [vepFields["HGVSp"]] if vepFields["HGVSp"] != "" else None
         lof = [vepFields["LoF"]] if vepFields["LoF"] != "" else None
         lofFlags = [vepFields["LoF_flags"]] if vepFields["LoF_flags"] != "" else None
         lofFilter = [vepFields["LoF_filter"]] if vepFields["LoF_filter"] != "" else None
         if lof:
             try:
                 for i,x in enumerate(lof):
                     if x == "LC":
                         lof[i] = "Low Confidence (%s)" % lofFilter[i]
                     else:
                         lof[i] = lofDesc[x]
@@ -231,99 +389,154 @@ genes[gene]["cons"].update(list(consList))
             if hgvsc:
                 genes[gene]["hgvsc"].update(hgvsc)
             if hgvsp:
                 genes[gene]["hgvsp"].update(hgvsp)
             if lof:
                 genes[gene]["pLoF"] = set(lof)
             if lofFlags:
                 genes[gene]["Flag"] = set(lofFlags)
         else:
             genes[gene]["cons"] = set(consList)
             genes[gene]["hgvsc"] = set(hgvsc) if hgvsc else set()
             genes[gene]["hgvsp"] = set(hgvsp) if hgvsp else set()
             genes[gene]["pLoF"] = set(lof) if lof else set()
             genes[gene]["Flag"] = set(lof) if lofFlags else set()
+    for gene in genes:
+        genes[gene]["cons"] = list(genes[gene]["cons"])
+        genes[gene]["hgvsc"] = list(genes[gene]["hgvsc"])
+        genes[gene]["hgvsp"] = list(genes[gene]["hgvsp"])
+        genes[gene]["pLoF"] = list(genes[gene]["pLoF"])
+        genes[gene]["Flag"] = list(genes[gene]["Flag"])
     return genes
 
-def gnomadVcfBedToBigBed(infh, outfh, version, lofDict):
+def gnomadVcfBedToBigBed(infh, outfh, extraFh, version, lofDict):
     """Read from already opened infh, convert to more compact bed format, then write
-    to already opened outfh."""
+    to already opened outfh, optionally use extraFh for non-necessary bed fields."""
+    writeHeaders(version, outfh, extraFh)
     for line in infh:
         genes = defaultdict(dict)
         extraInfo = []
         if line.startswith("#"):
-            parseHeader(line, infh, outfh)
             continue
+            #parseHeader(line, infh, outfh, extraFh)
+            #continue
         fixedLine = line.strip('\n').split('\t')
-        vep = fixedLine[17]
-        bed8Fields = fixedLine[:8]
+        chrom, chromStart, chromEnd, bedName, score, strand, thickStart, thickEnd = fixedLine[:8]
         color = fixedLine[8] # defaults to black
-        firstExtra = fixedLine[9:17]
-        filterTag = firstExtra[2]
-        firstExtra[2] = filterTag.replace(";",",") # replace so trackDb filterValues works
-        savedPostFields = [x if x else "N/A" for x in fixedLine[18:]]
-        savedPostFields[0] = popAbbr[savedPostFields[0]] if savedPostFields[0] in popAbbr else 'N/A'
-        for ix in range(len(savedPostFields)):
-            if not savedPostFields[ix]:
-                savedPostFields[ix] = "N/A"
+        ref, alt, filterTag = fixedLine[9:12]
+        filterTag = filterTag.replace(";",",") # replace so trackDb filterValues works
+        if "chrM" not in version:
+            vep = fixedLine[17]
+            ac, an, af, faf95, nhomalt = fixedLine[12:17]
+            maybeExtraFields = [x if x else "N/A" for x in fixedLine[18:]]
+        else:
+            vep = fixedLine[12]
+            # ac and af are really AC_hom/AC_het and AF_hom/AF_het
+            ac = fixedLine[14] + "/" + fixedLine[15]
+            af = fixedLine[18] + "/" + fixedLine[19]
+            an = fixedLine[13]
+            faf95 = ""
+            nhomalt = ""
+            maybeExtraFields = fixedLine[16:18] + fixedLine[20:]
         if vep != "":
             genes = parseVep(vep, version)
+
+        # Now that we've parsed all the incoming fields from vcfToBed, write
+        # output to final bed, and maybe to extra tab file:
+        if genes:
+            consList = []
         for gene in genes:
-            annot,color = pickColor(genes[gene]["cons"])
-            consList = [",".join(list(genes[gene]["cons"]))]
+            consList += list(genes[gene]["cons"])
+        annot,color = pickColor(consList)
+        rsId = ""
+        if bedName.startswith("rs"):
+            rsId = bedName
+        unshiftedStart = unshiftLeftPad(int(chromStart), ref, alt)
+        displayName = chrom + "-" + str(unshiftedStart) + "-"
+        displayName += ref[:10]+"..." if len(ref) > 13 else ref
+        displayName += "-"
+        displayName += alt[:10]+"..." if len(alt) > 13 else alt
+        name = hashlib.md5(str.encode(chrom+chromStart+chromEnd+ref+alt)).hexdigest()
+        if version == "v3.1.1" or version == "v3.1_chrM":
+            outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
+                thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt,
+                rsId, ", ".join(genes.keys()), annot, ",".join(consList), str(unshiftedStart), displayName]))
+            popTable, otherFields = convertExtraFields(maybeExtraFields + [ac, an, af, nhomalt], version)
+            if version == "v3.1_chrM":
+                # otherFields is the haplotype table plus empties for the v3.1.1 fields:
+                otherFields = ["" for i in range(5)] + [json.dumps(otherFields)]
+            if extraFh:
+                extraFh.write("\t".join([name, json.dumps(genes), json.dumps(popTable)] + otherFields))
+                extraFh.write("\n")
+            else:
+                outfh.write("\t" + "\t".join([name] + [json.dumps(genes), json.dumps(popTable)] + otherFields))
+            outfh.write("\n")
+        elif version == "v3.1":
             hgvscList = [", ".join(list(genes[gene]["hgvsc"]))]
             hgvspList = [", ".join(list(genes[gene]["hgvsp"]))]
             pLoFList = [", ".join(list(genes[gene]["pLoF"]))]
             pLoFFlags = [", ".join(list(genes[gene]["Flag"]))]
-            rsId = ""
-            name = bed8Fields[3]
-            if bed8Fields[3].startswith("rs"):
-                rsId = bed8Fields[3]
-            name = bed8Fields[0]+":"+bed8Fields[1]+"-"+bed8Fields[2] + " ("
-            ref = firstExtra[0]
-            alt = firstExtra[1]
-            name += ref[:10]+"..." if len(ref) > 13 else ref
-            name += "/"
-            name += alt[:10]+"..." if len(alt) > 13 else alt
-            name += ")"
-            unshiftedStart = unshiftLeftPad(int(bed8Fields[1]), ref, alt)
-            pLoFCuration = getLofCuration(lofDict, version, bed8Fields[0], str(unshiftedStart), ref, alt)
-            savedPreFields = bed8Fields[:3] + [name] + bed8Fields[4:] + [color] + firstExtra
-            outfh.write("\t".join(savedPreFields + [rsId] + [gene] + [annot] + \
-                consList + hgvscList + hgvspList + pLoFList + pLoFFlags + pLoFCuration + savedPostFields + [str(unshiftedStart)]) + "\n")
+            outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
+                thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt,
+                rsId, gene, annot, *consList, str(unshiftedStart), displayName]))
+            if extraFh:
+                extraFh.write("\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
+                extraFh.write("\n")
+            else:
+                outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
+            outfh.write("\n")
+        else:
+            pLoFCuration = getLofCuration(lofDict, version, chrom,
+                str(unshiftedStart), ref, alt)
+            outfh.write("\t".join([chrom, chromStart, chromEnd, name, score, strand,
+                thickStart, thickEnd, color, ref, alt, filterTag, ac, an, af, faf95, nhomalt,
+                rsId, gene, annot, *consList, displayName]))
+            outfh.write("\t" + "\t".join(pLoFCuration + [str(unshiftedStart)]))
+            if extraFh:
+                extraFh.write("\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
+                extraFh.write("\n")
+            else:
+                outfh.write("\t" + "\t".join([name] + hgvscList + hgvspList + maybeExtraFields))
+            outfh.write("\n")
 
 def parseLofFile(fpath):
     """Make a struct of the different loss of function flags for a curated variant."""
     gotHeader = False
     lofHeader = []
     ret = {}
     with open(fpath) as fh:
         for line in fh:
             if not gotHeader:
                 lofHeader = line.strip().split("\t")
                 gotHeader = True
             else:
                 lofDetails = line.strip().split("\t")
                 ret[lofDetails[0]] = {lofHeader[x]: lofDetails[x] for x in range(len(lofHeader))}
     return ret
 
 def main():
     args = parseCommandLine()
     lofDict = {}
     lofFile = args.lofFilePath
+    extraFh = None
     if lofFile:
         lofDict = parseLofFile(lofFile)
     if args.infile == "stdin":
         infh = sys.stdin
     else:
         infh = open(args.infile)
     if args.outfile == "stdout":
         outfh = sys.stdout
     else:
         outfh = open(args.outfile, "w")
-    gnomadVcfBedToBigBed(infh, outfh, args.version, lofDict)
+    if args.extra_output_file:
+        if args.extra_output_file == "stdout":
+            extraFh = sys.stdout
+        else:
+            extraFh = open(args.extra_output_file, "w")
+    gnomadVcfBedToBigBed(infh, outfh, extraFh, args.version, lofDict)
     infh.close()
     outfh.close()
 
 if __name__ == "__main__":
     main()
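
A quick illustration of the index arithmetic splitVepField relies on (a sketch, not part of the commit, using a hypothetical three-field VEP format so the element shared by consecutive annotations is easy to see):

# Toy walk-through of splitVepField's slicing for the non-chrM case, assuming a
# hypothetical 3-field format "Allele|Consequence|SYMBOL".  After split('|') the
# element where two annotations meet holds "lastFieldOfPrev,firstFieldOfNext",
# so each slice re-uses one element of the previous slice and trims the overlap.
vep = "A|missense_variant|GENE1,A|upstream_gene_variant|"
numFields = 3
copy = vep.split('|')   # ['A', 'missense_variant', 'GENE1,A', 'upstream_gene_variant', '']
annots = []
for i in range(round(len(copy) / numFields)):
    start = (i * numFields) - i                 # overlap the previous slice by one element
    end = numFields + (numFields * i) - i
    elem = copy[start:end]
    if i != 0 and ',' in elem[0]:
        elem[0] = elem[0].split(',')[-1]        # keep only this annotation's first field
    if elem[-1]:
        elem[-1] = ",".join(elem[-1].split(',')[:-1])   # drop the next annotation's first field
    annots.append("|".join(elem))
print(annots)   # ['A|missense_variant|GENE1', 'A|upstream_gene_variant|']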
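
For reference, a minimal sketch (hypothetical coordinates, not from the commit) of how the new code derives the stable item name and the human-readable displayName; the md5 digest is also the _key column that joins a bigBed row to its line in the extra tab file:

import hashlib

# Hypothetical variant; chromStart/chromEnd are the 0-based BED coordinates from vcfToBed.
chrom, chromStart, chromEnd = "chr1", "55051214", "55051215"
ref, alt = "G", "GA"

# Stable item name: md5 over position plus alleles, as in gnomadVcfBedToBigBed.
name = hashlib.md5(str.encode(chrom + chromStart + chromEnd + ref + alt)).hexdigest()

# Display name: chrom-pos-ref-alt, truncating alleles longer than 13 bases to 10 plus "...".
# unshiftLeftPad() (defined elsewhere in the script) adjusts the start for left-padded
# indels; plain chromStart stands in for it here.
displayName = chrom + "-" + chromStart + "-"
displayName += ref[:10] + "..." if len(ref) > 13 else ref
displayName += "-"
displayName += alt[:10] + "..." if len(alt) > 13 else alt

print(name, displayName)   # <32-character hex digest> chr1-55051214-G-GA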
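
And a sketch of the shape of the _jsonPopTable column that convertExtraFields produces for a v3.1.1 variant: an ordered mapping whose first entry is the header row and whose remaining entries are [AC, AN, AF, nhomalt] per population (the numbers below are invented):

import json
from collections import OrderedDict

popTable = OrderedDict()
popTable["Populations"] = ["Allele Count", "Allele Number", "Allele Frequency",
                           "Number of Homozygotes"]
# One row per population in versionPops["v3.1.1"], keyed by the long popAbbr name
# where one exists; the values here are made up for illustration.
popTable["African/African American"] = ["7", "41440", "0.000168918", "0"]
popTable["Amish"] = ["0", "912", "0", "0"]
popTable["Total"] = ["52", "152312", "0.000341404", "1"]

print(json.dumps(popTable))   # this JSON blob becomes the _jsonPopTable field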