ee98569fb749e16cf3e4601a7cef331432d062f8 jeltje.van.baren Tue Jan 21 10:25:19 2025 -0800 adding alphaMissense diff --git src/hg/makeDb/outside/alphaMissense/alphaMissenseToWig.py src/hg/makeDb/outside/alphaMissense/alphaMissenseToWig.py new file mode 100755 index 00000000000..0fab6e39ced --- /dev/null +++ src/hg/makeDb/outside/alphaMissense/alphaMissenseToWig.py @@ -0,0 +1,71 @@ +#!/usr/bin/env python3 +import sys +import gzip + +def inputLineChunk(infile): + " yield all values at consecutive positions as a tuple (chrom, pos, list of (nucl, am_pathogenicity)) " + #CHROM POS REF ALT genome uniprot_id transcript_id protein_variant am_pathogenicity am_class + #chr1 69094 G T hg38 Q8NH21 ENST00000335137.4 V2L 0.2937 likely_benign + values = [] # holds per nucleotide pathogenicity values + firstChrom = None + firstPos = None + lastChrom = None + lastPos = None + + with gzip.open(infile, 'rt') as f: + #with open(infile, 'r') as f: + for line in f: + if line.startswith("#"): + continue + row = line.rstrip("\n").split("\t") + chrom, pos, ref, alt, genome, uniprot_id, transcript_id, protein_variant, am_pathogenicity, am_class= row + pos = int(row[1]) + phred = float(am_pathogenicity) + + if firstPos is None: + firstChrom = chrom + firstPos = pos + + # if the block position skips one or more bases we must print a wig block and start over + if chrom!=lastChrom or pos-lastPos > 1 and firstChrom is not None: + yield firstChrom, firstPos, values + firstPos, firstChrom = pos, chrom + #firstPos, firstChrom = None, None # I think this is a bug in the cadd code + values = [] + + # start with an empty array, add numbers if we find values for each allele in subsequent lines + if pos != lastPos: + values.append([0.0,0.0,0.0,0.0]) + nuclIdx = "ACGT".find(alt) + values[-1][nuclIdx] = phred + + lastPos = pos + lastChrom = chrom + # last line of file + yield firstChrom, firstPos, values + +# open four wig files, one per nt +outFhs = { + "A" : open("a.wig", "w"), + "C" : open("c.wig", "w"), + "T" : open("t.wig", "w"), + "G" : open("g.wig", "w") + } + +if len(sys.argv) == 1: + sys.exit('Please give input file') + +for chrom, pos, nuclValues in inputLineChunk(sys.argv[1]): + if len(nuclValues)==0: + continue + # append new entry to all four wig files + for ofh in outFhs.values(): + ofh.write("fixedStep chrom=%s span=1 step=1 start=%d\n" % (chrom, pos)) + + for nuclVals in nuclValues: + for nucIdx in (0,1,2,3): + alt = "ACGT"[nucIdx] + ofh = outFhs[alt] + phred = nuclVals[nucIdx] + ofh.write(str(phred)) + ofh.write("\n")