0e016ca51069bf70ea5c9e08d7ef2fa4dd81a838 max Wed Feb 10 06:11:16 2021 -0800 adding makedoc for cadd track, refs #18492 diff --git src/hg/makeDb/cadd/caddToWig.py src/hg/makeDb/cadd/caddToWig.py new file mode 100644 index 0000000..0eb21ab --- /dev/null +++ src/hg/makeDb/cadd/caddToWig.py @@ -0,0 +1,63 @@ +import subprocess + +def inputLineChunk(): + " yield all values at consecutive positions as a tuple (chrom, pos, list of (nucl, phred value)) " + ## CADD GRCh37-v1.6 (c) University of Washington, Hudson-Alpha Institute for Biotechnology and Berlin Institute of Health 2013-2019. All rights reserved. + #Chrom Pos Ref Alt RawScore PHRED + #1 10001 T A 0.088260 4.066 + values = [] + firstChrom = None + firstPos = None + lastChrom = None + lastPos = None + + #for line in gzip.open("whole_genome_SNVs.tsv.gz"): + for line in subprocess.Popen(['zcat', "whole_genome_SNVs.tsv.gz"], stdout=subprocess.PIPE, encoding="ascii").stdout: + if line.startswith("#"): + continue + row = line.rstrip("\n").split("\t") + chrom, pos, ref, alt, raw, phred = row + chrom = "chr"+chrom + pos = int(row[1]) + phred = float(phred) + + if firstPos is None: + firstChrom = chrom + firstPos = pos + + if chrom!=lastChrom or pos-lastPos > 1 and firstChrom is not None: + yield firstChrom, firstPos, values + firstPos, firstChrom = None, None + values = [] + + if pos != lastPos: + values.append([0.0,0.0,0.0,0.0]) + nuclIdx = "ACGT".find(alt) + values[-1][nuclIdx] = phred + + lastPos = pos + lastChrom = chrom + # last line of file + yield firstChrom, firstPos, values + +outFhs = { + "A" : open("a.wig", "w"), + "C" : open("c.wig", "w"), + "T" : open("t.wig", "w"), + "G" : open("g.wig", "w") + } + +for chrom, pos, nuclValues in inputLineChunk(): + if len(nuclValues)==0: + continue + + for ofh in outFhs.values(): + ofh.write("fixedStep chrom=%s span=1 step=1 start=%d\n" % (chrom, pos)) + + for nuclVals in nuclValues: + for nucIdx in (0,1,2,3): + alt = "ACGT"[nucIdx] + ofh = outFhs[alt] + phred = nuclVals[nucIdx] + ofh.write(str(phred)) + ofh.write("\n")