44a3fed1a487ef0bbfb4d7443903661d81822350 max Mon May 30 06:36:56 2022 -0700 adding makedoc for REVEL tracks, refs #29475 diff --git src/hg/makeDb/revel/revelToWig-hg19.py src/hg/makeDb/revel/revelToWig-hg19.py new file mode 100644 index 0000000..9b1fa96 --- /dev/null +++ src/hg/makeDb/revel/revelToWig-hg19.py @@ -0,0 +1,122 @@ +import subprocess, sys +from collections import defaultdict + +# based on caddToWig.py in kent/src/hg/makeDb/cadd +# two arguments: input filename and db (one of: hg19, hg38) + +def inputLineChunk(fname, db, bedFh): + " yield all values at consecutive positions as a tuple (chrom, pos, list of (nucl, phred value)) " + # chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL + values = [] + firstChrom = None + firstPos = None + lastChrom = None + lastPos = None + donePos = {} + + assert (db in ["hg19", "hg38"]) + doHg19 = (db=="hg19") + + #fname = "revel_grch38_all_chromosomes.csv.gz" + if fname.endswith(".gz"): + ifh = subprocess.Popen(['zcat', fname], stdout=subprocess.PIPE, encoding="ascii").stdout + else: + ifh = open(fname) + + for line in ifh: + if line.startswith("chr"): + continue + row = line.rstrip("\n").split(",") + + chrom, hg19Pos, hg38Pos, ref, alt, aaRef, aaAlt, revel, transId = row + if doHg19: + pos = hg19Pos + else: + pos = hg38Pos + + if not doHg19 and pos==".": + continue + + pos = int(pos) # wiggle ascii is 1-based AAARRGH!! + + # the file has duplicate values in the hg38 column, but for different hg19 positions! + if not doHg19: + if pos in donePos and hg19Pos!=donePos[pos]: + continue + hg19Pos = int(hg19Pos) + donePos[pos] = hg19Pos + + chrom = "chr"+chrom + revel = float(revel) + + + if firstPos is None: + firstChrom = chrom + firstPos = pos + + if chrom != lastChrom or pos-lastPos > 1: + yield firstChrom, firstPos, values + firstPos, firstChrom = pos, chrom + values = [] + transIds = {} + + if chrom != lastChrom: + lastPos = None + donePos = {} + + if lastPos is None or pos-lastPos > 0: + values.append([0.0,0.0,0.0,0.0]) + transIds = {} + + nuclIdx = "ACGT".find(alt) + + oldVal = values[-1][nuclIdx] + if oldVal != 0.0 and oldVal != revel: + # OK, we know that we have two values at this position for this alt allele now + # so we write the original transcriptId and the current one to the BED file + start = int(hg19Pos)-1 + bed = (chrom, start, start+1, alt, "0", ".", start, start+1, transIds[(alt, oldVal)], oldVal) + bed = [str(x) for x in bed] + bedFh.write("\t".join(bed)) + bedFh.write("\n") + bed = (chrom, start, start+1, alt, "0", ".", start, start+1, transId, revel) + bed = [str(x) for x in bed] + bedFh.write("\t".join(bed)) + bedFh.write("\n") + # and only keep the maximum in the bigWig + revel = max(oldVal, revel) + + values[-1][nuclIdx] = revel + transIds[(alt, revel)] = transId + + lastPos = pos + lastChrom = chrom + # last line of file + yield firstChrom, firstPos, values + +fname = sys.argv[1] +db = sys.argv[2] + +outFhs = { + "A" : open("a.wig", "w"), + "C" : open("c.wig", "w"), + "T" : open("t.wig", "w"), + "G" : open("g.wig", "w") + } + +bedFh = open("overlap.bed", "w") + +for chrom, pos, nuclValues in inputLineChunk(fname, db, bedFh): + if len(nuclValues)==0: + continue + + for ofh in outFhs.values(): + ofh.write("fixedStep chrom=%s span=1 step=1 start=%d\n" % (chrom, pos)) + + for nuclVals in nuclValues: + for nucIdx in (0,1,2,3): + alt = "ACGT"[nucIdx] + ofh = outFhs[alt] + revel = nuclVals[nucIdx] + ofh.write(str(revel)) + ofh.write("\n")