bd24865e9b1f065a61e8069a5eb056cf91dd7fc8 max Thu May 5 09:34:46 2022 -0700 adding hmc make script, refs #29153 diff --git src/hg/makeDb/scripts/hmc/hmcToWig.py src/hg/makeDb/scripts/hmc/hmcToWig.py new file mode 100644 index 0000000..b6b3c8c --- /dev/null +++ src/hg/makeDb/scripts/hmc/hmcToWig.py @@ -0,0 +1,87 @@ +import subprocess +import sys + +def readPerChrom(db): + " yield all values at consecutive positions as a tuple (chrom, pos, list of (nucl, phred value)) " + # chr hg19_pos hg38_pos ref alt pfam_id NP_id gene_symbol HGVSc HGVSp hmc_score + values = [] + firstChrom = None + firstPos = None + lastChrom = None + lastPos = None + + #for line in gzip.open("whole_genome_SNVs.tsv.gz"): + currChrom = None + chromData = {"A":{}, "C":{}, "T":{}, "G":{}} + isHg19 = (db=="hg19") + for line in subprocess.Popen(['zcat', "hmc_wst_uniquevar_outputforshare_v2.txt.gz"], stdout=subprocess.PIPE, encoding="ascii").stdout: + #print(line) + if line.startswith("chr\t"): + continue + row = line.rstrip("\n").split("\t") + # chr hg19_pos hg38_pos ref alt pfam_id NP_id gene_symbol HGVSc HGVSp hmc_score + #chrom, hg19Pos, hg38Pos, ref, alt, pfamId, npId, sym, hgvsc, hgvsp, score = row + chrom = row[0] + alt = row[4] + if currChrom is None: + currChrom = chrom + + if chrom != currChrom: + #print("change of chrom: old %s, new %s" % (currChrom, chrom)) + yield currChrom, chromData + chromData = {"A":{}, "C":{}, "T":{}, "G":{}} + currChrom = chrom + + # XX + #chrom = None + #break + + if isHg19: + pos = int(row[1]) + else: + pos = row[2] + if pos =='': + continue + pos = int(pos) + + score = float(row[-1]) + + maxScore = score + if pos in chromData: + maxScore = max(chromData[pos], score) + print("duplicate value at position, using max: ", pos, score, maxScore) + + #print("adding %f at %d in %s" % (maxScore, pos, chrom)) + chromData[alt][pos] = maxScore + + if chrom is not None: + yield chrom, chromData + +#def iterStretches(nuclData): +def main(): + #j." yield chrom, pos, nuclValues from nuclData which is " + #j.pos = + outFhs = { + "A" : open("a.wig", "w"), + "C" : open("c.wig", "w"), + "T" : open("t.wig", "w"), + "G" : open("g.wig", "w") + } + + db = sys.argv[1] + + for chrom, chromData in readPerChrom(db): + #print("read %s" % chrom) + for nucl, nuclData in chromData.items(): + #print("nucleotide: %s" % (nucl)) + #print("to array and sort %s" % chrom) + nuclData = list(nuclData.items()) + nuclData.sort() + + ofh = outFhs[nucl] + #print("write %s" % chrom) + for pos, val in nuclData: + ofh.write("%s\t%d\t%d\t%f\n" % (chrom, pos-1, pos, val)) + + print("wrote wigs") +main()