fb75f30bb5d7fc58874d8765f209d13934c96276 max Fri May 6 02:59:50 2022 -0700 fixing up HMC track, refs #29153, spent waaaay to much time with this track diff --git src/hg/makeDb/scripts/hmc/hmcToWig.py src/hg/makeDb/scripts/hmc/hmcToWig.py index b6b3c8c..ddf74a4 100644 --- src/hg/makeDb/scripts/hmc/hmcToWig.py +++ src/hg/makeDb/scripts/hmc/hmcToWig.py @@ -1,87 +1,64 @@ import subprocess import sys def readPerChrom(db): " yield all values at consecutive positions as a tuple (chrom, pos, list of (nucl, phred value)) " # chr hg19_pos hg38_pos ref alt pfam_id NP_id gene_symbol HGVSc HGVSp hmc_score values = [] firstChrom = None firstPos = None lastChrom = None lastPos = None - #for line in gzip.open("whole_genome_SNVs.tsv.gz"): currChrom = None - chromData = {"A":{}, "C":{}, "T":{}, "G":{}} + chromData = {} isHg19 = (db=="hg19") for line in subprocess.Popen(['zcat', "hmc_wst_uniquevar_outputforshare_v2.txt.gz"], stdout=subprocess.PIPE, encoding="ascii").stdout: - #print(line) if line.startswith("chr\t"): continue row = line.rstrip("\n").split("\t") # chr hg19_pos hg38_pos ref alt pfam_id NP_id gene_symbol HGVSc HGVSp hmc_score #chrom, hg19Pos, hg38Pos, ref, alt, pfamId, npId, sym, hgvsc, hgvsp, score = row chrom = row[0] alt = row[4] if currChrom is None: currChrom = chrom if chrom != currChrom: - #print("change of chrom: old %s, new %s" % (currChrom, chrom)) yield currChrom, chromData - chromData = {"A":{}, "C":{}, "T":{}, "G":{}} + chromData = {} currChrom = chrom - # XX - #chrom = None - #break - if isHg19: pos = int(row[1]) else: pos = row[2] if pos =='': continue pos = int(pos) score = float(row[-1]) - maxScore = score + + # this is only necessary for hg38 - lifting leads to duplicates if pos in chromData: maxScore = max(chromData[pos], score) + if maxScore != score: print("duplicate value at position, using max: ", pos, score, maxScore) - #print("adding %f at %d in %s" % (maxScore, pos, chrom)) - chromData[alt][pos] = maxScore + chromData[pos] = maxScore if chrom is not None: yield chrom, chromData -#def iterStretches(nuclData): def main(): - #j." yield chrom, pos, nuclValues from nuclData which is " - #j.pos = - outFhs = { - "A" : open("a.wig", "w"), - "C" : open("c.wig", "w"), - "T" : open("t.wig", "w"), - "G" : open("g.wig", "w") - } + ofh = open("hmc.wig", "w") db = sys.argv[1] for chrom, chromData in readPerChrom(db): - #print("read %s" % chrom) - for nucl, nuclData in chromData.items(): - #print("nucleotide: %s" % (nucl)) - #print("to array and sort %s" % chrom) - nuclData = list(nuclData.items()) - nuclData.sort() - - ofh = outFhs[nucl] - #print("write %s" % chrom) - for pos, val in nuclData: + for pos, val in chromData.items(): ofh.write("%s\t%d\t%d\t%f\n" % (chrom, pos-1, pos, val)) - print("wrote wigs") + print("wrote hmc.wig") main()