0f3ca3eaf5792df01b7c600a5428d2d0b2809fcd max Fri Sep 20 13:18:01 2024 -0700 Revert "more features to hubtools: search in both parent and subdirs, better docs" This reverts commit 05e67c59a20a5d00b810a981aef3b00c5bef82e1. diff --git src/hg/makeDb/revel/revelToWig-hg38.py src/hg/makeDb/revel/revelToWig-hg38.py index 9f576d2..733e3fb 100644 --- src/hg/makeDb/revel/revelToWig-hg38.py +++ src/hg/makeDb/revel/revelToWig-hg38.py @@ -1,180 +1,146 @@ import subprocess, sys from collections import defaultdict #from array import array import numpy as np -import json # based on revelToWig.py in kent/src/hg/makeDb/scripts/revel/ # two arguments: chrom.sizes and input filename # Yes, the hg19 and hg38 scripts should be merged into one, but I didn't want to mess with the # hg19 one when it was done, so there are two scripts now. There are hardly ever updates to REVEL # so this hack may be fine for now. def parseChromSizes(chromSizesFname): chromSizes = {} for line in open(chromSizesFname): chrom, size = line.strip().split() - #if chrom!="chr21": - #continue size = int(size) chromSizes[chrom] = size return chromSizes def initValues(chromSizes, chrom): " return a list of four float arrays with chromSize length and initialized to -1" print('init arrays for chrom %s' % chrom) arrs = [] for n in range(0, 4): #a = array('f', chromSizes[chrom]*[-1]) a = np.full(chromSizes[chrom], -1, dtype=float) arrs.append(a) print('arrays done') return arrs -def outTransIds(chrom, pos, ref, prevTransIds, bedFh): - " output features of all (alt, score, transId) to bed file. We need to output all we have at this position " - #print(chrom, pos, ref, alt, revel, prevTransIds) - start = pos - chrom = "chr"+chrom - for alt, scoreTransList in prevTransIds.items(): - tableDict = {} - tableLines = [] - mouseOvers = [] - for revelScore, transIdStr in scoreTransList: - mouseOvers.append("transcript %s -> score %f" % (transIdStr, revelScore)) - #tableLines.append(transIdStr.replace(",", ", ")+"|"+str(revelScore)) - assert(transIdStr not in tableDict) - tableDict[transIdStr] = str(revelScore) - - #bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, ";".join(tableLines), ", ".join(mouseOvers)) - bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, json.dumps(tableDict), ", ".join(mouseOvers)) - bed = [str(x) for x in bed] - bedFh.write("\t".join(bed)) - bedFh.write("\n") - -#def inputLinePos(fname, chromSizes): - def inputLineChunk(fname, chromSizes): " read all values for one chrom and return as four arrays, -1 means 'no value'" # chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL values = [] - prevTransIds = defaultdict(list) + prevTransIds = {} if fname.endswith(".gz"): ifh = subprocess.Popen(['zcat', fname], stdout=subprocess.PIPE, encoding="ascii").stdout else: ifh = open(fname) - #chromValues = initValues(chromSizes, "chr1") - lastChrom = None - prevPos = None - prevRef = None - doTrans = False + chromValues = initValues(chromSizes, "chr1") + lastChrom = "1" for line in ifh: if line.startswith("chr"): continue row = line.rstrip("\n").split(",") chrom, hg19Pos, hg38Pos, ref, alt, aaRef, aaAlt, revel, transId = row transId = transId.replace(";", ",") # hgc outlink code uses , as separator pos = hg38Pos if pos==".": continue pos = int(pos)-1 # wiggle ascii is 1-based AAARRGH!! # but I keep everythin 0 based internally # the file has duplicate values in the hg38 column, but for different hg19 positions! revel = float(revel) - if pos!=prevPos: - - if doTrans: - outTransIds(lastChrom, prevPos, prevRef, prevTransIds, bedFh) - prevTransIds = defaultdict(list) - doTrans = False - - if chrom != lastChrom: - if lastChrom!=None: + if chrom != lastChrom and lastChrom!=None: yield lastChrom, chromValues lastChrom = chrom chromValues = initValues(chromSizes, "chr"+chrom) - prevTransIds = defaultdict(list) - doTrans = False + prevTransIds = {} nuclIdx = "ACGT".find(alt) + #chromValues[nuclIdx][pos] = revel oldVal = chromValues[nuclIdx][pos] if oldVal != -1 and oldVal != revel: - # OK, we know that we have at least two different revel values at this position for this alt allele now - # so we set a flag that for this position, we must output all the alts, their scores, and transIds - # also, we always use the worst score per position, when there are overlaps - doTrans = True + # OK, we know that we have two values at this position for this alt allele now + # so we write the original transcriptId and the current one to the BED file + start = pos + chrom = "chr"+chrom + bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, prevTransIds[(alt, oldVal)], oldVal) + bed = [str(x) for x in bed] + bedFh.write("\t".join(bed)) + bedFh.write("\n") + bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, transId, revel) + bed = [str(x) for x in bed] + bedFh.write("\t".join(bed)) + bedFh.write("\n") + # and only keep the maximum in the bigWig revel = max(oldVal, revel) chromValues[nuclIdx][pos] = revel - prevTransIds[alt].append((revel, transId)) - prevPos = pos - prevRef = ref - prevRevel = revel + prevTransIds[(alt, revel)] = transId # last line of file - if doTrans: - outTransIds(lastChrom, prevPos, prevRef, prevTransIds, bedFh) yield chrom, chromValues chromSizesFname = sys.argv[1] fname = sys.argv[2] outFhs = { 0 : open("a.wig", "w"), 1 : open("c.wig", "w"), 2 : open("g.wig", "w"), 3 : open("t.wig", "w") } chromSizes = parseChromSizes(chromSizesFname) bedFh = open("overlap.bed", "w") for chrom, nuclValues in inputLineChunk(fname, chromSizes): - #print(chrom, nuclValues, len(nuclValues[0])) if len(nuclValues)==0: continue # need to find positions that have no values at all: # so we can distinguish: "has no value" (not covered) with "is reference" # (value=0) arrSum = np.sum(nuclValues, axis=0) # positions without any data have now arrSum[i]==-4 for nucIdx in (0,1,2,3): arr = nuclValues[nucIdx] ofh = outFhs[nucIdx] lastVal = arr[0] stretch = [] stretchStart = 0 for i in range(0, arr.size): val = arr[i] hasData = (arrSum[i]!=-4) if val==-1 and hasData: if len(stretch)==0: stretchStart = i stretch.append(0) elif val!=-1: if len(stretch)==0: stretchStart = i stretch.append(val) else: # there is no data at all at this position, so flush the block if len(stretch)!=0: ofh.write("fixedStep chrom=chr%s span=1 step=1 start=%d\n" % (chrom, stretchStart+1)) for revel in stretch: ofh.write(str(revel)) ofh.write("\n") ofh.write("\n") stretchStart = None stretch = []