src/hg/makeDb/revel/revelToWig.py 9e65e54f0f956ea44785df0427f0f457fcfbb34c

9e65e54f0f956ea44785df0427f0f457fcfbb34c
max
  Fri Dec 13 05:34:09 2024 -0800
fix for hg19 revel track, refs #34947

diff --git src/hg/makeDb/revel/revelToWig.py src/hg/makeDb/revel/revelToWig.py
new file mode 100644
index 0000000..6dffeae
--- /dev/null
+++ src/hg/makeDb/revel/revelToWig.py
@@ -0,0 +1,187 @@
+import subprocess, sys
+from collections import defaultdict
+#from array import array
+import numpy as np
+import json
+
+# based on revelToWig.py in kent/src/hg/makeDb/scripts/revel/
+# two arguments: chrom.sizes and input filename 
+
+# Yes, the hg19 and hg38 scripts should be merged into one, but I didn't want to mess with the
+# hg19 one when it was done, so there are two scripts now. There are hardly ever updates to REVEL
+# so this hack may be fine for now.
+
+def parseChromSizes(chromSizesFname):
+    chromSizes = {}
+    for line in open(chromSizesFname):
+        chrom, size = line.strip().split()
+        #if chrom!="chr21":
+            #continue
+        size = int(size)
+        chromSizes[chrom] = size
+    return chromSizes
+
+def initValues(chromSizes, chrom):
+    " return a list of four float arrays with chromSize length and initialized to -1"
+    print('init arrays for chrom %s' % chrom)
+    arrs = []
+    for n in range(0, 4):
+        #a = array('f', chromSizes[chrom]*[-1])
+        a = np.full(chromSizes[chrom], -1, dtype=float)
+        arrs.append(a)
+    print('arrays done')
+    return arrs
+
+def outTransIds(chrom, pos, ref, prevTransIds, bedFh):
+    " output features of all (alt, score, transId) to bed file. We need to output all we have at this position "
+    #print(chrom, pos, ref, alt, revel, prevTransIds)
+    start = pos
+    chrom = "chr"+chrom
+    for alt, scoreTransList in prevTransIds.items():
+        tableDict = {}
+        tableLines = []
+        mouseOvers = []
+        allScores = set()
+        for revelScore, transIdStr in scoreTransList:
+            mouseOvers.append("transcript %s -> score %f" % (transIdStr, revelScore))
+            #tableLines.append(transIdStr.replace(",", ", ")+"|"+str(revelScore))
+            assert(transIdStr not in tableDict)
+            tableDict[transIdStr] = str(revelScore)
+            allScores.add(revelScore)
+
+        # ONLY output the line if we have different scores = there is no need to output a feature if all the scores are the same
+        if len(allScores)>1:
+            #bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, ";".join(tableLines), ", ".join(mouseOvers))
+            bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, json.dumps(tableDict), ", ".join(mouseOvers))
+            bed = [str(x) for x in bed]
+            bedFh.write("\t".join(bed))
+            bedFh.write("\n")
+
+def inputLineChunk(fname, chromSizes, db):
+    " read all values for one chrom and return as four arrays, -1 means 'no value'"
+    # chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL
+    values = []
+    prevTransIds = defaultdict(list)
+
+    if fname.endswith(".gz"):
+        ifh = subprocess.Popen(['zcat', fname], stdout=subprocess.PIPE, encoding="ascii").stdout
+    else:
+        ifh = open(fname)
+
+    #chromValues = initValues(chromSizes, "chr1")
+    lastChrom = None
+    prevPos = None
+    prevRef = None
+    doTrans = False
+    for line in ifh:
+        if line.startswith("chr"):
+            continue
+        row = line.rstrip("\n").split(",")
+
+        chrom, hg19Pos, hg38Pos, ref, alt, aaRef, aaAlt, revel, transId = row
+        transId = transId.replace(";", ",") # hgc outlink code uses , as separator
+        if db=="hg38":
+            pos = hg38Pos
+        else:
+            pos = hg19Pos
+
+        if pos==".":
+            continue
+
+        pos = int(pos)-1 # wiggle ascii is 1-based AAARRGH!!
+        # but I keep everythin 0 based internally
+        # the file has duplicate values in the hg38 column, but for different hg19 positions!
+        revel = float(revel)
+
+        if pos!=prevPos:
+
+            if doTrans:
+                outTransIds(lastChrom, prevPos, prevRef, prevTransIds, bedFh)
+            prevTransIds = defaultdict(list)
+            doTrans = False
+
+        if chrom != lastChrom:
+            if lastChrom!=None:
+                yield lastChrom, chromValues
+            lastChrom = chrom
+            chromValues = initValues(chromSizes, "chr"+chrom)
+            prevTransIds = defaultdict(list)
+            doTrans = False
+
+        nuclIdx = "ACGT".find(alt)
+
+        oldVal = chromValues[nuclIdx][pos]
+        if oldVal != -1 and oldVal != revel:
+            # OK, we know that we have at least two different revel values at this position for this alt allele now
+            # so we set a flag that for this position, we must output all the alts, their scores, and transIds
+            # also, we always use the worst score per position, when there are overlaps
+            doTrans = True
+            revel = max(oldVal, revel)
+
+        chromValues[nuclIdx][pos] = revel
+        prevTransIds[alt].append((revel, transId))
+        prevPos = pos
+        prevRef = ref
+        prevRevel = revel
+
+
+    # last line of file
+    if doTrans:
+        outTransIds(lastChrom, prevPos, prevRef, prevTransIds, bedFh)
+    yield chrom, chromValues
+
+chromSizesFname = sys.argv[1]
+fname = sys.argv[2]
+db = sys.argv[3]
+
+outFhs = {
+        0 : open("a.wig", "w"),
+        1 : open("c.wig", "w"),
+        2 : open("g.wig", "w"),
+        3 : open("t.wig", "w")
+        }
+
+chromSizes = parseChromSizes(chromSizesFname)
+
+bedFh = open("overlap.bed", "w")
+
+for chrom, nuclValues in inputLineChunk(fname, chromSizes, db):
+    #print(chrom, nuclValues, len(nuclValues[0]))
+    if len(nuclValues)==0:
+        continue
+
+    # need to find positions that have no values at all:
+    # so we can distinguish: "has no value" (not covered) with "is reference"
+    # (value=0)
+    arrSum = np.sum(nuclValues, axis=0)
+    # positions without any data have now arrSum[i]==-4
+
+    for nucIdx in (0,1,2,3):
+        arr = nuclValues[nucIdx]
+        ofh = outFhs[nucIdx]
+        lastVal = arr[0]
+        stretch = []
+        stretchStart = 0
+        for i in range(0, arr.size):
+            val = arr[i]
+            hasData = (arrSum[i]!=-4)
+
+            if val==-1 and hasData:
+                if len(stretch)==0:
+                    stretchStart = i
+                stretch.append(0)
+            elif val!=-1:
+                if len(stretch)==0:
+                    stretchStart = i
+                stretch.append(val)
+            else:
+                # there is no data at all at this position, so flush the block
+                if len(stretch)!=0:
+                    ofh.write("fixedStep chrom=chr%s span=1 step=1 start=%d\n" % (chrom, stretchStart+1))
+                    for revel in stretch:
+                        ofh.write(str(revel))
+                        ofh.write("\n")
+                    ofh.write("\n")
+                stretchStart = None
+                stretch = []
+