9e65e54f0f956ea44785df0427f0f457fcfbb34c
max
  Fri Dec 13 05:34:09 2024 -0800
fix for hg19 revel track, refs #34947

diff --git src/hg/makeDb/revel/revelToWig-hg38.py src/hg/makeDb/revel/revelToWig-hg38.py
deleted file mode 100644
index 733e3fb..0000000
--- src/hg/makeDb/revel/revelToWig-hg38.py
+++ /dev/null
@@ -1,146 +0,0 @@
-import subprocess, sys
-from collections import defaultdict
-#from array import array
-import numpy as np
-
-# based on revelToWig.py in kent/src/hg/makeDb/scripts/revel/
-# two arguments: chrom.sizes and input filename 
-
-# Yes, the hg19 and hg38 scripts should be merged into one, but I didn't want to mess with the
-# hg19 one when it was done, so there are two scripts now. There are hardly ever updates to REVEL
-# so this hack may be fine for now.
-
-def parseChromSizes(chromSizesFname):
-    chromSizes = {}
-    for line in open(chromSizesFname):
-        chrom, size = line.strip().split()
-        size = int(size)
-        chromSizes[chrom] = size
-    return chromSizes
-
-def initValues(chromSizes, chrom):
-    " return a list of four float arrays with chromSize length and initialized to -1"
-    print('init arrays for chrom %s' % chrom)
-    arrs = []
-    for n in range(0, 4):
-        #a = array('f', chromSizes[chrom]*[-1])
-        a = np.full(chromSizes[chrom], -1, dtype=float)
-        arrs.append(a)
-    print('arrays done')
-    return arrs
-
-def inputLineChunk(fname, chromSizes):
-    " read all values for one chrom and return as four arrays, -1 means 'no value'"
-    # chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL
-    values = []
-    prevTransIds = {}
-
-    if fname.endswith(".gz"):
-        ifh = subprocess.Popen(['zcat', fname], stdout=subprocess.PIPE, encoding="ascii").stdout
-    else:
-        ifh = open(fname)
-
-    chromValues = initValues(chromSizes, "chr1")
-    lastChrom = "1"
-    for line in ifh:
-        if line.startswith("chr"):
-            continue
-        row = line.rstrip("\n").split(",")
-
-        chrom, hg19Pos, hg38Pos, ref, alt, aaRef, aaAlt, revel, transId = row
-        transId = transId.replace(";", ",") # hgc outlink code uses , as separator
-        pos = hg38Pos
-        if pos==".":
-            continue
-
-        pos = int(pos)-1 # wiggle ascii is 1-based AAARRGH!!
-        # but I keep everythin 0 based internally
-        # the file has duplicate values in the hg38 column, but for different hg19 positions!
-        revel = float(revel)
-
-        if chrom != lastChrom and lastChrom!=None:
-            yield lastChrom, chromValues
-            lastChrom = chrom
-            chromValues = initValues(chromSizes, "chr"+chrom)
-            prevTransIds = {}
-
-        nuclIdx = "ACGT".find(alt)
-
-        #chromValues[nuclIdx][pos] = revel
-        oldVal = chromValues[nuclIdx][pos]
-        if oldVal != -1 and oldVal != revel:
-            # OK, we know that we have two values at this position for this alt allele now
-            # so we write the original transcriptId and the current one to the BED file
-            start = pos
-            chrom = "chr"+chrom
-            bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, prevTransIds[(alt, oldVal)], oldVal)
-            bed = [str(x) for x in bed]
-            bedFh.write("\t".join(bed))
-            bedFh.write("\n")
-            bed = (chrom, start, start+1, ref+">"+alt, "0", ".", start, start+1, transId, revel)
-            bed = [str(x) for x in bed]
-            bedFh.write("\t".join(bed))
-            bedFh.write("\n")
-            # and only keep the maximum in the bigWig
-            revel = max(oldVal, revel)
-
-        chromValues[nuclIdx][pos] = revel
-        prevTransIds[(alt, revel)] = transId
-
-
-    # last line of file
-    yield chrom, chromValues
-
-chromSizesFname = sys.argv[1]
-fname = sys.argv[2]
-
-outFhs = {
-        0 : open("a.wig", "w"),
-        1 : open("c.wig", "w"),
-        2 : open("g.wig", "w"),
-        3 : open("t.wig", "w")
-        }
-
-chromSizes = parseChromSizes(chromSizesFname)
-
-bedFh = open("overlap.bed", "w")
-
-for chrom, nuclValues in inputLineChunk(fname, chromSizes):
-    if len(nuclValues)==0:
-        continue
-
-    # need to find positions that have no values at all:
-    # so we can distinguish: "has no value" (not covered) with "is reference"
-    # (value=0)
-    arrSum = np.sum(nuclValues, axis=0)
-    # positions without any data have now arrSum[i]==-4
-
-    for nucIdx in (0,1,2,3):
-        arr = nuclValues[nucIdx]
-        ofh = outFhs[nucIdx]
-        lastVal = arr[0]
-        stretch = []
-        stretchStart = 0
-        for i in range(0, arr.size):
-            val = arr[i]
-            hasData = (arrSum[i]!=-4)
-
-            if val==-1 and hasData:
-                if len(stretch)==0:
-                    stretchStart = i
-                stretch.append(0)
-            elif val!=-1:
-                if len(stretch)==0:
-                    stretchStart = i
-                stretch.append(val)
-            else:
-                # there is no data at all at this position, so flush the block
-                if len(stretch)!=0:
-                    ofh.write("fixedStep chrom=chr%s span=1 step=1 start=%d\n" % (chrom, stretchStart+1))
-                    for revel in stretch:
-                        ofh.write(str(revel))
-                        ofh.write("\n")
-                    ofh.write("\n")
-                stretchStart = None
-                stretch = []
-