06d7be056190c14b85e71bc12523f18ea6815b5e markd Mon Dec 7 00:50:29 2020 -0800 BLAT mmap index support merge with master

diff --git src/utils/bigHeat src/utils/bigHeat
new file mode 100755
index 0000000..beb6a0b
--- /dev/null
+++ src/utils/bigHeat
@@ -0,0 +1,460 @@
+#!/usr/bin/env python
+
+import logging, sys, optparse, subprocess, os, math, re
+from collections import defaultdict
+from os.path import basename, isdir, join, splitext
+
+# made with: from matplotlib import cm; (cm.get_cmap("viridis")(x)*255)[:, 0:3].astype(int).tolist()
+viridisPal = [[68, 1, 84], [71, 18, 101], [72, 35, 116], [69, 52, 127], [64, 67, 135], [58, 82, 139], [52, 94, 141], [46, 107, 142], [41, 120, 142], [36, 132, 141], [32, 144, 140], [30, 155, 137], [34, 167, 132], [47, 179, 123], [68, 190, 112], [94, 201, 97], [121, 209, 81], [154, 216, 60], [189, 222, 38], [223, 227, 24], [253, 231, 36]]
+
+pal = None
+binCount = 20
+
+# can be set to something like '/gbdb/wuhCor1/pbm/IgG_Z-score-Healthy_Controls/' if your symlinks are in place
+bbDir = None
+
+# ==== functions =====
+def parseArgs():
+    " setup logging, parse command line arguments and options. -h shows auto-generated help page "
+    parser = optparse.OptionParser("""usage: %prog [options] locationBed locationMatrix chromSizes outDir - color BED features by a matrix, one subtrack per sample
+    Duplicate BED features and color them by the values in locationMatrix.
+    Creates new bigBed files in outDir and writes a basic trackDb.ra file there.
+
+    The BED file looks like this:
+
+    chr1 1 1000 myGene 0 + 1 1000 0,0,0
+    chr2 1 1000 myGene2 0 + 1 1000 0,0,0
+
+    The locationMatrix looks like this:
+
+    gene sample1 sample2 sample3
+    myGene 1 2 3
+    myGene2 0.1 3 10
+    myGene2_probe2 0.1 3 10
+
+    This will create a composite with three subtracks (sample1, sample2, sample3). Each subtrack will contain myGene,
+    colored by intensity, with sample3 more intense than sample1 and sample2. The same applies to myGene2.
+    Optionally also creates a bigWig with a summary of all these values, one value per nucleotide.
+    """)
+
+    parser.add_option("", "--scale", dest="scale", action="store", help="scale the values to 0-1.0 automatically. One of 'none' or 'row'. Default is 'none'. If 'none', the options --min/--max/--mult can be used to adjust the scaling manually", default='none')
+    parser.add_option("", "--min", dest="min", action="store", type="float", help="minimum limit when transforming values to colors. Can be used to remove the distorting effect of outliers.")
+    parser.add_option("", "--max", dest="max", action="store", type="float", help="maximum limit when transforming values to colors")
+    parser.add_option("", "--mult", dest="mult", action="store", help="Two comma-separated values multNeg,multPos. Negative numbers are multiplied by multNeg, all others by multPos, before anything else is done.")
+    parser.add_option("", "--palRange", dest="palRange", action="store", help="Two colon-separated float numbers in the range 0-1.0. By default, the color palette is used from its minimum 0.0 to its maximum 1.0. With this option, only part of the color palette is used, e.g. '0.1:0.9'. Only takes effect together with a non-default --cmap.")
+    parser.add_option("", "--log", dest="doLog", action="store_true", help="Take the log2 of all values in the matrix. Values <= 0 will be set to 0 and will also trigger a warning message.")
+    parser.add_option("", "--order", dest="doOrder", action="store_true", help="Reorder the rows with the optimal-leaf-ordering algorithm, like a real heatmap")
+    parser.add_option("", "--del", dest="delSamples", action="append", help="Remove these samples. A regex. Can be specified multiple times. Useful to remove outliers.")
+    parser.add_option("", "--cmap", dest="cmap", action="store", help="name of colormap. Requires matplotlib to be installed if set to something other than viridis.", default="viridis")
+    parser.add_option("-b", "--bbDir", dest="bbDir", action="store", help="when writing trackDb, prefix all bigDataUrl filenames with this. Use this with the /gbdb directory.")
+    parser.add_option("", "--bw", dest="bigWig", action="store", help="Also create a bigWig that summarizes all values per nucleotide. Can be one of: 'sum', 'respFreq'")
+    parser.add_option("-d", "--debug", dest="debug", action="store_true", help="show debug messages")
+    (options, args) = parser.parse_args()
+
+    if args==[]:
+        parser.print_help()
+        sys.exit(1)
+
+    if options.debug:
+        logging.basicConfig(level=logging.DEBUG)
+        logging.getLogger().setLevel(logging.DEBUG)
+    else:
+        logging.basicConfig(level=logging.INFO)
+        logging.getLogger().setLevel(logging.INFO)
+
+    return args, options
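+# A hypothetical invocation, to illustrate the positional arguments parsed
+# above (all file names here are made up, not part of this tool):
+#
+#   bigHeat --log --order --bw sum genes.bed matrix.tsv hg38.chrom.sizes outTracks/
+#
+# This would log2-transform the matrix values, reorder the samples like a
+# heatmap, write one colored bigBed per sample plus allSum.bw into outTracks/
+# and point a generated outTracks/trackDb.ra at them.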
+def parseBed(bedFname):
+    " parse a BED9 file and return a dict of name -> list of fields "
+    beds = {}
+    for line in open(bedFname):
+        row = line.rstrip("\n").split("\t")
+        assert(len(row)>=9)
+        row = row[:9]
+        beds[row[3]] = row
+    return beds
+
+def toBigBed(bedFnames, sampleOrder, chromSizes):
+    " convert to bigBed "
+    bbFnames = {}
+    for sampleName in sampleOrder:
+        fname = bedFnames[sampleName]
+        bbFname = splitext(fname)[0]+".bb"
+        bbFnames[sampleName] = bbFname
+        print("Converting %s to %s" % (fname, bbFname))
+        cmd = ["bedSort", fname, fname]
+        subprocess.check_call(cmd)
+        cmd = ["bedToBigBed", fname, chromSizes, bbFname, "-tab"]
+        subprocess.check_call(cmd)
+        os.remove(fname)
+    return bbFnames
+
+def makeTrackDb(bbFnames, sampleOrder, outDir, parentName):
+    " write a trackDb file to outDir "
+    global bbDir
+    tdbFn = join(outDir, "trackDb.ra")
+    tdbFh = open(tdbFn, "w")
+
+    label = parentName.replace("_", " ")
+    tdbFh.write("track %s\n" % parentName)
+    tdbFh.write("shortLabel %s\n" % label)
+    tdbFh.write("longLabel %s\n" % label)
+    tdbFh.write("type bed 9\n")
+    tdbFh.write("compositeTrack on\n")
+    tdbFh.write("visibility dense\n")
+    tdbFh.write("labelOnFeature on\n")
+    tdbFh.write("itemRgb on\n")
+    tdbFh.write("\n")
+
+    for prio, sampleName in enumerate(sampleOrder):
+        bbFname = bbFnames[sampleName]
+        trackName = splitext(basename(bbFname))[0]
+        trackLabel = trackName.replace("_", " ")
+        fullTrackName = parentName+"_"+trackName # avoid subtrack name clashes
+        if bbDir:
+            bbPath = join(bbDir, basename(bbFname))
+        else:
+            bbPath = bbFname
+        tdbFh.write("    track %s\n" % fullTrackName)
+        tdbFh.write("    shortLabel %s\n" % trackLabel)
+        tdbFh.write("    longLabel %s\n" % trackLabel)
+        tdbFh.write("    type bigBed 9\n")
+        tdbFh.write("    priority %d\n" % prio)
+        tdbFh.write("    bigDataUrl %s\n" % bbPath)
+        tdbFh.write("    parent %s on\n" % parentName)
+        tdbFh.write("\n")
+
+    if bbDir:
+        bwPath = join(bbDir, "allSum.bw")
+    else:
+        bwPath = join(outDir, "allSum.bw")
+
+    tdbFh.write("track %sAllSum\n" % parentName)
+    tdbFh.write("shortLabel %s Summary\n" % label)
+    tdbFh.write("longLabel %s Sum of all scores per nucleotide\n" % label)
+    tdbFh.write("type bigWig\n")
+    tdbFh.write("visibility full\n")
+    tdbFh.write("autoScale on\n")
+    tdbFh.write("maxHeightPixels 100:28:8\n")
+    tdbFh.write("bigDataUrl %s\n" % bwPath)
+    tdbFh.write("\n")
+
+    tdbFh.close()
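+# For illustration: with parentName "matrix", one sample "sample1" and no
+# --bbDir, the code above would produce a trackDb.ra roughly like this
+# (all names here are hypothetical):
+#
+#   track matrix
+#   shortLabel matrix
+#   longLabel matrix
+#   type bed 9
+#   compositeTrack on
+#   visibility dense
+#   labelOnFeature on
+#   itemRgb on
+#
+#       track matrix_sample1
+#       shortLabel sample1
+#       longLabel sample1
+#       type bigBed 9
+#       priority 0
+#       bigDataUrl outTracks/sample1.bb
+#       parent matrix on
+#
+# followed by the "matrixAllSum" bigWig stanza.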
+def optimalLeaf(fname):
+    " return the optimal order of the columns in fname (tsv) "
+    logging.info("Reordering leaves")
+    import pandas as pd
+    from scipy.cluster import hierarchy
+    t = pd.read_csv(fname, sep="\t", index_col=0)
+    mat = t.values
+    mat = mat.transpose()
+    Z = hierarchy.ward(mat)
+    order = hierarchy.leaves_list(hierarchy.optimal_leaf_ordering(Z, mat))
+    labels = t.columns
+    assert(len(labels)==len(order))
+    labelsOrdered = [labels[i] for i in list(order)]
+    return labelsOrdered
+
+def parseChromSizes(fname):
+    chromSizes = {}
+    for line in open(fname):
+        chrom, size = line.rstrip().split()
+        chromSizes[chrom] = int(size)
+    return chromSizes
+
+def makeBigWig(beds, nameValSum, chromSizeFname, bwFname):
+    " write a bigWig with the value for each bed to bwFname "
+    chromSizes = parseChromSizes(chromSizeFname)
+
+    nuclScores = {}
+    for chrom, size in chromSizes.items():
+        nuclScores[chrom] = [0.0]*size
+
+    for name, val in nameValSum.items():
+        if name not in beds:
+            name = name.split("_")[0]
+        if name not in beds:
+            logging.warning("Cannot map name %s to the genome, not found in the BED file." % name)
+            continue
+
+        chrom, start, end = beds[name][:3]
+        chromScores = nuclScores[chrom]
+
+        for i in range(int(start), int(end)):
+            chromScores[i] += val
+
+    wigFname = bwFname+".wig"
+    ofh = open(wigFname, "w")
+    for chrom, chromScores in nuclScores.items():
+        ofh.write("variableStep chrom=%s\n" % chrom)
+        for pos, score in enumerate(chromScores):
+            if score!=0.0:
+                ofh.write("%d %f\n" % (pos+1, score)) # variableStep coordinates are 1-based
+    ofh.close()
+    print("Created %s" % wigFname)
+
+    cmd = ["wigToBigWig", wigFname, chromSizeFname, bwFname]
+    subprocess.check_call(cmd)
+    print("Created %s" % bwFname)
+
+def calcRespFreq(sampleNames, vals):
+    " response frequency as defined in https://www.nature.com/articles/s41423-020-00523-5, supplemental methods "
+    import numpy as np
+    assert(len(sampleNames)==len(vals))
+    # sort values into control and disease
+    ctrlVals = []
+    disVals = []
+    for sampleName, val in zip(sampleNames, vals):
+        if "Ctrl" in sampleName:
+            ctrlVals.append(val)
+        else:
+            disVals.append(val)
+
+    mean = np.mean(vals)
+    stdev = np.std(ctrlVals)
+    cutoff = mean+(3*stdev)
+    logging.info("ctrl: %s" % ctrlVals)
+    logging.info("vals: %s" % disVals)
+    disHigher = [x for x in disVals if x > cutoff]
+    logging.info("higher: %s" % repr(disHigher))
+    respFreq = float(len(disHigher)) / len(disVals)
+    logging.info("Mean=%f, Stdev=%f, Cutoff=%f, gtThanCutoff=%d, totalCount=%d, resp frequency: %f" % (mean, stdev, cutoff, len(disHigher), len(disVals), respFreq))
+    assert(respFreq <= 1.0)
+    return respFreq
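+# A worked example of calcRespFreq, under the assumption (hard-coded above)
+# that control sample names contain "Ctrl":
+#   sampleNames = ["Ctrl1", "Ctrl2", "pat1", "pat2"], vals = [1, 3, 10, 2]
+#   ctrlVals = [1, 3], disVals = [10, 2]
+#   mean(all vals) = 4.0, std(ctrlVals) = 1.0, cutoff = 4.0 + 3*1.0 = 7.0
+#   only 10 > 7.0, so respFreq = 1/2 = 0.5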
+def bigHeat(beds, fname, chromSizes, scale, minVal, maxVal, mult, doLog, doOrder, delSamples, sumFunc, outDir):
+    " create bed files, convert them to bigBeds and make a trackDb.ra in the output directory "
+    global bbDir
+    global pal
+
+    if pal is None:
+        pal = viridisPal
+
+    colorCount = len(pal)
+    colSteps = 1.0/colorCount
+    palStrs = [ ",".join([str(x) for x in col]) for col in pal]
+
+    multNeg, multPos = None, None
+    if mult:
+        x, y = mult.split(",")
+        multNeg = float(x)
+        multPos = float(y)
+
+    baseIn = basename(fname).split(".")[0]
+    if not isdir(outDir):
+        os.makedirs(outDir)
+
+    print("Processing %s to output directory %s" % (fname, outDir))
+    ofhs = {}
+    data = []
+
+    useGlobal = False
+    globalMin, globalMax = 9999999999, -999999999
+
+    sampleNames = None
+    logFailCount = 0
+    for line in open(fname):
+        row = line.rstrip("\n").split("\t")
+
+        if sampleNames is None:
+            sampleNames = row[1:]
+            # create output file handles, one per sample
+            for sn in sampleNames:
+                ofn = join(outDir, sn+".bed")
+                ofhs[sn] = open(ofn, "w")
+            continue
+        if row[0].startswith("Seq.") or "protein" in row[0] or row[0]=="":
+            continue
+
+        name = row[0]
+        values = row[1:]
+        assert(len(values)==len(sampleNames))
+        nums = [float(x) for x in values]
+
+        data.append( (name, nums) )
+
+        if scale=="none":
+            globalMin = min(globalMin, *nums)
+            globalMax = max(globalMax, *nums)
+            useGlobal = True
+
+    if minVal is not None:
+        globalMin = minVal
+    if maxVal is not None:
+        globalMax = maxVal
+
+    if useGlobal or minVal is not None or maxVal is not None:
+        globalSpan = globalMax-globalMin
+        print("Global min/max/span values:", globalMin, globalMax, globalSpan)
+
+    nameVals = defaultdict(list)
+
+    for name, nums in data:
+        # calculate the summary on the raw values and store it
+        if name in nameVals:
+            logging.warning("%s already seen before. Will use the mean of all values. Check the input file." % name)
+        if sumFunc=="sum":
+            nameVals[name].append(sum(nums))
+        elif sumFunc=="respFreq":
+            nameVals[name].append(calcRespFreq(sampleNames, nums))
+
+        # first take the log
+        if doLog:
+            newNums = []
+            for x in nums:
+                if x <= 0:
+                    logFailCount += 1
+                    x = 0
+                else:
+                    x = math.log2(x)
+                newNums.append(x)
+            nums = newNums
+
+        # then scale by user-provided factors
+        if multNeg is not None:
+            newNums = []
+            for x in nums:
+                if x < 0:
+                    x = x*multNeg
+                else:
+                    x = x*multPos
+                newNums.append(x)
+            nums = newNums
+
+        if useGlobal:
+            minX = globalMin
+            maxX = globalMax
+            span = globalSpan
+            # clamp the values to the range globalMin - globalMax
+            nums = [min(max(globalMin, x), globalMax) for x in nums]
+        else:
+            minX = min(nums)
+            maxX = max(nums)
+            span = maxX - minX
+
+        if span == 0:
+            span = 1.0 # all values identical: avoid division by zero, everything falls into the first bin
+
+        # then bin the values and look up their colors
+        logging.debug("%s - numerical values: %s"% (name, nums))
+        ratios = [(x - minX)/span for x in nums]
+        logging.debug("%s - scaled values: %s"% (name, ratios))
+        ratios = [min(0.999999, x) for x in ratios] # make sure that 1.0 does not appear
+        binIdxList = [int(x / colSteps) for x in ratios]
+        logging.debug("%s - bin indices: %s"% (name, binIdxList))
+        colors = [palStrs[x] for x in binIdxList]
+
+        if name in beds:
+            bed = beds[name]
+        else:
+            newName = name.split("_")[0]
+            if newName not in beds:
+                logging.error("No entry in BED file for name %s" % repr(name))
+                continue
+            bed = beds[newName]
+
+        for sampleName, rgbCol in zip(sampleNames, colors):
+            bed[8] = rgbCol
+            ofh = ofhs[sampleName]
+            ofh.write("\t".join(bed))
+            ofh.write("\n")
+
+    if doLog and logFailCount > 0:
+        logging.warning("Could not take the log of values <= 0, used 0 instead, in %d cases" % logFailCount)
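+# A worked example of the binning above, with the default 21-color viridis
+# palette (colorCount=21, colSteps=1/21) and a row scaled from min=0 to max=10:
+#   value 7 -> ratio 0.7 -> bin index int(0.7*21) = 14 -> palStrs[14] = "68,190,112"
+# so the BED copy of that feature in that sample's subtrack gets the itemRgb
+# 68,190,112, a mid-palette green.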
+    # close the bed files
+    bedFnames = {}
+    for sampleName, ofh in ofhs.items():
+        bedFnames[sampleName] = ofh.name
+        ofh.close()
+
+    # reorder sample names with optimal leaf ordering
+    sampleOrder = sampleNames
+    if doOrder:
+        sampleOrder = optimalLeaf(fname)
+        logging.debug("Sample order: %s" % sampleOrder)
+        assert(set(sampleOrder)==set(sampleNames))
+
+    # remove a few samples if the user asked for that
+    if delSamples and len(delSamples)>0:
+        logging.info("Before filtering: %d sample names" % len(sampleOrder))
+        ros = [re.compile(reStr) for reStr in delSamples]
+
+        newSampleNames = []
+        for sn in sampleOrder:
+            skipThis = False
+            for ro in ros:
+                if ro.match(sn) is not None:
+                    logging.info("Removed sample %s due to --del" % sn)
+                    skipThis = True
+                    break
+            if not skipThis:
+                newSampleNames.append(sn)
+        sampleOrder = newSampleNames
+        logging.info("After filtering: %d sample names" % len(sampleOrder))
+
+    bwFname = join(outDir, "allSum.bw")
+
+    if sumFunc is not None:
+        nameValSum = {}
+        for name, vals in nameVals.items():
+            nameValSum[name] = sum(vals)/float(len(vals))
+        makeBigWig(beds, nameValSum, chromSizes, bwFname)
+
+    bbFnames = toBigBed(bedFnames, sampleOrder, chromSizes)
+
+    makeTrackDb(bbFnames, sampleOrder, outDir, baseIn)
+
+# ----------- main --------------
+def main():
+    global bbDir
+    global pal
+    args, options = parseArgs()
+
+    bedFname = args[0]
+    matrixFname = args[1]
+    chromSizes = args[2]
+    outDir = args[3]
+
+    bbDir = options.bbDir
+    scale = options.scale
+    cmap = options.cmap
+    minVal = options.min
+    maxVal = options.max
+    mult = options.mult
+    palRange = options.palRange
+    doLog = options.doLog
+    doOrder = options.doOrder
+    delSamples = options.delSamples
+    sumFunc = options.bigWig
+
+    if cmap!="viridis":
+        from matplotlib import cm
+        import numpy as np
+        if palRange:
+            palStart, palEnd = palRange.split(":")
+            palStart = float(palStart)
+            palEnd = float(palEnd)
+        else:
+            palStart = 0.0
+            palEnd = 1.0
+        x = np.linspace(palStart, palEnd, binCount+1) # e.g. [0.0, 0.05, 0.1, ..., 1.0] for binCount=20
+        pal = (cm.get_cmap(cmap)(x)*255)[:, 0:3].astype(int).tolist()
+
+    beds = parseBed(bedFname)
+    bigHeat(beds, matrixFname, chromSizes, scale, minVal, maxVal, mult, doLog, doOrder, delSamples, sumFunc, outDir)
+
+main()