87d5b93cb522e7bfa4a66ae41f93e94394d2e1d1
chmalee
  Fri Sep 18 14:34:02 2020 -0700
Stagin gnomad pext track, refs #25869

diff --git src/hg/makeDb/gnomad/buildPext.py src/hg/makeDb/gnomad/buildPext.py
new file mode 100755
index 0000000..5ff515c
--- /dev/null
+++ src/hg/makeDb/gnomad/buildPext.py
@@ -0,0 +1,171 @@
+#!/usr/bin/env python3
+
+"""Convert gnomAD pext values to one bedGraph per tissue"""
+
+import sys,argparse,os,gzip,math,signal
+from collections import namedtuple,defaultdict,OrderedDict
+
+defaultDir = os.getcwd() + "/output"
+
+bed5 = ["chrom", "start", "end", "gene", "score"]
+bedRange = namedtuple("bedRange", bed5)
+
+def setupCommandLine():
+    parser = argparse.ArgumentParser(description="Convert gnomAD pext values per gene into per tissue bedGraphs")
+    parser.add_argument("infile", help="Input optionally gzipped tab separated pext values, the first line of this file must contain all the different tissues. To read from stdin use 'stdin'")
+    parser.add_argument("-o", "--outdir", default=defaultDir, help="Output directory for each bedGraph, defaults to './output'")
+    if len(sys.argv) == 1:
+        parser.print_help()
+        sys.exit(1)
+    args = parser.parse_args()
+    return args
+
+def writeExonScoresToFile(tissueFhs, tissues, tissueRegionScores):
+    """Write a per tissue bedGraph line"""
+    for (ix,tissue) in enumerate(tissueRegionScores):
+        for region in tissueRegionScores[tissue]:
+            string = "%s\t%s\t%s\t%0.5f\n" % (region.chrom, region.start, region.end, tissueRegionScores[tissue][region])
+            tissueFhs[ix].write(string)
+
+def setupOutput(outdir,line):
+    """Set up the output files we will be writing to, and a signal handler for trapping ctrl-c"""
+    header = line.strip()
+    tissues = header.split('\t')[3:]
+    tissueFhs = []
+    if not outdir.endswith("/"):
+        outdir += "/"
+    for t in tissues:
+        tissueFh = open(outdir + t + ".bed", "w+")
+        tissueFhs.append(tissueFh)
+    return outdir,tissues,tissueFhs
+
+def writePreviousRangeScores(tissueRangeScores, tissueList, geneList, tIndex, tissueFhs):
+    """For each tissue in tissueList, write a new bedGraph line and delete the entry."""
+    if tIndex == -1:
+        for ix,tissue in enumerate(tissueRangeScores):
+            if geneList is None:
+                for gene in tissueRangeScores[tissue]:
+                    rangeScore = tissueRangeScores[tissue][gene]
+                    tissueFhs[ix].write("%s\t%s\t%s\t%s\t%0.5f\n" % (rangeScore.chrom,rangeScore.start,rangeScore.end, gene, rangeScore.score))
+            else:
+                for gene in geneList:
+                    rangeScore = tissueRangeScores[tissue][gene]
+                    tissueFhs[ix].write("%s\t%s\t%s\t%s\t%0.5f\n" % (rangeScore.chrom,rangeScore.start,rangeScore.end, gene, rangeScore.score))
+                    
+    else:
+        if geneList is None:
+            for gene in tissueRangeScores[tissueList[0]]:
+                rangeScore = tissueRangeScores[tissueList[0]][gene]
+                tissueFhs[tIndex].write("%s\t%s\t%s\t%s\t%0.5f\n" % (rangeScore.chrom,rangeScore.start,rangeScore.end, gene, rangeScore.score))
+        else:
+            for gene in geneList:
+                rangeScore = tissueRangeScores[tissueList[0]][gene]
+                tissueFhs[tIndex].write("%s\t%s\t%s\t%s\t%0.5f\n" % (rangeScore.chrom,rangeScore.start,rangeScore.end, gene, rangeScore.score))
+
+def addToOrStartNewRange(oldRange, tissueRangeScores, newPosition, vals, tissues, tissueFhs):
+    """For all the tissues in vals, figure out if we are still in the same exon
+            and adding to the previous range/score combo, or if we are in the same
+            exon but with a different score, or a new exon altogether.
+       Return: The new range (either an expanded oldRange or brand new range if a new exon)"""
+    newRange = oldRange
+    if not oldRange:
+        newRange = newPosition
+        for t,tissue in enumerate(tissueRangeScores):
+            tissueRangeScores[tissue][newPosition.gene] = newPosition._replace(score=float(vals[t]))
+    else:
+        if newPosition.chrom != oldRange.chrom:
+            #brand new exon/gene
+            writePreviousRangeScores(tissueRangeScores, tissues, None, -1, tissueFhs)
+            for t,tissue in enumerate(tissues):
+                newBed = newPosition._replace(score=float(vals[t]))
+                tissueRangeScores[tissue].clear()
+                tissueRangeScores[tissue][newPosition.gene] = newBed
+            newRange = newPosition
+        else:
+            if newPosition.start > oldRange.end:
+                # new exon
+                writePreviousRangeScores(tissueRangeScores,tissues, None, -1, tissueFhs)
+                for t,tissue in enumerate(tissues):
+                    newPosition = newPosition._replace(score=float(vals[t]))
+                    tissueRangeScores[tissue].clear()
+                    tissueRangeScores[tissue][newPosition.gene] = newPosition
+                newRange = newPosition
+            else:
+                # figure out whether to extend old record or maybe add a new one
+                newRange = oldRange._replace(end=newPosition.end)
+                if newPosition.gene not in oldRange.gene:
+                    # new gene that overlaps exons, disallow
+                    newRange = oldRange._replace(gene=oldRange.gene+[newPosition.gene])
+                    for t,tissue in enumerate(tissues):
+                        for gene in oldRange.gene:
+                            if gene in tissueRangeScores[tissue]:
+                                del tissueRangeScores[tissue][gene]
+                elif len(oldRange.gene) == 1:
+                    for t,tissue in enumerate(tissues):
+                        oldRange = tissueRangeScores[tissue][newPosition.gene]
+                        newScore = float(vals[t])
+                        if oldRange.score == newScore or (math.isnan(newScore) and math.isnan(oldRange.score)):
+                            temp = oldRange._replace(end=newPosition.end)
+                            tissueRangeScores[tissue][newPosition.gene] = temp
+                        else:
+                            # write and clear old for this gene, then add new
+                            writePreviousRangeScores(tissueRangeScores, [tissue], [newPosition.gene], -1, tissueFhs)
+                            tissueRangeScores[tissue][newPosition.gene] = newPosition._replace(score=newScore)
+    if type(newRange.gene) is not list:
+        newRange = newRange._replace(gene=[newRange.gene])
+    return newRange
+
+def buildPext(inFh, outdir):
+    """Find the list of tissues from the first line in the file, then make per tissue bedGraphs"""
+    currentRange = None
+    currentGene = None
+    tissues = []
+    tissueFhs = []
+    def handler(signum, frame):
+        """A helper function to close open files if a ctrl-c is sent"""
+        print("Keyboard interrupt: closing open files.")
+        for fh in tissueFhs:
+            fh.close()
+        sys.exit(1)
+
+    # most tissues will have a constant score across a majority of the exon
+    # so use this structure to compact the scores down from base level to a region level
+    tissueRegionScores = {}
+    for line in inFh:
+        if "str" not in str(type(line)):
+            line = line.decode("ASCII")
+        if line.startswith("ensg"):
+            outdir, tissues, tissueFhs = setupOutput(outdir, line)
+            signal.signal(signal.SIGINT, handler)
+            tissueRegionScores = OrderedDict({t : {} for t in tissues})
+            continue
+        fields = line.strip().split('\t')
+        ensg = fields[0]
+        pos = fields[1]
+        gene = fields[2]
+        vals = fields[3:]
+        chrom,position = pos.split(':')
+        # use ensg as gene because sometimes the same gene has mulitple ensg ids (EFNA3)
+        thisRange = bedRange._make(["chr"+chrom,int(position)-1,int(position),ensg,-1])
+        currentRange = addToOrStartNewRange(currentRange, tissueRegionScores, thisRange, vals, tissues, tissueFhs)
+    # write the final line:
+    writePreviousRangeScores(tissueRegionScores, tissueRegionScores.keys(),None,-1,tissueFhs)
+
+def main():
+    args = setupCommandLine()
+    try:
+        os.mkdir(args.outdir)
+    except FileExistsError:
+        pass
+    if args.infile != "stdin":
+        if args.infile[-4:] == ".bgz" or args.infile[-3:] == ".gz":
+            with gzip.open(args.infile, "rb") as f:
+                buildPext(f, args.outdir)
+        else:
+            with open(args.infile) as f:
+                buildPext(f, args.outdir)
+    else:
+        buildPext(sys.stdin, args.outdir)
+
+if __name__ == "__main__":
+    main()