src/hg/utils/aaToGenomic.py de2c47dedae1ebeb49de836beddb9bc6b4ca0ce0

de2c47dedae1ebeb49de836beddb9bc6b4ca0ce0
chmalee
  Thu Jan 16 09:20:22 2020 -0800
Moving gnomad scripts to their own directory under makeDb after Max email

diff --git src/hg/utils/aaToGenomic.py src/hg/utils/aaToGenomic.py
deleted file mode 100755
index 734efc4..0000000
--- src/hg/utils/aaToGenomic.py
+++ /dev/null
@@ -1,335 +0,0 @@
-#!/usr/bin/env python3
-
-"""
-Given a 1-based amino acid range, and a bed12 of transcripts,
-convert the amino acid range to a genomic range with exon blocks.
-
-This program is currently specialized to the gnomad missense constraint
-track, but can be more generalized if the score conversion part is
-commented out.
-
-Example:
-#transcript gene chrom range genomic_start genomic_end obs_mis exp_mis obs_exp chiseq_null_diff region
-ENST00000337907.3   RERE    1   1-507   8716356 8424825 97  197.9807    0.489947    51.505535   RERE_1
-
-chr1    8424824 8716356 ENST00000337907.3   0.49    -   8424824 8716356 224,165,8   13  74,163,81,99,100,125,49,105,97,106,126,71,325   0,1047,57962,101160,130298,132640,143861,176448,191709,192652,249795,259544,291207
-
-Here the itemRgb column corresponds to the color scheme used at the decipher browser.
-"""
-
-import sys
-import argparse
-from collections import defaultdict,namedtuple
-from scipy.stats import chi2
-
-def commandLine():
-    parser = argparse.ArgumentParser(
-        description="Converts amino acids ranges to genomic coordinates, writes to stdout. One of transcripts or aaRanges can be stdin",
-        prefix_chars="-", usage="%(prog)s [options]", add_help=True)
-    parser.add_argument("transcripts", action="store", help="bed12 of transcripts, use '-' for stdin")
-    parser.add_argument("aaRanges", action="store", help="aa range file, use '-' for stdin")
-    parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Turn verbose logging on, writes to stderr")
-
-    args = parser.parse_args()
-    if args.transcripts == "-" and args.aaRanges == "-":
-        sys.stderr.write("ERROR: Only one of transcripts and aaRanges can be stdin")
-        sys.exit(1)
-
-    return args
-
-# struct for keeping track of transcripts and their exon coordinates and sizes
-# use a defaultdict of list so PAR regions can have the different transcripts
-txDict = defaultdict(list)
-
-bedFields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand",
-"thickStart", "thickEnd", "itemRgb", "exonCount", "exonSizes", "exonStarts"]
-# autoSql map of the output bed for a conversion to bigBed later:
-bedInfo = namedtuple('bedFields',  bedFields)
-
-def log(msg):
-    """Write message to stderr instead of stdout."""
-    sys.stderr.write(msg + "\n")
-
-def logBedRecord(ret):
-    sys.stderr.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ret.chrom,
-        ret.chromStart, ret.chromEnd, ret.name, ret.score, ret.strand, ret.thickStart,
-        ret.thickEnd, ret.itemRgb, ret.exonCount, ",".join([str(x) for x in ret.exonSizes]),
-        ",".join([str(x) for x in ret.exonStarts])))
-
-def printBedRecord(ret):
-    print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (ret.chrom, ret.thickStart,
-        ret.thickEnd, ret.name, ret.score, ret.strand, ret.thickStart, ret.thickEnd, ret.itemRgb,
-        ret.exonCount, ",".join([str(x) for x in ret.exonSizes]),
-        ",".join([str(x) for x in ret.exonStarts])))
-
-def bedToStr(ret):
-    """Return the bedFields object as a string"""
-    return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (ret.chrom,
-        ret.chromStart, ret.chromEnd, ret.name, ret.score, ret.strand, ret.thickStart,
-        ret.thickEnd, ret.itemRgb, ret.exonCount, ",".join([str(x) for x in ret.exonSizes]),
-        ",".join([str(x) for x in ret.exonStarts]))
-
-def sanityCheckBed12(ret, original=None):
-    """Check that a bed12 follows the usual bed rules. Exit if a bad record"""
-    def logOriginal():
-        # helper function for logging the input bed line
-        if original:
-            log("original bed")
-            logBedRecord(original)
-
-    if len(ret.exonSizes) != int(ret.exonCount):
-        logOriginal()
-        log("ERROR: len(exonSizes) != exonCount")
-        logBedRecord(ret)
-        sys.exit(1)
-    if len(ret.exonStarts) != int(ret.exonCount):
-        logOriginal()
-        log("ERROR: len(exonStarts) != exonCount")
-        logBedRecord(ret)
-        sys.exit(1)
-
-    try:
-        for i in range(ret.exonCount):
-            # check ascending order of blocks
-            if i > 0:
-                if int(ret.exonStarts[i]) <= (int(ret.exonStarts[i-1]) + int(ret.exonSizes[i-1])):
-                    logOriginal()
-                    log("ERROR: exon blocks must be in ascending order. block %d and %d overlap." %
-                        (i-1, i))
-                    logBedRecord(ret)
-                    sys.exit(1)
-            # check for no zero-length blocks
-            # zero length blocks are ok in general but not for this bed
-            if (ret.exonStarts[i] < 0 or ret.exonSizes[i] <= 0 or
-                    int(ret.chromStart) + int(ret.exonStarts[i]) >= int(ret.chromEnd)):
-                logOriginal()
-                log("ERROR: invalid exonStart or exonSize")
-                logBedRecord(ret)
-                sys.exit(1)
-        # check that blocks span chromStart to chromEnd
-        if (ret.chromStart + ret.exonSizes[-1] + ret.exonStarts[-1] != ret.chromEnd):
-            logOriginal()
-            log("ERROR: chromStart + exonSizes[-1] + exonStarts[-1] != chromEnd")
-            log("%d + %d + %d (%d) != %d" % (ret.chromStart, ret.exonSizes[-1], ret.exonStarts[-1],
-                ret.chromStart + ret.exonStarts[-1] + ret.exonSizes[-1], ret.chromEnd))
-            logBedRecord(ret)
-            sys.exit(1)
-    except IndexError:
-        logOriginal()
-        log("ERROR: no exonSizes or exonStarts")
-        logBedRecord(ret)
-        sys.exit(1)
-
-def checkAaLenToBlockLen(ret, original, cdnaEndPos, cdnaStartPos, aaStart, aaEnd):
-    """Extra check for this bed file that the total bases covered by exons equals the length
-        of the cdna."""
-    if (sum(ret.exonSizes) != cdnaEndPos - cdnaStartPos):
-        log("ERROR: sum(blockSizes) != cdnaLen: %d != %d for %s:%d-%d" %
-            (sum(ret.exonSizes), cdnaEndPos - cdnaStartPos, ret.name, aaStart,aaEnd))
-        log("original bed:")
-        logBedRecord(original)
-        logBedRecord(ret)
-        sys.exit(1)
-
-def lineToBed(fields, lineNum=None):
-    """Transform bed12 line to namedtuple bedFields."""
-    assert(len(fields) == 12)
-    chrom = fields[0]
-    chromStart = int(fields[1])
-    chromEnd = int(fields[2])
-    name = fields[3]
-    score = fields[4]
-    strand = fields[5]
-    thickStart = int(fields[6])
-    thickEnd = int(fields[7])
-    itemRgb = fields[8]
-    exonCount = int(fields[9])
-    exonSizes = [int(x) for x in fields[10].strip(',').split(",")]
-    exonStarts = [int(x) for x in fields[11].strip(',').split(",")]
-    ret = bedInfo(chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd,
-        itemRgb, exonCount, exonSizes, exonStarts)
-    sanityCheckBed12(ret)
-    return ret
-
-def findColor(score, chi2val):
-    """Return the right color for this region based on the missense score."""
-    # this is decipher's color scheme
-    if chi2.sf(chi2val,1) > 0.001:
-        return "160,160,160"
-    if score < 0.2:
-        return "253,0,2"
-    elif score >= 0.2 and score < 0.4:
-        return "233,127,5"
-    elif score >= 0.4 and score < 0.6:
-        return "224,165,8"
-    elif score >= 0.6 and score < 0.8:
-        return "127,233,58"
-    elif score >= 0.8:
-        return "0,244,153"
-
-def txToDict(txFh, verbose=False):
-    """Make a dict of all the transcripts for lookup later."""
-    lineCount = 1
-    for line in txFh:
-        splitLine = line.strip().split('\t')
-        bed = lineToBed(splitLine, lineCount)
-        txDict[splitLine[3]].append(bed)
-        lineCount += 1
-
-def aaToBedNegStrand(aaStart, aaEnd, score, transcript, lineNum):
-    """The negative strand conversion is slightly different than the more intuitive positive
-        strand conversion. Mostly the difference is because you have to walk over the exons
-        backwards and swap chromStart and ends. So cdnaStartPos and cdnaEndPos have the
-        opposite meaning, cdnaStartPos corresponds to a bed's chromEnd, etc."""
-    cdnaStartPos = (aaStart - 1) * 3 # 0-based start
-    cdnaEndPos = (aaEnd * 3) # 0-based half open end
-    chromStart = None
-    chromEnd = None
-    cdnaStartExon = 0
-    cdnaEndExon = 0
-    length = 0
-    outStarts = []
-    outSizes = []
-    for i in range(transcript.exonCount-1,-1,-1):
-        exonStartPos = transcript.chromStart + transcript.exonStarts[i]
-        exonEndPos = exonStartPos + transcript.exonSizes[i]
-        if chromEnd is None and cdnaStartPos < sum(transcript.exonSizes[i:]):
-            cdnaStartExon = i
-            # subtract a little bit from the right side of the exon block
-            # if you were looking at the exon in the browser. the sum
-            # takes care of when we start in say the 4th exon out of 12 or something
-            chromEnd = transcript.chromStart + transcript.exonStarts[i] + (sum(transcript.exonSizes[i:]) - cdnaStartPos)
-            outStarts = transcript.exonStarts[:i+1]
-            outSizes = transcript.exonSizes[:i]
-            # the size of this exon is the old size minus how many bases into this exon
-            # the cndaStartPos is
-            outSizes += [transcript.exonSizes[i] - (exonEndPos - chromEnd)]
-        if chromStart is None and cdnaEndPos <= sum(transcript.exonSizes[i:]):
-            cdnaEndExon = i
-            chromStart = transcript.chromStart + transcript.exonStarts[i] + (sum(transcript.exonSizes[i:]) - cdnaEndPos)
-            # adjust all the chromStarts according to our new choice
-            outStarts = [0] + [x - (chromStart - transcript.chromStart) for x in outStarts[i+1:]]
-            if cdnaEndExon != cdnaStartExon:
-                outSizes = [transcript.exonSizes[i] - (chromStart - exonStartPos)] + outSizes[i+1:]
-            else:
-                outSizes = [chromEnd - chromStart] + outSizes[i+1:]
-
-    pscore = score * 1000
-    if pscore > 1000:
-        pscore = 1000
-    ret = bedInfo(transcript.chrom, chromStart, chromEnd, transcript.name, "%d" % int(pscore),
-        transcript.strand, chromStart, chromEnd, "255,0,0", cdnaStartExon-cdnaEndExon+1,
-        outSizes, outStarts)
-    checkAaLenToBlockLen(ret, transcript, cdnaEndPos, cdnaStartPos, aaStart, aaEnd)
-    sanityCheckBed12(ret)
-    return ret
-
-def aaToBedPosStrand(aaStart, aaEnd, score, transcript, lineNum):
-    """Convert an amino acid range like 1:50 to the corresponding genomic position, taking
-        into account exon blocks that define the coding sequence."""
-    cdnaStartPos = (aaStart - 1) * 3 # 0-based start
-    cdnaEndPos = (aaEnd * 3) # 0-based half open end
-    chromStart = None
-    chromEnd = None
-    cdnaStartExon = 0
-    cdnaEndExon = 0
-    length = 0
-    outStarts = []
-    outSizes = []
-    for i in range(transcript.exonCount):
-        exonStartPos = transcript.chromStart + transcript.exonStarts[i]
-        exonEndPos = exonStartPos + transcript.exonSizes[i]
-        if chromStart is None and cdnaStartPos < sum(transcript.exonSizes[:i+1]):
-            cdnaStartExon = i
-            chromStart = exonStartPos + (cdnaStartPos - sum(transcript.exonSizes[:i]))
-            outStarts.append(0)
-            outStarts += [x - (chromStart - transcript.chromStart) for x in transcript.exonStarts[i+1:]]
-            outSizes = [transcript.exonSizes[i] - (chromStart - exonStartPos)]
-            outSizes += [x for x in transcript.exonSizes[i+1:]]
-        if chromEnd is None and cdnaEndPos <= sum(transcript.exonSizes[:i+1]):
-            cdnaEndExon = i
-            chromEnd = exonStartPos + (cdnaEndPos - sum(transcript.exonSizes[:i]))
-            outSizes = outSizes[:cdnaEndExon-cdnaStartExon]
-            outSizes += [chromEnd - (chromStart + outStarts[cdnaEndExon-cdnaStartExon])]
-            outStarts = outStarts[:cdnaEndExon-cdnaStartExon+1]
-
-    pscore = score * 1000
-    if pscore > 1000:
-        pscore = 1000
-    ret = bedInfo(transcript.chrom, chromStart, chromEnd, transcript.name, "%d" % int(pscore),
-        transcript.strand, chromStart, chromEnd, "255,0,0", cdnaEndExon-cdnaStartExon+1,
-        outSizes, outStarts)
-    checkAaLenToBlockLen(ret, transcript, cdnaEndPos, cdnaStartPos, aaStart, aaEnd)
-    sanityCheckBed12(ret, transcript)
-    return ret
-
-def aaToBedByStrand(aaStart, aaEnd, score, observed, expected, chi2, gene, transcript, lineNum):
-    """Dispatch to write function depending on strand."""
-    ret = []
-    if transcript.strand == "+":
-        ret = aaToBedPosStrand(aaStart, aaEnd, score, transcript, lineNum)
-    else:
-        ret = aaToBedNegStrand(aaStart, aaEnd, score, transcript, lineNum)
-    # now add the extra scoring information
-    color = findColor(score, chi2)
-    ret = ret._replace(itemRgb=color)
-    print("%s\t%s\t%d\t%.3f\t%.3f\t%.3f" % (bedToStr(ret), gene, observed, expected, score, chi2))
-
-def aaToBed(aaFh, verbose):
-    """
-    Using the transcript dict, get the coordinates for an amino acid range in bed12 format.
-    Example input line:
-    ENST00000337907.3   RERE    1   1-507   8716356 8424825 97  197.9807    0.489947    51.505535   RERE_1
-    Example output line:
-    chr1    8424824 8716356 ENST00000337907.3   0.49    -   8424824 8716356 224,165,8   13  74,163,81,99,100,125,49,105,97,106,126,71,325   0,1047,57962,101160,130298,132640,143861,176448,191709,192652,249795,259544,291207 RERE 97 197.98 0.48 51.50
-    Example output line:
-    """
-    lineCount = 1
-    for line in aaFh:
-        if line.startswith('transcript'):
-            continue
-        splitLine = line.split('\t')
-        tx = splitLine[0]
-        gene = splitLine[1]
-        chrom = splitLine[2]
-        aaRange = splitLine[3]
-        observed = int(splitLine[6])
-        expected = float(splitLine[7])
-        score = float(splitLine[8])
-        chi2Val = float(splitLine[9])
-        try:
-            bedInfo = txDict[tx]
-            aaSeq = aaRange.split('-')
-            aaStart = int(aaSeq[0])
-            aaEnd = int(aaSeq[1])
-            for bed in bedInfo: # PAR region transcripts can have more than one
-                aaToBedByStrand(aaStart, aaEnd, score, observed, expected, chi2Val, gene, bed, lineCount)
-        except ValueError:
-            if verbose:
-                log("ERROR: transcript '%s' not found in transcripts file, line %d of input aa file" %
-                    (tx,lineCount))
-        lineCount += 1
-
-def main():
-    """open up the input bed12 and the input aa strings and call the converter."""
-    args = commandLine()
-    verbose = args.verbose
-    txFname = args.transcripts
-    aaFname = args.aaRanges
-
-    if txFname is not "-":
-        with open(txFname) as f:
-            txToDict(f)
-    else:
-        txToDict(sys.stdin)
-    if verbose:
-        log("%d transcripts added to transcript dict" % len(txDict))
-
-    if aaFname is not "-":
-        with open(aaFname) as f:
-            aaToBed(f, verbose)
-    else:
-        aaToBed(sys.stdin, verbose)
-
-if __name__ == "__main__":
-    main()