de2c47dedae1ebeb49de836beddb9bc6b4ca0ce0 chmalee Thu Jan 16 09:20:22 2020 -0800 Moving gnomad scripts to their own directory under makeDb after Max email diff --git src/hg/makeDb/gnomad/aaToGenomic.py src/hg/makeDb/gnomad/aaToGenomic.py new file mode 100755 index 0000000..734efc4 --- /dev/null +++ src/hg/makeDb/gnomad/aaToGenomic.py @@ -0,0 +1,335 @@ +#!/usr/bin/env python3 + +""" +Given a 1-based amino acid range, and a bed12 of transcripts, +convert the amino acid range to a genomic range with exon blocks. + +This program is currently specialized to the gnomad missense constraint +track, but can be more generalized if the score conversion part is +commented out. + +Example: +#transcript gene chrom range genomic_start genomic_end obs_mis exp_mis obs_exp chiseq_null_diff region +ENST00000337907.3 RERE 1 1-507 8716356 8424825 97 197.9807 0.489947 51.505535 RERE_1 + +chr1 8424824 8716356 ENST00000337907.3 0.49 - 8424824 8716356 224,165,8 13 74,163,81,99,100,125,49,105,97,106,126,71,325 0,1047,57962,101160,130298,132640,143861,176448,191709,192652,249795,259544,291207 + +Here the itemRgb column corresponds to the color scheme used at the decipher browser. +""" + +import sys +import argparse +from collections import defaultdict,namedtuple +from scipy.stats import chi2 + +def commandLine(): + parser = argparse.ArgumentParser( + description="Converts amino acids ranges to genomic coordinates, writes to stdout. One of transcripts or aaRanges can be stdin", + prefix_chars="-", usage="%(prog)s [options]", add_help=True) + parser.add_argument("transcripts", action="store", help="bed12 of transcripts, use '-' for stdin") + parser.add_argument("aaRanges", action="store", help="aa range file, use '-' for stdin") + parser.add_argument("-v", "--verbose", action="store_true", default=False, help="Turn verbose logging on, writes to stderr") + + args = parser.parse_args() + if args.transcripts == "-" and args.aaRanges == "-": + sys.stderr.write("ERROR: Only one of transcripts and aaRanges can be stdin") + sys.exit(1) + + return args + +# struct for keeping track of transcripts and their exon coordinates and sizes +# use a defaultdict of list so PAR regions can have the different transcripts +txDict = defaultdict(list) + +bedFields = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", +"thickStart", "thickEnd", "itemRgb", "exonCount", "exonSizes", "exonStarts"] +# autoSql map of the output bed for a conversion to bigBed later: +bedInfo = namedtuple('bedFields', bedFields) + +def log(msg): + """Write message to stderr instead of stdout.""" + sys.stderr.write(msg + "\n") + +def logBedRecord(ret): + sys.stderr.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (ret.chrom, + ret.chromStart, ret.chromEnd, ret.name, ret.score, ret.strand, ret.thickStart, + ret.thickEnd, ret.itemRgb, ret.exonCount, ",".join([str(x) for x in ret.exonSizes]), + ",".join([str(x) for x in ret.exonStarts]))) + +def printBedRecord(ret): + print("%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (ret.chrom, ret.thickStart, + ret.thickEnd, ret.name, ret.score, ret.strand, ret.thickStart, ret.thickEnd, ret.itemRgb, + ret.exonCount, ",".join([str(x) for x in ret.exonSizes]), + ",".join([str(x) for x in ret.exonStarts]))) + +def bedToStr(ret): + """Return the bedFields object as a string""" + return "%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (ret.chrom, + ret.chromStart, ret.chromEnd, ret.name, ret.score, ret.strand, ret.thickStart, + ret.thickEnd, ret.itemRgb, ret.exonCount, ",".join([str(x) for x in ret.exonSizes]), + ",".join([str(x) for x in ret.exonStarts])) + +def sanityCheckBed12(ret, original=None): + """Check that a bed12 follows the usual bed rules. Exit if a bad record""" + def logOriginal(): + # helper function for logging the input bed line + if original: + log("original bed") + logBedRecord(original) + + if len(ret.exonSizes) != int(ret.exonCount): + logOriginal() + log("ERROR: len(exonSizes) != exonCount") + logBedRecord(ret) + sys.exit(1) + if len(ret.exonStarts) != int(ret.exonCount): + logOriginal() + log("ERROR: len(exonStarts) != exonCount") + logBedRecord(ret) + sys.exit(1) + + try: + for i in range(ret.exonCount): + # check ascending order of blocks + if i > 0: + if int(ret.exonStarts[i]) <= (int(ret.exonStarts[i-1]) + int(ret.exonSizes[i-1])): + logOriginal() + log("ERROR: exon blocks must be in ascending order. block %d and %d overlap." % + (i-1, i)) + logBedRecord(ret) + sys.exit(1) + # check for no zero-length blocks + # zero length blocks are ok in general but not for this bed + if (ret.exonStarts[i] < 0 or ret.exonSizes[i] <= 0 or + int(ret.chromStart) + int(ret.exonStarts[i]) >= int(ret.chromEnd)): + logOriginal() + log("ERROR: invalid exonStart or exonSize") + logBedRecord(ret) + sys.exit(1) + # check that blocks span chromStart to chromEnd + if (ret.chromStart + ret.exonSizes[-1] + ret.exonStarts[-1] != ret.chromEnd): + logOriginal() + log("ERROR: chromStart + exonSizes[-1] + exonStarts[-1] != chromEnd") + log("%d + %d + %d (%d) != %d" % (ret.chromStart, ret.exonSizes[-1], ret.exonStarts[-1], + ret.chromStart + ret.exonStarts[-1] + ret.exonSizes[-1], ret.chromEnd)) + logBedRecord(ret) + sys.exit(1) + except IndexError: + logOriginal() + log("ERROR: no exonSizes or exonStarts") + logBedRecord(ret) + sys.exit(1) + +def checkAaLenToBlockLen(ret, original, cdnaEndPos, cdnaStartPos, aaStart, aaEnd): + """Extra check for this bed file that the total bases covered by exons equals the length + of the cdna.""" + if (sum(ret.exonSizes) != cdnaEndPos - cdnaStartPos): + log("ERROR: sum(blockSizes) != cdnaLen: %d != %d for %s:%d-%d" % + (sum(ret.exonSizes), cdnaEndPos - cdnaStartPos, ret.name, aaStart,aaEnd)) + log("original bed:") + logBedRecord(original) + logBedRecord(ret) + sys.exit(1) + +def lineToBed(fields, lineNum=None): + """Transform bed12 line to namedtuple bedFields.""" + assert(len(fields) == 12) + chrom = fields[0] + chromStart = int(fields[1]) + chromEnd = int(fields[2]) + name = fields[3] + score = fields[4] + strand = fields[5] + thickStart = int(fields[6]) + thickEnd = int(fields[7]) + itemRgb = fields[8] + exonCount = int(fields[9]) + exonSizes = [int(x) for x in fields[10].strip(',').split(",")] + exonStarts = [int(x) for x in fields[11].strip(',').split(",")] + ret = bedInfo(chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, + itemRgb, exonCount, exonSizes, exonStarts) + sanityCheckBed12(ret) + return ret + +def findColor(score, chi2val): + """Return the right color for this region based on the missense score.""" + # this is decipher's color scheme + if chi2.sf(chi2val,1) > 0.001: + return "160,160,160" + if score < 0.2: + return "253,0,2" + elif score >= 0.2 and score < 0.4: + return "233,127,5" + elif score >= 0.4 and score < 0.6: + return "224,165,8" + elif score >= 0.6 and score < 0.8: + return "127,233,58" + elif score >= 0.8: + return "0,244,153" + +def txToDict(txFh, verbose=False): + """Make a dict of all the transcripts for lookup later.""" + lineCount = 1 + for line in txFh: + splitLine = line.strip().split('\t') + bed = lineToBed(splitLine, lineCount) + txDict[splitLine[3]].append(bed) + lineCount += 1 + +def aaToBedNegStrand(aaStart, aaEnd, score, transcript, lineNum): + """The negative strand conversion is slightly different than the more intuitive positive + strand conversion. Mostly the difference is because you have to walk over the exons + backwards and swap chromStart and ends. So cdnaStartPos and cdnaEndPos have the + opposite meaning, cdnaStartPos corresponds to a bed's chromEnd, etc.""" + cdnaStartPos = (aaStart - 1) * 3 # 0-based start + cdnaEndPos = (aaEnd * 3) # 0-based half open end + chromStart = None + chromEnd = None + cdnaStartExon = 0 + cdnaEndExon = 0 + length = 0 + outStarts = [] + outSizes = [] + for i in range(transcript.exonCount-1,-1,-1): + exonStartPos = transcript.chromStart + transcript.exonStarts[i] + exonEndPos = exonStartPos + transcript.exonSizes[i] + if chromEnd is None and cdnaStartPos < sum(transcript.exonSizes[i:]): + cdnaStartExon = i + # subtract a little bit from the right side of the exon block + # if you were looking at the exon in the browser. the sum + # takes care of when we start in say the 4th exon out of 12 or something + chromEnd = transcript.chromStart + transcript.exonStarts[i] + (sum(transcript.exonSizes[i:]) - cdnaStartPos) + outStarts = transcript.exonStarts[:i+1] + outSizes = transcript.exonSizes[:i] + # the size of this exon is the old size minus how many bases into this exon + # the cndaStartPos is + outSizes += [transcript.exonSizes[i] - (exonEndPos - chromEnd)] + if chromStart is None and cdnaEndPos <= sum(transcript.exonSizes[i:]): + cdnaEndExon = i + chromStart = transcript.chromStart + transcript.exonStarts[i] + (sum(transcript.exonSizes[i:]) - cdnaEndPos) + # adjust all the chromStarts according to our new choice + outStarts = [0] + [x - (chromStart - transcript.chromStart) for x in outStarts[i+1:]] + if cdnaEndExon != cdnaStartExon: + outSizes = [transcript.exonSizes[i] - (chromStart - exonStartPos)] + outSizes[i+1:] + else: + outSizes = [chromEnd - chromStart] + outSizes[i+1:] + + pscore = score * 1000 + if pscore > 1000: + pscore = 1000 + ret = bedInfo(transcript.chrom, chromStart, chromEnd, transcript.name, "%d" % int(pscore), + transcript.strand, chromStart, chromEnd, "255,0,0", cdnaStartExon-cdnaEndExon+1, + outSizes, outStarts) + checkAaLenToBlockLen(ret, transcript, cdnaEndPos, cdnaStartPos, aaStart, aaEnd) + sanityCheckBed12(ret) + return ret + +def aaToBedPosStrand(aaStart, aaEnd, score, transcript, lineNum): + """Convert an amino acid range like 1:50 to the corresponding genomic position, taking + into account exon blocks that define the coding sequence.""" + cdnaStartPos = (aaStart - 1) * 3 # 0-based start + cdnaEndPos = (aaEnd * 3) # 0-based half open end + chromStart = None + chromEnd = None + cdnaStartExon = 0 + cdnaEndExon = 0 + length = 0 + outStarts = [] + outSizes = [] + for i in range(transcript.exonCount): + exonStartPos = transcript.chromStart + transcript.exonStarts[i] + exonEndPos = exonStartPos + transcript.exonSizes[i] + if chromStart is None and cdnaStartPos < sum(transcript.exonSizes[:i+1]): + cdnaStartExon = i + chromStart = exonStartPos + (cdnaStartPos - sum(transcript.exonSizes[:i])) + outStarts.append(0) + outStarts += [x - (chromStart - transcript.chromStart) for x in transcript.exonStarts[i+1:]] + outSizes = [transcript.exonSizes[i] - (chromStart - exonStartPos)] + outSizes += [x for x in transcript.exonSizes[i+1:]] + if chromEnd is None and cdnaEndPos <= sum(transcript.exonSizes[:i+1]): + cdnaEndExon = i + chromEnd = exonStartPos + (cdnaEndPos - sum(transcript.exonSizes[:i])) + outSizes = outSizes[:cdnaEndExon-cdnaStartExon] + outSizes += [chromEnd - (chromStart + outStarts[cdnaEndExon-cdnaStartExon])] + outStarts = outStarts[:cdnaEndExon-cdnaStartExon+1] + + pscore = score * 1000 + if pscore > 1000: + pscore = 1000 + ret = bedInfo(transcript.chrom, chromStart, chromEnd, transcript.name, "%d" % int(pscore), + transcript.strand, chromStart, chromEnd, "255,0,0", cdnaEndExon-cdnaStartExon+1, + outSizes, outStarts) + checkAaLenToBlockLen(ret, transcript, cdnaEndPos, cdnaStartPos, aaStart, aaEnd) + sanityCheckBed12(ret, transcript) + return ret + +def aaToBedByStrand(aaStart, aaEnd, score, observed, expected, chi2, gene, transcript, lineNum): + """Dispatch to write function depending on strand.""" + ret = [] + if transcript.strand == "+": + ret = aaToBedPosStrand(aaStart, aaEnd, score, transcript, lineNum) + else: + ret = aaToBedNegStrand(aaStart, aaEnd, score, transcript, lineNum) + # now add the extra scoring information + color = findColor(score, chi2) + ret = ret._replace(itemRgb=color) + print("%s\t%s\t%d\t%.3f\t%.3f\t%.3f" % (bedToStr(ret), gene, observed, expected, score, chi2)) + +def aaToBed(aaFh, verbose): + """ + Using the transcript dict, get the coordinates for an amino acid range in bed12 format. + Example input line: + ENST00000337907.3 RERE 1 1-507 8716356 8424825 97 197.9807 0.489947 51.505535 RERE_1 + Example output line: + chr1 8424824 8716356 ENST00000337907.3 0.49 - 8424824 8716356 224,165,8 13 74,163,81,99,100,125,49,105,97,106,126,71,325 0,1047,57962,101160,130298,132640,143861,176448,191709,192652,249795,259544,291207 RERE 97 197.98 0.48 51.50 + Example output line: + """ + lineCount = 1 + for line in aaFh: + if line.startswith('transcript'): + continue + splitLine = line.split('\t') + tx = splitLine[0] + gene = splitLine[1] + chrom = splitLine[2] + aaRange = splitLine[3] + observed = int(splitLine[6]) + expected = float(splitLine[7]) + score = float(splitLine[8]) + chi2Val = float(splitLine[9]) + try: + bedInfo = txDict[tx] + aaSeq = aaRange.split('-') + aaStart = int(aaSeq[0]) + aaEnd = int(aaSeq[1]) + for bed in bedInfo: # PAR region transcripts can have more than one + aaToBedByStrand(aaStart, aaEnd, score, observed, expected, chi2Val, gene, bed, lineCount) + except ValueError: + if verbose: + log("ERROR: transcript '%s' not found in transcripts file, line %d of input aa file" % + (tx,lineCount)) + lineCount += 1 + +def main(): + """open up the input bed12 and the input aa strings and call the converter.""" + args = commandLine() + verbose = args.verbose + txFname = args.transcripts + aaFname = args.aaRanges + + if txFname is not "-": + with open(txFname) as f: + txToDict(f) + else: + txToDict(sys.stdin) + if verbose: + log("%d transcripts added to transcript dict" % len(txDict)) + + if aaFname is not "-": + with open(aaFname) as f: + aaToBed(f, verbose) + else: + aaToBed(sys.stdin, verbose) + +if __name__ == "__main__": + main()