8e2ce67692e814db90b2d2938915712ff9b0ee51 jeltje.van.baren Wed Apr 16 18:07:03 2025 -0700 Adding recount3 #34886 diff --git src/hg/makeDb/outside/recount3/junctionsToBed.py src/hg/makeDb/outside/recount3/junctionsToBed.py new file mode 100755 index 00000000000..0600aa0cbb9 --- /dev/null +++ src/hg/makeDb/outside/recount3/junctionsToBed.py @@ -0,0 +1,125 @@ +#!/usr/bin/env python3 +import sys +import csv +import os +import re +import csv +import argparse +import textwrap +from collections import namedtuple +from math import log +sys.path.append('/hive/groups/browser/pycbio/lib') +from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit + + + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent('''\ +Converts gff to bed format, adding expression values and CDS from separate files. +Also outputs a decorator bed file, see https://genome.ucsc.edu/goldenPath/help/decorator.html + + ''')) + #group = parser.add_argument_group('required arguments') + parser.add_argument('--junctions', required=True, type=str, help='annotated gff') + parser.add_argument('--bed', required=True, type=str, help='output bed') + parser.add_argument('--decorator', required=True, type=str, help='output decorator bed') + parser.add_argument('--compilation', required=True, type=str, + choices=['ccle', 'gtexv2', 'srav3h', 'tcgav2', 'srav1m'], + help='one of ccle, gtexv2, srav3h, tcgav2, srav1m') + args = parser.parse_args() + + junction_to_bed(args.bed, args.junctions, args.decorator, args.compilation) + + +def makeSpliceDecorator(bedObj, donor, acc, outFH): + '''Create decorators for splice consensus''' + #chr1 1999 2000 green_circle 0 + 1999 2000 0,255,0,255 1 1 0 chr1:1000-2000:feature glyph 0,255,0,255 Circle + #colorblind color scheme Paul Tol's Muted from + # https://www.nceas.ucsb.edu/sites/default/files/2022-06/Colorblind%20Safe%20Color%20Schemes.pdf + # dark blue = GT donor, AG acceptor (046,037,133), teal= GC donor (093,168,153), faded red = AG/AC (194,106,119) + if bedObj.strand == '.': + # do not decorate + return + # assign colors + splice_map_donor = {'GT': '046,037,133','GC': '093,168,153','AT': '194,106,119'} + splice_map_acc = {'AG': '046,037,133', 'AC': '194,106,119'} + leftColor = splice_map_donor.get(donor, 'default') + rightColor = splice_map_acc.get(acc, 'default') + # correct for negative strand + if bedObj.strand == '-': + leftColor, rightColor = rightColor, leftColor + # decorate the first and last two bases of each block + leftChromStart = bedObj.chromStart + leftChromEnd = leftChromStart + 2 + leftBlocks = [BedBlock(bedObj.chromStart, bedObj.chromStart+2)] + rightChromStart = bedObj.chromEnd - 2 + rightChromEnd = bedObj.chromEnd + rightBlocks = [BedBlock(bedObj.chromEnd-2, bedObj.chromEnd)] + # format the target location (required for format) + target='{}:{}-{}:{}'.format(bedObj.chrom, bedObj.chromStart, bedObj.chromEnd, bedObj.name) + extraCols = [target, 'block', leftColor, 'Circle'] + # create and write bed objects, one for each side + leftObj = Bed(bedObj.chrom, leftChromStart, leftChromEnd, name='decl'+bedObj.name, + strand=bedObj.strand, blocks=leftBlocks, numStdCols=12, itemRgb=leftColor, + extraCols=extraCols) + leftObj.write(outFH) + extraCols = [target, 'block', rightColor, 'Circle'] + rightObj = Bed(bedObj.chrom, rightChromStart, rightChromEnd, name='decr'+bedObj.name, + strand=bedObj.strand, blocks=rightBlocks, numStdCols=12, itemRgb=rightColor, + extraCols=extraCols) + rightObj.write(outFH) + + +def junction_to_bed(outputfile, junctions, decoratorfile, compilation): + '''Parses intron lines, extracts ID info from last field, assigns score''' + urlbase = 'link to snaptron' +#'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=&jid=&coords=' +# url = '{text}' + decFile = open(decoratorfile, 'wt') + with open(outputfile, 'wt') as outfile: + writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep) + # extract all exons from the gff, keep exons grouped by transcript + maxScore=1 + for line in open(junctions, 'rt'): + if line.startswith('#'): + continue + [snaptron_id, chrom, start, end, length, strand, annotated, + left_motif, right_motif, left_annotated, right_annotated, samples, + samples_count, coverage_sum, coverage_avg, coverage_median, + source_dataset_id] = line.rstrip().split('\t') + # skip non-genome chromosomes + if any(chrom.startswith(prefix) for prefix in ('ERCC', 'SIRV', 'GL', 'JH', 'chrEBV')): + continue + # turn left and right motifs into donor and acceptor + donor = left_motif.upper() + acc = right_motif.upper() + if strand == '?': + strand = '.' + elif strand == '-': + complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} + donor = ''.join(complement[base] for base in reversed(acc)) + acc = ''.join(complement[base] for base in reversed(left_motif.upper())) + # link out to sample information for each junction + link = urlbase + urlfill.format(jid=snaptron_id, chrom=chrom, start=start, end=end) + # correct coordinates + start = int(start)-1 + end = int(end) + # score scaling + score = int(10*log(int(coverage_sum))) # score field only accepts integers + maxScore = max(score, maxScore) + extraCols = [coverage_sum, samples_count, donor, acc, link] + # create bed object + bedObj = Bed(chrom, start, end, name=snaptron_id, strand=strand, score=score, + thickStart=start, thickEnd=end, numStdCols=9, extraCols=extraCols) + # create splice site decorator file + makeSpliceDecorator(bedObj, donor, acc, decFile) + bedObj.write(outfile) + decFile.close() + print('maxScore', maxScore) + + +if __name__ == "__main__": + main()