0adde15fcc0c841a234ab2ea86a4a8fdbef3d49b markd Fri Dec 12 01:51:06 2025 -0800 switch recount3 intron track to color items by splice junction motif #34886 diff --git src/hg/makeDb/outside/recount3/junctionsToBed.py src/hg/makeDb/outside/recount3/junctionsToBed.py index f9c3c321d32..f7a0a049e2b 100755 --- src/hg/makeDb/outside/recount3/junctionsToBed.py +++ src/hg/makeDb/outside/recount3/junctionsToBed.py @@ -1,127 +1,105 @@ #!/usr/bin/env python3 +###################################################################### +# DO NOT CODE REVIEW, SINGLE USE ONLY, CHECK recount3 TRACKS INSTEAD # +###################################################################### + import sys -import csv -import os -import re -import csv import argparse import textwrap -from collections import namedtuple from math import log -#sys.path.append('/hive/groups/browser/pycbio/lib') -from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit +import pipettor +sys.path.append('/hive/groups/browser/pycbio/lib') +from pycbio.hgdata.bed import Bed from pycbio.sys import fileOps - +from pycbio.sys.svgcolors import SvgColors def main(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, - description=textwrap.dedent('''\ + description=textwrap.dedent(''' Converts gff to bed format, adding expression values and CDS from separate files. Also outputs a decorator bed file, see https://genome.ucsc.edu/goldenPath/help/decorator.html - ''')) - #group = parser.add_argument_group('required arguments') parser.add_argument('--junctions', required=True, type=str, help='annotated gff') parser.add_argument('--bed', required=True, type=str, help='output bed') - parser.add_argument('--decorator', required=True, type=str, help='output decorator bed') parser.add_argument('--compilation', required=True, type=str, choices=['ccle', 'gtexv2', 'srav3h', 'tcgav2', 'srav1m'], help='one of ccle, gtexv2, srav3h, tcgav2, srav1m') args = parser.parse_args() - junction_to_bed(args.bed, args.junctions, args.decorator, args.compilation) + junction_to_bed(args.bed, args.junctions, args.compilation) +# base color with min/max HSV value +BASE_COLORS = { + ('GT', 'AG'): (SvgColors.deepskyblue, 0.70, 1.00), # U2 common + ('GC', 'AG'): (SvgColors.turquoise, 0.70, 1.00), # U2 rare + ('AT', 'AC'): (SvgColors.orange, 0.70, 1.00), # U12 +} +UNKNOWN_BASE_COLOR = (SvgColors.darkgrey, 0.25, 0.85) -def makeSpliceDecorator(bedObj, donor, acc, outFH): - '''Create decorators for splice consensus''' - #chr1 1999 2000 green_circle 0 + 1999 2000 0,255,0,255 1 1 0 chr1:1000-2000:feature glyph 0,255,0,255 Circle - #colorblind color scheme Paul Tol's Muted from - # https://www.nceas.ucsb.edu/sites/default/files/2022-06/Colorblind%20Safe%20Color%20Schemes.pdf - # dark blue = GT donor, AG acceptor (046,037,133), teal= GC donor (093,168,153), faded red = AG/AC (194,106,119) - if bedObj.strand == '.': - # do not decorate - return - # assign colors - splice_map_donor = {'GT': '046,037,133','GC': '093,168,153','AT': '194,106,119'} - splice_map_acc = {'AG': '046,037,133', 'AC': '194,106,119'} - leftColor = splice_map_donor.get(donor, 'default') - rightColor = splice_map_acc.get(acc, 'default') - # correct for negative strand - if bedObj.strand == '-': - leftColor, rightColor = rightColor, leftColor - # decorate the first and last two bases of each block - leftChromStart = bedObj.chromStart - leftChromEnd = leftChromStart + 2 - leftBlocks = [BedBlock(bedObj.chromStart, bedObj.chromStart+2)] - rightChromStart = bedObj.chromEnd - 2 - rightChromEnd = bedObj.chromEnd - rightBlocks = [BedBlock(bedObj.chromEnd-2, bedObj.chromEnd)] - # format the target location (required for format) - target='{}:{}-{}:{}'.format(bedObj.chrom, bedObj.chromStart, bedObj.chromEnd, bedObj.name) - extraCols = [target, 'block', leftColor, 'Circle'] - # create and write bed objects, one for each side - leftObj = Bed(bedObj.chrom, leftChromStart, leftChromEnd, name='decl'+bedObj.name, - strand=bedObj.strand, blocks=leftBlocks, numStdCols=12, itemRgb=leftColor, - extraCols=extraCols) - leftObj.write(outFH) - extraCols = [target, 'block', rightColor, 'Circle'] - rightObj = Bed(bedObj.chrom, rightChromStart, rightChromEnd, name='decr'+bedObj.name, - strand=bedObj.strand, blocks=rightBlocks, numStdCols=12, itemRgb=rightColor, - extraCols=extraCols) - rightObj.write(outFH) +# estimated from look at output from running this program +MAX_SCORE = 200 +def item_color(donor, acc, score): + base_color, value_min, value_max = BASE_COLORS.get((donor, acc), UNKNOWN_BASE_COLOR) + norm_score = score / MAX_SCORE + value = value_max - (norm_score * (value_max - value_min)) + return base_color.setValue(value) -def junction_to_bed(outputfile, junctions, decoratorfile, compilation): - '''Parses intron lines, extracts ID info from last field, assigns score''' +def process_rec(compilation, line, outfile): urlbase = 'link to snaptron' -#'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=&jid=&coords=' -# url = '{text}' - decFile = open(decoratorfile, 'wt') - with open(outputfile, 'wt') as outfile: - writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep) - # extract all exons from the gff, keep exons grouped by transcript - maxScore=1 - infh = fileOps.opengz(junctions, 'rt') - for line in infh: - if line.startswith('#'): - continue [snaptron_id, chrom, start, end, length, strand, annotated, left_motif, right_motif, left_annotated, right_annotated, samples, samples_count, coverage_sum, coverage_avg, coverage_median, source_dataset_id] = line.rstrip().split('\t') # skip non-genome chromosomes if any(chrom.startswith(prefix) for prefix in ('ERCC', 'SIRV', 'GL', 'JH', 'chrEBV')): - continue + return 0 # turn left and right motifs into donor and acceptor donor = left_motif.upper() acc = right_motif.upper() if strand == '?': strand = '.' elif strand == '-': complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} donor = ''.join(complement[base] for base in reversed(acc)) acc = ''.join(complement[base] for base in reversed(left_motif.upper())) # link out to sample information for each junction link = urlbase + urlfill.format(jid=snaptron_id, chrom=chrom, start=start, end=end) # correct coordinates start = int(start) - 1 end = int(end) # score scaling score = int(10 * log(int(coverage_sum))) # score field only accepts integers - maxScore = max(score, maxScore) size = end - start + color = item_color(donor, acc, score).toRgb8Str() extraCols = [size, coverage_sum, samples_count, donor, acc, link, donor + '/' + acc] # create bed object bedObj = Bed(chrom, start, end, name=snaptron_id, strand=strand, score=score, - thickStart=start, thickEnd=end, numStdCols=9, extraCols=extraCols) - # create splice site decorator file - makeSpliceDecorator(bedObj, donor, acc, decFile) + thickStart=start, thickEnd=end, numStdCols=9, itemRgb=color, + extraCols=extraCols) bedObj.write(outfile) - decFile.close() + return score + +def junction_to_bed(outputfile, junctions, compilation): + '''Parses intron lines, extracts ID info from last field, assigns score''' + + # 'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=&jid=&coords=' + # url = '{text}' + + bed_sort_cmd = ('sort', '-k1,1', '-k2,2n') + with pipettor.Popen(bed_sort_cmd, 'wt', stdout=outputfile) as outfile: + # extract all exons from the gff, keep exons grouped by transcript + maxScore = 0 + infh = fileOps.opengz(junctions, 'rt') + for line in infh: + if line.startswith('#'): + continue + score = process_rec(compilation, line, outfile) + maxScore = max(maxScore, score) print('maxScore', maxScore) if __name__ == "__main__": main()