src/hg/makeDb/outside/recount3/junctionsToBed.py 0adde15fcc0c841a234ab2ea86a4a8fdbef3d49b

0adde15fcc0c841a234ab2ea86a4a8fdbef3d49b
markd
  Fri Dec 12 01:51:06 2025 -0800
switch recount3 intron track to color items by splice junction motif #34886

diff --git src/hg/makeDb/outside/recount3/junctionsToBed.py src/hg/makeDb/outside/recount3/junctionsToBed.py
index f9c3c321d32..f7a0a049e2b 100755
--- src/hg/makeDb/outside/recount3/junctionsToBed.py
+++ src/hg/makeDb/outside/recount3/junctionsToBed.py
@@ -1,127 +1,105 @@
 #!/usr/bin/env python3
+######################################################################
+# DO NOT CODE REVIEW, SINGLE USE ONLY, CHECK recount3 TRACKS INSTEAD #
+######################################################################
+
 import sys
-import csv
-import os
-import re
-import csv
 import argparse
 import textwrap
-from collections import namedtuple
 from math import log
-#sys.path.append('/hive/groups/browser/pycbio/lib')
-from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit
+import pipettor
+sys.path.append('/hive/groups/browser/pycbio/lib')
+from pycbio.hgdata.bed import Bed
 from pycbio.sys import fileOps
-
+from pycbio.sys.svgcolors import SvgColors
 
 def main():
     parser = argparse.ArgumentParser(
         formatter_class=argparse.RawDescriptionHelpFormatter,
-	description=textwrap.dedent('''\
+        description=textwrap.dedent('''
         Converts gff to bed format, adding expression values and CDS from separate files.
         Also outputs a decorator bed file, see https://genome.ucsc.edu/goldenPath/help/decorator.html
-
         '''))
-	#group = parser.add_argument_group('required arguments')
     parser.add_argument('--junctions', required=True, type=str, help='annotated gff')
     parser.add_argument('--bed', required=True, type=str, help='output bed')
-	parser.add_argument('--decorator', required=True, type=str, help='output decorator bed')
     parser.add_argument('--compilation', required=True, type=str,
                         choices=['ccle', 'gtexv2', 'srav3h', 'tcgav2', 'srav1m'],
                         help='one of ccle, gtexv2, srav3h, tcgav2, srav1m')
     args = parser.parse_args()
 
-	junction_to_bed(args.bed, args.junctions, args.decorator, args.compilation)
+    junction_to_bed(args.bed, args.junctions, args.compilation)
 
+# base color with min/max HSV value
+BASE_COLORS = {
+    ('GT', 'AG'): (SvgColors.deepskyblue, 0.70, 1.00),  # U2 common
+    ('GC', 'AG'): (SvgColors.turquoise, 0.70, 1.00),  # U2 rare
+    ('AT', 'AC'): (SvgColors.orange, 0.70, 1.00),  # U12
+}
+UNKNOWN_BASE_COLOR = (SvgColors.darkgrey, 0.25, 0.85)
 
-def makeSpliceDecorator(bedObj, donor, acc, outFH):
-	'''Create decorators for splice consensus'''
-	#chr1 1999 2000 green_circle 0 + 1999 2000 0,255,0,255 1 1 0 chr1:1000-2000:feature glyph 0,255,0,255 Circle
-	#colorblind color scheme Paul Tol's Muted from
-	# https://www.nceas.ucsb.edu/sites/default/files/2022-06/Colorblind%20Safe%20Color%20Schemes.pdf
-	# dark blue = GT donor, AG acceptor (046,037,133), teal= GC donor (093,168,153), faded red = AG/AC (194,106,119)
-	if bedObj.strand == '.':
-		# do not decorate
-		return
-	# assign colors
-	splice_map_donor  = {'GT': '046,037,133','GC': '093,168,153','AT': '194,106,119'}
-	splice_map_acc = {'AG': '046,037,133', 'AC': '194,106,119'}
-	leftColor  = splice_map_donor.get(donor, 'default')
-	rightColor = splice_map_acc.get(acc, 'default')
-	# correct for negative strand
-	if bedObj.strand == '-':
-		leftColor, rightColor = rightColor, leftColor
-	# decorate the first and last two bases of each block
-	leftChromStart = bedObj.chromStart
-	leftChromEnd = leftChromStart + 2
-	leftBlocks = [BedBlock(bedObj.chromStart, bedObj.chromStart+2)]
-	rightChromStart = bedObj.chromEnd - 2
-	rightChromEnd = bedObj.chromEnd
-	rightBlocks = [BedBlock(bedObj.chromEnd-2, bedObj.chromEnd)]
-	# format the target location (required for format)
-	target='{}:{}-{}:{}'.format(bedObj.chrom, bedObj.chromStart, bedObj.chromEnd, bedObj.name)
-	extraCols = [target, 'block', leftColor, 'Circle']
-	# create and write bed objects, one for each side
-	leftObj = Bed(bedObj.chrom, leftChromStart, leftChromEnd, name='decl'+bedObj.name,
-				strand=bedObj.strand, blocks=leftBlocks, numStdCols=12, itemRgb=leftColor,
-				extraCols=extraCols)
-	leftObj.write(outFH)
-	extraCols = [target, 'block', rightColor, 'Circle']
-	rightObj = Bed(bedObj.chrom, rightChromStart, rightChromEnd, name='decr'+bedObj.name,
-				strand=bedObj.strand, blocks=rightBlocks, numStdCols=12, itemRgb=rightColor,
-				extraCols=extraCols)
-	rightObj.write(outFH)
+# estimated from look at output from running this program
+MAX_SCORE = 200
 
+def item_color(donor, acc, score):
+    base_color, value_min, value_max = BASE_COLORS.get((donor, acc), UNKNOWN_BASE_COLOR)
+    norm_score = score / MAX_SCORE
+    value = value_max - (norm_score * (value_max - value_min))
+    return base_color.setValue(value)
 
-def junction_to_bed(outputfile, junctions, decoratorfile, compilation):
-	'''Parses intron lines, extracts ID info from last field, assigns score'''
+def process_rec(compilation, line, outfile):
     urlbase = '<a href="https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=' + compilation
     urlfill = '&jid={jid}&coords={chrom}:{start}-{end}" target="_blank">link to snaptron</a>'
-#'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=<COMPILATION_ID>&jid=<JXN_ID>&coords=<CHROMOSOME:START-END>'
-#	url = '<a href="https://snaptron.cs.jhu.edu/gtexv2/snaptron?regions={chrom}:{start}-{end}" target="_blank">{text}</a>'
-	decFile = open(decoratorfile, 'wt')
-	with open(outputfile, 'wt') as outfile:
-		writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
-		# extract all exons from the gff, keep exons grouped by transcript
-		maxScore=1
-		infh = fileOps.opengz(junctions, 'rt')
-		for line in infh:
-			if line.startswith('#'):
-				continue
     [snaptron_id, chrom, start, end, length, strand, annotated,
         left_motif, right_motif, left_annotated, right_annotated, samples,
         samples_count, coverage_sum, coverage_avg, coverage_median,
         source_dataset_id] = line.rstrip().split('\t')
     # skip non-genome chromosomes
     if any(chrom.startswith(prefix) for prefix in ('ERCC', 'SIRV', 'GL', 'JH', 'chrEBV')):
-				continue
+        return 0
     # turn left and right motifs into donor and acceptor
     donor = left_motif.upper()
     acc = right_motif.upper()
     if strand == '?':
         strand = '.'
     elif strand == '-':
         complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
         donor = ''.join(complement[base] for base in reversed(acc))
         acc = ''.join(complement[base] for base in reversed(left_motif.upper()))
     # link out to sample information for each junction
     link = urlbase + urlfill.format(jid=snaptron_id, chrom=chrom, start=start, end=end)
     # correct coordinates
     start = int(start) - 1
     end = int(end)
     # score scaling
     score = int(10 * log(int(coverage_sum)))  # score field only accepts integers
-			maxScore = max(score, maxScore)
     size = end - start
+    color = item_color(donor, acc, score).toRgb8Str()
     extraCols = [size, coverage_sum, samples_count, donor, acc, link, donor + '/' + acc]
     # create bed object
     bedObj = Bed(chrom, start, end, name=snaptron_id, strand=strand, score=score,
-				thickStart=start, thickEnd=end, numStdCols=9, extraCols=extraCols)
-			# create splice site decorator file
-			makeSpliceDecorator(bedObj, donor, acc, decFile)
+                 thickStart=start, thickEnd=end, numStdCols=9, itemRgb=color,
+                 extraCols=extraCols)
     bedObj.write(outfile)
-	decFile.close()
+    return score
+
+def junction_to_bed(outputfile, junctions, compilation):
+    '''Parses intron lines, extracts ID info from last field, assigns score'''
+
+    # 'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=<COMPILATION_ID>&jid=<JXN_ID>&coords=<CHROMOSOME:START-END>'
+    #   url = '<a href="https://snaptron.cs.jhu.edu/gtexv2/snaptron?regions={chrom}:{start}-{end}" target="_blank">{text}</a>'
+
+    bed_sort_cmd = ('sort', '-k1,1', '-k2,2n')
+    with pipettor.Popen(bed_sort_cmd, 'wt', stdout=outputfile) as outfile:
+        # extract all exons from the gff, keep exons grouped by transcript
+        maxScore = 0
+        infh = fileOps.opengz(junctions, 'rt')
+        for line in infh:
+            if line.startswith('#'):
+                continue
+            score = process_rec(compilation, line, outfile)
+            maxScore = max(maxScore, score)
     print('maxScore', maxScore)
 
 
 if __name__ == "__main__":
     main()