003337bf766f61f102c61cb924af98ad09372ad0 markd Tue Aug 5 12:41:24 2025 -0700 added size and splice junction filters to recount3 diff --git src/hg/makeDb/outside/recount3/junctionsToBed.py src/hg/makeDb/outside/recount3/junctionsToBed.py index 0600aa0cbb9..f9c3c321d32 100755 --- src/hg/makeDb/outside/recount3/junctionsToBed.py +++ src/hg/makeDb/outside/recount3/junctionsToBed.py @@ -1,28 +1,28 @@ #!/usr/bin/env python3 import sys import csv import os import re import csv import argparse import textwrap from collections import namedtuple from math import log -sys.path.append('/hive/groups/browser/pycbio/lib') +#sys.path.append('/hive/groups/browser/pycbio/lib') from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit - +from pycbio.sys import fileOps def main(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''\ Converts gff to bed format, adding expression values and CDS from separate files. Also outputs a decorator bed file, see https://genome.ucsc.edu/goldenPath/help/decorator.html ''')) #group = parser.add_argument_group('required arguments') parser.add_argument('--junctions', required=True, type=str, help='annotated gff') parser.add_argument('--bed', required=True, type=str, help='output bed') parser.add_argument('--decorator', required=True, type=str, help='output decorator bed') parser.add_argument('--compilation', required=True, type=str, @@ -71,55 +71,57 @@ extraCols=extraCols) rightObj.write(outFH) def junction_to_bed(outputfile, junctions, decoratorfile, compilation): '''Parses intron lines, extracts ID info from last field, assigns score''' urlbase = '<a href="https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation='+compilation urlfill = '&jid={jid}&coords={chrom}:{start}-{end}" target="_blank">link to snaptron</a>' #'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=<COMPILATION_ID>&jid=<JXN_ID>&coords=<CHROMOSOME:START-END>' # url = '<a href="https://snaptron.cs.jhu.edu/gtexv2/snaptron?regions={chrom}:{start}-{end}" target="_blank">{text}</a>' decFile = open(decoratorfile, 'wt') with open(outputfile, 'wt') as outfile: writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep) # extract all exons from the gff, keep exons grouped by transcript maxScore=1 - for line in open(junctions, 'rt'): + infh = fileOps.opengz(junctions, 'rt') + for line in infh: if line.startswith('#'): continue [snaptron_id, chrom, start, end, length, strand, annotated, left_motif, right_motif, left_annotated, right_annotated, samples, samples_count, coverage_sum, coverage_avg, coverage_median, source_dataset_id] = line.rstrip().split('\t') # skip non-genome chromosomes if any(chrom.startswith(prefix) for prefix in ('ERCC', 'SIRV', 'GL', 'JH', 'chrEBV')): continue # turn left and right motifs into donor and acceptor donor = left_motif.upper() acc = right_motif.upper() if strand == '?': strand = '.' elif strand == '-': complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'} donor = ''.join(complement[base] for base in reversed(acc)) acc = ''.join(complement[base] for base in reversed(left_motif.upper())) # link out to sample information for each junction link = urlbase + urlfill.format(jid=snaptron_id, chrom=chrom, start=start, end=end) # correct coordinates start = int(start)-1 end = int(end) # score scaling score = int(10*log(int(coverage_sum))) # score field only accepts integers maxScore = max(score, maxScore) - extraCols = [coverage_sum, samples_count, donor, acc, link] + size = end - start + extraCols = [size, coverage_sum, samples_count, donor, acc, link, donor + '/' + acc] # create bed object bedObj = Bed(chrom, start, end, name=snaptron_id, strand=strand, score=score, thickStart=start, thickEnd=end, numStdCols=9, extraCols=extraCols) # create splice site decorator file makeSpliceDecorator(bedObj, donor, acc, decFile) bedObj.write(outfile) decFile.close() print('maxScore', maxScore) if __name__ == "__main__": main()