003337bf766f61f102c61cb924af98ad09372ad0
markd
Tue Aug 5 12:41:24 2025 -0700
added size and splice junction filters to recount3
diff --git src/hg/makeDb/outside/recount3/junctionsToBed.py src/hg/makeDb/outside/recount3/junctionsToBed.py
index 0600aa0cbb9..f9c3c321d32 100755
--- src/hg/makeDb/outside/recount3/junctionsToBed.py
+++ src/hg/makeDb/outside/recount3/junctionsToBed.py
@@ -1,28 +1,28 @@
#!/usr/bin/env python3
import sys
import csv
import os
import re
import csv
import argparse
import textwrap
from collections import namedtuple
from math import log
-sys.path.append('/hive/groups/browser/pycbio/lib')
+#sys.path.append('/hive/groups/browser/pycbio/lib')
from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit
-
+from pycbio.sys import fileOps
def main():
parser = argparse.ArgumentParser(
formatter_class=argparse.RawDescriptionHelpFormatter,
description=textwrap.dedent('''\
Converts gff to bed format, adding expression values and CDS from separate files.
Also outputs a decorator bed file, see https://genome.ucsc.edu/goldenPath/help/decorator.html
'''))
#group = parser.add_argument_group('required arguments')
parser.add_argument('--junctions', required=True, type=str, help='annotated gff')
parser.add_argument('--bed', required=True, type=str, help='output bed')
parser.add_argument('--decorator', required=True, type=str, help='output decorator bed')
parser.add_argument('--compilation', required=True, type=str,
@@ -71,55 +71,57 @@
extraCols=extraCols)
rightObj.write(outFH)
def junction_to_bed(outputfile, junctions, decoratorfile, compilation):
'''Parses intron lines, extracts ID info from last field, assigns score'''
urlbase = 'link to snaptron'
#'https://snaptron.cs.jhu.edu/snaptron-studies/jxn2studies?compilation=&jid=&coords='
# url = '{text}'
decFile = open(decoratorfile, 'wt')
with open(outputfile, 'wt') as outfile:
writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
# extract all exons from the gff, keep exons grouped by transcript
maxScore=1
- for line in open(junctions, 'rt'):
+ infh = fileOps.opengz(junctions, 'rt')
+ for line in infh:
if line.startswith('#'):
continue
[snaptron_id, chrom, start, end, length, strand, annotated,
left_motif, right_motif, left_annotated, right_annotated, samples,
samples_count, coverage_sum, coverage_avg, coverage_median,
source_dataset_id] = line.rstrip().split('\t')
# skip non-genome chromosomes
if any(chrom.startswith(prefix) for prefix in ('ERCC', 'SIRV', 'GL', 'JH', 'chrEBV')):
continue
# turn left and right motifs into donor and acceptor
donor = left_motif.upper()
acc = right_motif.upper()
if strand == '?':
strand = '.'
elif strand == '-':
complement = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
donor = ''.join(complement[base] for base in reversed(acc))
acc = ''.join(complement[base] for base in reversed(left_motif.upper()))
# link out to sample information for each junction
link = urlbase + urlfill.format(jid=snaptron_id, chrom=chrom, start=start, end=end)
# correct coordinates
start = int(start)-1
end = int(end)
# score scaling
score = int(10*log(int(coverage_sum))) # score field only accepts integers
maxScore = max(score, maxScore)
- extraCols = [coverage_sum, samples_count, donor, acc, link]
+ size = end - start
+ extraCols = [size, coverage_sum, samples_count, donor, acc, link, donor + '/' + acc]
# create bed object
bedObj = Bed(chrom, start, end, name=snaptron_id, strand=strand, score=score,
thickStart=start, thickEnd=end, numStdCols=9, extraCols=extraCols)
# create splice site decorator file
makeSpliceDecorator(bedObj, donor, acc, decFile)
bedObj.write(outfile)
decFile.close()
print('maxScore', maxScore)
if __name__ == "__main__":
main()