src/hg/makeDb/outside/encode4/gtfToBed.py 994049f75b593264f0ff047f88903468bf1c2031

994049f75b593264f0ff047f88903468bf1c2031
jeltje.van.baren
  Fri Apr 4 13:35:35 2025 -0700
mm10 and hg38 encode4

diff --git src/hg/makeDb/outside/encode4/gtfToBed.py src/hg/makeDb/outside/encode4/gtfToBed.py
new file mode 100755
index 00000000000..379cebb5c8d
--- /dev/null
+++ src/hg/makeDb/outside/encode4/gtfToBed.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python3
+import sys
+import csv
+import os
+import re
+import csv
+import argparse
+import textwrap
+from collections import namedtuple
+sys.path.append('/hive/groups/browser/pycbio/lib')
+from pycbio.hgdata.bed import Bed, BedBlock, BedReader, intArraySplit
+
+quantInfo = namedtuple('quantInfo', ['topvals', 'quantTable'])
+
+
+def main():
+	parser = argparse.ArgumentParser(
+	formatter_class=argparse.RawDescriptionHelpFormatter,
+	description=textwrap.dedent('''\
+Converts gtf to bed format, adding expression values and CDS from separate files.
+
+        '''))
+	required = parser.add_argument_group('required named arguments')
+	required.add_argument('gtf', type=str, help='annotated gtf')
+	required.add_argument('quant', type=str, help='quantification file')
+	required.add_argument('prot', type=str, help='file with ORF info')
+	required.add_argument('samples', type=str, help='sample info file')
+	required.add_argument('bed', type=str, help='bed output file')
+	args = parser.parse_args()
+
+	sampleDict = addSampleInfo(args.samples)
+	exprDict = exprValues(args.quant, sampleDict)
+	cdsDict = cdsValues(args.prot)
+	gtf_to_bed(args.bed, args.gtf, cdsDict, exprDict)
+
+
+class Gene(object):
+	'''Object to gather exons for a transcript; then convert to bed format and write'''
+	def __init__(self, data_dict, tx, chrom, start, end, strand, source):
+		# these come from the final field of the gtf file
+		extrafields = tuple([data_dict['gene_id'], data_dict['gene_name'], data_dict['transcript_id'], 
+			data_dict['transcript_name']])
+		self.txID = tx
+		self.chrom = chrom
+		self.geneStart = start
+		self.geneEnd = end
+		self.strand = strand
+		self.blocks = [BedBlock(start, end)]
+		self.itemRgb = '0,0,0'
+		if source.startswith('TALON'):
+			self.itemRgb = '0,0,255'  # blue
+		self.extraCols = (source,) + extrafields 
+	def add(self, start, end):
+		self.geneStart = min(start, self.geneStart)
+		self.geneEnd = max(end, self.geneEnd)
+		self.blocks.append(BedBlock(start, end))
+	def write(self, FH, exprDict, cdsDict):
+		'''Create a Bed object, add expression and cdsInfo then write'''
+		name = self.txID
+		if name in exprDict:
+			self.extraCols = self.extraCols + exprDict[name]
+		else:
+			print('no expression:', name)
+			self.extraCols = self.extraCols + tuple(['no expression data', 'no expression data'])
+		if name in cdsDict:
+			thickStart = cdsDict[name][0]
+			thickEnd   = cdsDict[name][1]
+		else:
+			print('no cds:', name)
+			thickStart = thickEnd = self.geneEnd
+		self.blocks.sort()
+		bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=name, strand=self.strand, 
+		thickStart=thickStart, thickEnd=thickEnd,
+		blocks=self.blocks, itemRgb=self.itemRgb, numStdCols=12, extraCols = self.extraCols)
+		bedObj.write(FH)
+
+def cdsValues(cdsfile):
+	'''Return dictionary with ORFstart and end tuples'''
+	cdsDict = dict()
+	f = open(cdsfile, 'r')
+	reader = csv.DictReader(f, delimiter='\t')
+	for row in reader:
+		cdsDict[row['tid']] = [int(row['CDS_Start'])-1, int(row['CDS_Stop'])]
+	f.close()
+	return cdsDict
+	
+def addSampleInfo(samplefile):
+	'''Clarify sample IDs, adding cell line and alzheimers info'''
+	sampleDict = dict()
+	f = open(samplefile, 'r')
+	reader = csv.DictReader(f, delimiter='\t')
+	for row in reader:
+		sample = row['sample']
+		if row['tissue_or_cell_line'] == "cell_line":
+			sampleDict[sample] = f'{sample} (cell line)' 
+		elif sample == 'brain_ad':
+			sampleDict[sample] = 'brain (Alzheimer\'s)'
+		else:
+			sampleDict[sample] = sample
+	f.close()
+	return sampleDict
+
+def exprValues(quantfile, sampleDict):
+	'''Create dictionary with transcript ID as key and top values, and all values as namedtuple'''
+	exprDict = dict()
+	f = open(quantfile, 'r')
+	header = f.readline()
+	header = header.rstrip().split("\t")[1:]
+	# clean up sample IDs
+	header = [sampleDict[x] for x in header]
+	fieldct = len(header)
+	for line in f:
+		fields = line.strip().split("\t")
+		tx = fields[0]
+		values = [round(float(x),2) for x in fields[1:]]
+		# create a html table with values for all samples in one field of the bed file
+		html_table = '<table>'
+		for i in range(fieldct):
+			html_table += f'  <tr><td>{header[i]}</td><td>{values[i]}</td></tr>'
+		html_table += '</table>'
+		# find samples with 10% highest values
+		topValString = '0\tNo expression measured'
+		named_values = list(zip(header, values))
+		maxval = max(values)
+		# create two columns: the raw value and the html string
+		if maxval > 0:
+			threshold = 0.9 * maxval
+			max_entries = {column: value for column, value in named_values if value > threshold}
+			topValString = f'{maxval}\tMax score(s) in <br>'
+			topValString += '<br>'.join([f'{column}: {value}' for column, value in max_entries.items()])
+		exprDict[tx] = quantInfo(topvals=topValString, quantTable=html_table)
+	f.close()
+	return exprDict
+
+def gtf_to_bed(outputfile, gtf, cdsDict, exprDict):
+	'''Parses exon lines, extracts ID info from last field, adds CDS and expression info'''
+	geneObj = None
+	seen = set()
+	with open(outputfile, 'wt') as outfile:
+		writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
+
+		prev_transcript, blockstarts, blocksizes, prev_gene, prev_chrom, prev_strand = [None, None, None, None, None, None]
+		# extract all exons from the gtf, keep exons grouped by transcript
+		for line in open(gtf, 'rt'):  
+			if line.startswith('#'):
+				continue
+			line = line.rstrip().split('\t')
+			chrom, source, ty, start, end, strand, descrField = line[0], line[1], line[2], int(line[3]) - 1, int(line[4]), line[6], line[8]
+			if chrom == 'chrEBV':  # not in genome
+				continue
+			if ty in ['gene', 'transcript']:
+				continue
+			# parse last field for ID
+			pairs = descrField.split("; ")
+			data_dict = {pair.split(" ", 1)[0].strip(): pair.split(" ", 1)[1].strip('"') for pair in pairs}
+			ensg = data_dict['gene_id'].split('.')[0] 
+			# transcript_id is unique but can be ugly ("ENST00000459772.5,ENST00000459772.5#2"), 
+			# so only use it for the name field
+			transcript_id = data_dict['transcript_id']
+			if geneObj is None:
+				geneObj = Gene(data_dict, transcript_id, chrom, start, end, strand, source)
+			elif geneObj.txID == transcript_id:
+				if ty == 'exon':
+					geneObj.add(start, end)
+			# this line is a new gene, so print the previous
+			else:
+				geneObj.write(outfile, exprDict, cdsDict)
+				# start a new gene
+				geneObj = Gene(data_dict, transcript_id, chrom, start, end, strand, source)
+
+		# last entry
+		geneObj.write(outfile, exprDict, cdsDict)
+
+
+if __name__ == "__main__":
+    main()