src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py a95486084d48d3739b6d995a11f2cea55754f10c

a95486084d48d3739b6d995a11f2cea55754f10c
jeltje.van.baren
  Fri Apr 18 12:16:18 2025 -0700
better labeling and searching of Yale pseudogene track

diff --git src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py
index 952675c32cb..07d204432e4 100755
--- src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py
+++ src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py
@@ -9,38 +9,39 @@
 from pycbio.hgdata.bed import Bed, BedBlock, intArraySplit
 
 def main():
 	parser = argparse.ArgumentParser(
 	formatter_class=argparse.RawDescriptionHelpFormatter,
 	description=textwrap.dedent('''\
 Converts gtf to bed format.
 
         '''))
 	parser.add_argument('--genes', type=str, help='ENST, ENSP, and hugo gene IDs')
 	parser.add_argument('--gtf', type=str, required=True, default=None,  help="annotation gtf")
 	parser.add_argument('--bed', type=str, required=True, default=None,  help="output bedfile")
 	args = parser.parse_args()
 
 	gdict = dict()
+	# get HUGO IDs
 	with open(args.genes, 'r') as g:
 		for line in g:
 			ensg, gene, enst, ensp = line.split("\t")
-			ensg = ensg.split('.')[0]
+			ensg = enst.split('.')[0]
 			gdict[ensg] = gene
 			ensp = ensp.strip()
 			if ensp != '':
-				gdict[ensp.split('.')[0]] = [gene, ensg]
+				gdict[ensp.split('.')[0]] = [gene, enst]
                     
 	g.close()
 	gtf_to_bed(args.bed, args.gtf, gdict)
 
 
 class Gene(object):
 	def __init__(self, itemRgb, tx, chrom, start, end, strand, extrafields):
 		self.itemRgb = itemRgb
 		self.txID = tx
 		self.chrom = chrom
 		self.geneStart = start
 		self.geneEnd = end
 		self.strand = strand
 		self.blockList = [BedBlock(start, end)]
 		self.extraCols = extrafields
@@ -65,54 +66,65 @@
 		            last_block = merged_blocks[-1]
 		            if block.start <= last_block.end:
 		                new_end = max(last_block.end, block.end)
 		                merged_blocks[-1] = BedBlock(last_block.start, new_end)
 		            else:
 		                # No overlap, so just add the current block
 		                merged_blocks.append(block)
 		self.blocks = merged_blocks
 
 
 def ids_from_gtf(descrField, gdict):
 	'''Parse the last field of a gtf to extract transcript and gene IDs'''
 	pairs = descrField.split("; ")
 	data_dict = {pair.split(" ", 1)[0].strip(): pair.split(" ", 1)[1].strip('"') for pair in pairs}
 	# gene_id, transcript_id, gene_type, transcript_type, gene_ens_id, transcript_ens_id, gene_ens_id, gene_parent_id, transcript_parent_id, protein_parent_id
-	# note: data_dict['gene_type'] is always identical data_dict['transcript_type'], I checked.
-	#del data_dict['transcript_parent_id']
-	hugo = 'NA'
-	if data_dict['gene_parent_id'] in gdict:
-		hugo = gdict[data_dict['gene_parent_id']]
+	del data_dict['transcript_type']
+	del data_dict['exon']
+	# as BED item name we will use the Yale ID (PGOHUMG0000290588) unless there's a HUGO ID
+	pgeneName = data_dict['transcript_id']
+	# We want the HUGO ID for the pseudogene itself, if present, and for the parent.
+	pgeneHugo = 'NA'
+	parentHugo = 'NA'
+	if data_dict['transcript_ens_id'] in gdict:
+		pgeneHugo = gdict[data_dict['transcript_ens_id']]
+		pgeneName = pgeneHugo
+	elif data_dict['transcript_ens_id'] != 'NA':
+		print("WARNING, CANNOT FIND", data_dict['transcript_ens_id']) # doesn't happen
+	if data_dict['transcript_parent_id'] in gdict:
+		parentHugo = gdict[data_dict['transcript_parent_id']]
 	# in some cases the transcript and gene parents are NA
 	elif data_dict['protein_parent_id'] in gdict:
-		[hugo, ensg]  = gdict[data_dict['protein_parent_id']]
+		[parentHugo, ensg]  = gdict[data_dict['protein_parent_id']]
 		# replace ensg parent with current
-		data_dict['gene_parent_id'] = ensg
+		data_dict['transcript_parent_id'] = ensg
 
 	#if hugo == 'NA':
 	#	print(data_dict['gene_parent_id'], data_dict['protein_parent_id'])
 	# 535 transcripts truly don't have parents so some remain NA
-	itemRgb = '255,140,0' # dark orange (pseudogene)
-	if data_dict['gene_type'] == 'unprocessed_pseudogene':
+	if data_dict['gene_type'] == 'pseudogene':
+		itemRgb = '255,140,0' # dark orange 
+		data_dict['gene_type'] = 'unspecified_pseudogene' # clarify
+	elif data_dict['gene_type'] == 'unprocessed_pseudogene':
 		itemRgb = '0,0,255' # blue
 	elif data_dict['gene_type'] == 'processed_pseudogene':
 		itemRgb = '85,107,47' # dark olive green 
-	# drop transcript parent because we might have changed the gene parent
-	extrafields = tuple([hugo, data_dict['gene_type'], data_dict['gene_id'],  
-		data_dict['gene_ens_id'], data_dict['gene_parent_id'], 
-		data_dict['protein_parent_id']])
-	return itemRgb, data_dict['transcript_id'], extrafields
+	extrafields = [pgeneHugo, parentHugo] + list(data_dict.values())
+	#extrafields = tuple([hugo, data_dict['gene_type'], data_dict['gene_id'],  
+	#	data_dict['gene_ens_id'], data_dict['gene_parent_id'], 
+	#	data_dict['protein_parent_id']])
+	return itemRgb, pgeneName, extrafields
 
 
 def gtf_to_bed(outputfile, gtf, gdict):
 	geneObj = None
 	with open(outputfile, 'wt') as outfile:
 		writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep)
 
 		prev_transcript, blockstarts, blocksizes, prev_gene, prev_chrom, prev_strand = [None, None, None, None, None, None]
 		blockList = []
 		# extract all exons from the gtf, keep exons grouped by transcript
 		for line in open(gtf):  
 			if line.startswith('#'):
 				continue
 			line = line.rstrip().split('\t')
 			chrom, ty, start, end, strand = line[0], line[2], int(line[3]) - 1, int(line[4]), line[6]