src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py a95486084d48d3739b6d995a11f2cea55754f10c

a95486084d48d3739b6d995a11f2cea55754f10c
jeltje.van.baren
  Fri Apr 18 12:16:18 2025 -0700
better labeling and searching of Yale pseudogene track

diff --git src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py
index 13aaf739a19..8e124965b97 100755
--- src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py
+++ src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py
@@ -1,115 +1,119 @@
 #!/usr/bin/env python3
 import sys
 import csv
 import os
 import re
 import argparse
 import textwrap
 from collections import namedtuple
 sys.path.append('/hive/groups/browser/pycbio/lib')
 from pycbio.hgdata.bed import Bed, BedBlock, intArraySplit, BedReader
 
 def main():
 	parser = argparse.ArgumentParser(
 	formatter_class=argparse.RawDescriptionHelpFormatter,
 	description=textwrap.dedent('''\
 Read pgenes bed file and annotation files, construct parent track with links to child tracks.
 Maybe update pgenes bed with parent links.
 
         '''))
 	parser.add_argument('--pgenebed', type=str, help='ENST, ENSP, and hugo gene IDs')
 	parser.add_argument('--annotbed', type=str, required=True, default=None,  help="annotation gtf, sorted by name then chrom")
 	parser.add_argument('--pgeneout', type=str, required=True, default=None,  help="output pgene bedfile")
 	parser.add_argument('--parentout', type=str, required=True, default=None,  help="output parent bedfile")
 	args = parser.parse_args()
 
-	ensg = namedtuple('ensg', ['chrom', 'start', 'end', 'strand'])
+	enst = namedtuple('enst', ['chrom', 'start', 'end', 'strand'])
+    # First read the annotation file
+    # we will be representing the parent genes as blocks, determining the start and end based on 
+    # the gene's transcripts in the annotation file
 	# gene ID dict keeping track of starts and ends
-	ensgDict = dict()
+	enstDict = dict()
 	curGene, start, end, curChr, strand = 'NA', 'NA', 'NA', 'NA', 'NA'
 	parentlink = '<a href="hgTracks?db=hg38&position={chrom}:{start}-{end}&parents=pack">{geneID}</a>'
 	with open(args.annotbed, 'r') as INF:
-		for bedObj in BedReader(INF, numStdCols=6):
-			if bedObj.name == curGene:
-				if(bedObj.chrom) == curChr:  # skip chrY pseudoautosomal (for now?)
-					start = min(bedObj.start, int(start))
-					end = max(bedObj.end, int(end))
-				#else:
-				#	print('skipping', bedObj, file=sys.stderr)
-			else:
-				ensgDict[curGene] = ensg(curChr, start, end, strand)
-				start = int(bedObj.start)
-				end = int(bedObj.end)
-				strand = bedObj.strand
-				curChr = bedObj.chrom
-				curGene = bedObj.name
-		# final one
-		ensgDict[curGene] = ensg(curChr, start, end, strand)
-
-	# if a gene has a pseudogene, put it in the parent set
+		for bedObj in BedReader(INF):
+			name = bedObj.name.split('.')[0]
+			enstDict[name] = bedObj
+    # We will create a bed file with parents and one overlapping block per child
+	# based on the pseudogene file: if a transcript has a pseudogene, get its information
+    # from enstDict and print a bed line for it, and one per child
+    # First we gather the information from the pseudogene file and output a
+    # new pseudogene file with a link to the parent transcript (where possible)
 	parentDict = dict()
 	with open(args.pgenebed, 'r') as INF, open(args.pgeneout, 'w') as OUTF:
 		for bedObj in BedReader(INF, numStdCols=12):
-			# parent ENSG
-			parent = bedObj.extraCols[4]
+			# parent ENST
+			parent = bedObj.extraCols[8].split('.')[0]
 			if parent == 'NA':
-				bedObj.extraCols[4] = 'parent not listed'
+				bedObj.extraCols.append('parent not listed')
 				bedObj.write(OUTF)
 				continue
-			if not parent in ensgDict:
+			if not parent in enstDict:
 				print('ERROR, missing', parent)
 				sys.exit()
 			# if we find a parent it's time to create a proper object and collect the children
-			parentTuple = ensgDict[parent]
+			parentObj = enstDict[parent]
 			if parent in parentDict:
 				parentDict[parent].add(bedObj)
 			else:
-				parentDict[parent] = Gene(parent, parentTuple, bedObj)
+				parentDict[parent] = Gene(parent, parentObj, bedObj)
 			# add a link to the parent in the child bedObj
 
-			url = parentlink.format(chrom=parentTuple.chrom, start=parentTuple.start,
-				end=parentTuple.end, geneID=parent) 
-			# replace the parent ID with the URL to it
-			bedObj.extraCols[4] = url
+			url = parentlink.format(chrom=parentObj.chrom, start=parentObj.start,
+				end=parentObj.end, geneID=parent) 
+			bedObj.extraCols.append(url)
 			bedObj.write(OUTF)
 	# now print parent track, adding one block per child for each parent
 	with open(args.parentout, 'w') as OUTF2:
 		for parentObj in parentDict.values():
 			parentObj.write(OUTF2)
 
 
 
 class Gene(object):
-	def __init__(self, parent, parentTuple, childObj):
+	'''Parent object'''
+	def __init__(self, parent, parentObj, childObj):
 		self.name = parent
-		self.chrom = parentTuple.chrom
-		self.geneStart = parentTuple.start
-		self.geneEnd = parentTuple.end
-		self.strand = parentTuple.strand
+		self.parentBedObj = parentObj
 		self.children = [childObj]
-		self.symbol = childObj.extraCols[0]
+		self.symbol = childObj.extraCols[1]
 	def add(self, childObj):
 		self.children.append(childObj)
 	def write(self, FH):
 		'''Create parent and children Bed objects, and print them'''
 		parentColor = "128,0,128"   # Dark Purple
 		childColor = "192,192,192"  # Gray
-		# mouseover link for children
-		baselink = '{pgenetype} <a href="hgTracks?db=hg38&position={chrom}:{start}-{end}&pseudogenes=pack">'
-		baselink += '{geneID}</a> ({chrom}:{start}-{end})'
-		# let's make the parent wide and the children narrow
-		bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=self.name, strand=self.strand, 
-		thickStart=self.geneStart, thickEnd=self.geneEnd, itemRgb=parentColor, numStdCols=9, 
-		extraCols=[self.symbol, 'N/A'])
-		bedObj.write(FH)
+		# extrafields are gene(symbol), gene type, and coordinates
+		# for the parent, this is a series of coordinates, one per child
+
+		# The children (blocks below the parent) get a mouseover pointing to their location 
+		coordURL = '<a href="hgTracks?db=hg38&position={chrom}:{start}-{end}&pseudogenes=pack">'
+		coordURL += '{chrom}:{start}-{end}</a>'
+		# mouseover in the parent gives the children's IDs with links to their locations
+		idURL = '<a href="hgTracks?db=hg38&position={chrom}:{start}-{end}&pseudogenes=pack">'
+		idURL += '{childId}</a> '
+		# print the children as blocks (we don't know how much they overlap)
+		childObjects = []
+		coordString = ''
 		for childObj in self.children:
-			url = baselink.format(pgenetype=childObj.extraCols[1], chrom=childObj.chrom, start=childObj.start, 
-				end=childObj.end, geneID=childObj.name)
-			bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=childObj.name, strand=self.strand, 
-			itemRgb=childColor, numStdCols=9, extraCols=['', url])
-			bedObj.write(FH)
+			coords_html = coordURL.format(chrom=childObj.chrom, start=childObj.start, end=childObj.end)
+			# create bedObject to write
+			bedObj = Bed(self.parentBedObj.chrom, self.parentBedObj.chromStart, self.parentBedObj.chromEnd, 
+			name=childObj.name, strand=self.parentBedObj.strand, 
+			itemRgb=childColor, numStdCols=12, extraCols=[childObj.name, childObj.extraCols[4], coords_html])
+			childObjects.append(bedObj)
+			# append child coordinate link to string (for adding to the parent)
+			coordString += idURL.format(chrom=childObj.chrom, start=childObj.start,
+				end=childObj.end, childId=childObj.name)
+		# update the parent with children links and print
+		self.parentBedObj.extraCols=[self.symbol, 'gene', coordString]
+		self.parentBedObj.itemRgb = parentColor
+		self.parentBedObj.write(FH)
+		# now print children
+		[bedObj.write(FH) for bedObj in childObjects]
 
 
 
 if __name__ == "__main__":
 	main()