c545301381d0663f19215c3642acfea9b24d407d
jeltje.van.baren
Tue Jan 7 13:09:28 2025 -0800
Yale pseudogene and parents composite track (pseudoPipe)
diff --git src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py
new file mode 100755
index 0000000..13aaf73
--- /dev/null
+++ src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py
@@ -0,0 +1,115 @@
+#!/usr/bin/env python3
+import sys
+import csv
+import os
+import re
+import argparse
+import textwrap
+from collections import namedtuple
+sys.path.append('/hive/groups/browser/pycbio/lib')
+from pycbio.hgdata.bed import Bed, BedBlock, intArraySplit, BedReader
+
+def main():
+ parser = argparse.ArgumentParser(
+ formatter_class=argparse.RawDescriptionHelpFormatter,
+ description=textwrap.dedent('''\
+Read pgenes bed file and annotation files, construct parent track with links to child tracks.
+Maybe update pgenes bed with parent links.
+
+ '''))
+ parser.add_argument('--pgenebed', type=str, help='ENST, ENSP, and hugo gene IDs')
+ parser.add_argument('--annotbed', type=str, required=True, default=None, help="annotation gtf, sorted by name then chrom")
+ parser.add_argument('--pgeneout', type=str, required=True, default=None, help="output pgene bedfile")
+ parser.add_argument('--parentout', type=str, required=True, default=None, help="output parent bedfile")
+ args = parser.parse_args()
+
+ ensg = namedtuple('ensg', ['chrom', 'start', 'end', 'strand'])
+ # gene ID dict keeping track of starts and ends
+ ensgDict = dict()
+ curGene, start, end, curChr, strand = 'NA', 'NA', 'NA', 'NA', 'NA'
+ parentlink = '{geneID}'
+ with open(args.annotbed, 'r') as INF:
+ for bedObj in BedReader(INF, numStdCols=6):
+ if bedObj.name == curGene:
+ if(bedObj.chrom) == curChr: # skip chrY pseudoautosomal (for now?)
+ start = min(bedObj.start, int(start))
+ end = max(bedObj.end, int(end))
+ #else:
+ # print('skipping', bedObj, file=sys.stderr)
+ else:
+ ensgDict[curGene] = ensg(curChr, start, end, strand)
+ start = int(bedObj.start)
+ end = int(bedObj.end)
+ strand = bedObj.strand
+ curChr = bedObj.chrom
+ curGene = bedObj.name
+ # final one
+ ensgDict[curGene] = ensg(curChr, start, end, strand)
+
+ # if a gene has a pseudogene, put it in the parent set
+ parentDict = dict()
+ with open(args.pgenebed, 'r') as INF, open(args.pgeneout, 'w') as OUTF:
+ for bedObj in BedReader(INF, numStdCols=12):
+ # parent ENSG
+ parent = bedObj.extraCols[4]
+ if parent == 'NA':
+ bedObj.extraCols[4] = 'parent not listed'
+ bedObj.write(OUTF)
+ continue
+ if not parent in ensgDict:
+ print('ERROR, missing', parent)
+ sys.exit()
+ # if we find a parent it's time to create a proper object and collect the children
+ parentTuple = ensgDict[parent]
+ if parent in parentDict:
+ parentDict[parent].add(bedObj)
+ else:
+ parentDict[parent] = Gene(parent, parentTuple, bedObj)
+ # add a link to the parent in the child bedObj
+
+ url = parentlink.format(chrom=parentTuple.chrom, start=parentTuple.start,
+ end=parentTuple.end, geneID=parent)
+ # replace the parent ID with the URL to it
+ bedObj.extraCols[4] = url
+ bedObj.write(OUTF)
+ # now print parent track, adding one block per child for each parent
+ with open(args.parentout, 'w') as OUTF2:
+ for parentObj in parentDict.values():
+ parentObj.write(OUTF2)
+
+
+
+class Gene(object):
+ def __init__(self, parent, parentTuple, childObj):
+ self.name = parent
+ self.chrom = parentTuple.chrom
+ self.geneStart = parentTuple.start
+ self.geneEnd = parentTuple.end
+ self.strand = parentTuple.strand
+ self.children = [childObj]
+ self.symbol = childObj.extraCols[0]
+ def add(self, childObj):
+ self.children.append(childObj)
+ def write(self, FH):
+ '''Create parent and children Bed objects, and print them'''
+ parentColor = "128,0,128" # Dark Purple
+ childColor = "192,192,192" # Gray
+ # mouseover link for children
+ baselink = '{pgenetype} '
+ baselink += '{geneID} ({chrom}:{start}-{end})'
+ # let's make the parent wide and the children narrow
+ bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=self.name, strand=self.strand,
+ thickStart=self.geneStart, thickEnd=self.geneEnd, itemRgb=parentColor, numStdCols=9,
+ extraCols=[self.symbol, 'N/A'])
+ bedObj.write(FH)
+ for childObj in self.children:
+ url = baselink.format(pgenetype=childObj.extraCols[1], chrom=childObj.chrom, start=childObj.start,
+ end=childObj.end, geneID=childObj.name)
+ bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=childObj.name, strand=self.strand,
+ itemRgb=childColor, numStdCols=9, extraCols=['', url])
+ bedObj.write(FH)
+
+
+
+if __name__ == "__main__":
+ main()