c545301381d0663f19215c3642acfea9b24d407d jeltje.van.baren Tue Jan 7 13:09:28 2025 -0800 Yale pseudogene and parents composite track (pseudoPipe) diff --git src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py new file mode 100755 index 0000000..13aaf73 --- /dev/null +++ src/hg/makeDb/outside/pseudogenes/pseudoPipeParents.py @@ -0,0 +1,115 @@ +#!/usr/bin/env python3 +import sys +import csv +import os +import re +import argparse +import textwrap +from collections import namedtuple +sys.path.append('/hive/groups/browser/pycbio/lib') +from pycbio.hgdata.bed import Bed, BedBlock, intArraySplit, BedReader + +def main(): + parser = argparse.ArgumentParser( + formatter_class=argparse.RawDescriptionHelpFormatter, + description=textwrap.dedent('''\ +Read pgenes bed file and annotation files, construct parent track with links to child tracks. +Maybe update pgenes bed with parent links. + + ''')) + parser.add_argument('--pgenebed', type=str, help='ENST, ENSP, and hugo gene IDs') + parser.add_argument('--annotbed', type=str, required=True, default=None, help="annotation gtf, sorted by name then chrom") + parser.add_argument('--pgeneout', type=str, required=True, default=None, help="output pgene bedfile") + parser.add_argument('--parentout', type=str, required=True, default=None, help="output parent bedfile") + args = parser.parse_args() + + ensg = namedtuple('ensg', ['chrom', 'start', 'end', 'strand']) + # gene ID dict keeping track of starts and ends + ensgDict = dict() + curGene, start, end, curChr, strand = 'NA', 'NA', 'NA', 'NA', 'NA' + parentlink = '{geneID}' + with open(args.annotbed, 'r') as INF: + for bedObj in BedReader(INF, numStdCols=6): + if bedObj.name == curGene: + if(bedObj.chrom) == curChr: # skip chrY pseudoautosomal (for now?) + start = min(bedObj.start, int(start)) + end = max(bedObj.end, int(end)) + #else: + # print('skipping', bedObj, file=sys.stderr) + else: + ensgDict[curGene] = ensg(curChr, start, end, strand) + start = int(bedObj.start) + end = int(bedObj.end) + strand = bedObj.strand + curChr = bedObj.chrom + curGene = bedObj.name + # final one + ensgDict[curGene] = ensg(curChr, start, end, strand) + + # if a gene has a pseudogene, put it in the parent set + parentDict = dict() + with open(args.pgenebed, 'r') as INF, open(args.pgeneout, 'w') as OUTF: + for bedObj in BedReader(INF, numStdCols=12): + # parent ENSG + parent = bedObj.extraCols[4] + if parent == 'NA': + bedObj.extraCols[4] = 'parent not listed' + bedObj.write(OUTF) + continue + if not parent in ensgDict: + print('ERROR, missing', parent) + sys.exit() + # if we find a parent it's time to create a proper object and collect the children + parentTuple = ensgDict[parent] + if parent in parentDict: + parentDict[parent].add(bedObj) + else: + parentDict[parent] = Gene(parent, parentTuple, bedObj) + # add a link to the parent in the child bedObj + + url = parentlink.format(chrom=parentTuple.chrom, start=parentTuple.start, + end=parentTuple.end, geneID=parent) + # replace the parent ID with the URL to it + bedObj.extraCols[4] = url + bedObj.write(OUTF) + # now print parent track, adding one block per child for each parent + with open(args.parentout, 'w') as OUTF2: + for parentObj in parentDict.values(): + parentObj.write(OUTF2) + + + +class Gene(object): + def __init__(self, parent, parentTuple, childObj): + self.name = parent + self.chrom = parentTuple.chrom + self.geneStart = parentTuple.start + self.geneEnd = parentTuple.end + self.strand = parentTuple.strand + self.children = [childObj] + self.symbol = childObj.extraCols[0] + def add(self, childObj): + self.children.append(childObj) + def write(self, FH): + '''Create parent and children Bed objects, and print them''' + parentColor = "128,0,128" # Dark Purple + childColor = "192,192,192" # Gray + # mouseover link for children + baselink = '{pgenetype} ' + baselink += '{geneID} ({chrom}:{start}-{end})' + # let's make the parent wide and the children narrow + bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=self.name, strand=self.strand, + thickStart=self.geneStart, thickEnd=self.geneEnd, itemRgb=parentColor, numStdCols=9, + extraCols=[self.symbol, 'N/A']) + bedObj.write(FH) + for childObj in self.children: + url = baselink.format(pgenetype=childObj.extraCols[1], chrom=childObj.chrom, start=childObj.start, + end=childObj.end, geneID=childObj.name) + bedObj = Bed(self.chrom, self.geneStart, self.geneEnd, name=childObj.name, strand=self.strand, + itemRgb=childColor, numStdCols=9, extraCols=['', url]) + bedObj.write(FH) + + + +if __name__ == "__main__": + main()