a95486084d48d3739b6d995a11f2cea55754f10c jeltje.van.baren Fri Apr 18 12:16:18 2025 -0700 better labeling and searching of Yale pseudogene track diff --git src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py index 952675c32cb..07d204432e4 100755 --- src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py +++ src/hg/makeDb/outside/pseudogenes/pseudoPipeToBed.py @@ -9,38 +9,39 @@ from pycbio.hgdata.bed import Bed, BedBlock, intArraySplit def main(): parser = argparse.ArgumentParser( formatter_class=argparse.RawDescriptionHelpFormatter, description=textwrap.dedent('''\ Converts gtf to bed format. ''')) parser.add_argument('--genes', type=str, help='ENST, ENSP, and hugo gene IDs') parser.add_argument('--gtf', type=str, required=True, default=None, help="annotation gtf") parser.add_argument('--bed', type=str, required=True, default=None, help="output bedfile") args = parser.parse_args() gdict = dict() + # get HUGO IDs with open(args.genes, 'r') as g: for line in g: ensg, gene, enst, ensp = line.split("\t") - ensg = ensg.split('.')[0] + ensg = enst.split('.')[0] gdict[ensg] = gene ensp = ensp.strip() if ensp != '': - gdict[ensp.split('.')[0]] = [gene, ensg] + gdict[ensp.split('.')[0]] = [gene, enst] g.close() gtf_to_bed(args.bed, args.gtf, gdict) class Gene(object): def __init__(self, itemRgb, tx, chrom, start, end, strand, extrafields): self.itemRgb = itemRgb self.txID = tx self.chrom = chrom self.geneStart = start self.geneEnd = end self.strand = strand self.blockList = [BedBlock(start, end)] self.extraCols = extrafields @@ -65,54 +66,65 @@ last_block = merged_blocks[-1] if block.start <= last_block.end: new_end = max(last_block.end, block.end) merged_blocks[-1] = BedBlock(last_block.start, new_end) else: # No overlap, so just add the current block merged_blocks.append(block) self.blocks = merged_blocks def ids_from_gtf(descrField, gdict): '''Parse the last field of a gtf to extract transcript and gene IDs''' pairs = descrField.split("; ") data_dict = {pair.split(" ", 1)[0].strip(): pair.split(" ", 1)[1].strip('"') for pair in pairs} # gene_id, transcript_id, gene_type, transcript_type, gene_ens_id, transcript_ens_id, gene_ens_id, gene_parent_id, transcript_parent_id, protein_parent_id - # note: data_dict['gene_type'] is always identical data_dict['transcript_type'], I checked. - #del data_dict['transcript_parent_id'] - hugo = 'NA' - if data_dict['gene_parent_id'] in gdict: - hugo = gdict[data_dict['gene_parent_id']] + del data_dict['transcript_type'] + del data_dict['exon'] + # as BED item name we will use the Yale ID (PGOHUMG0000290588) unless there's a HUGO ID + pgeneName = data_dict['transcript_id'] + # We want the HUGO ID for the pseudogene itself, if present, and for the parent. + pgeneHugo = 'NA' + parentHugo = 'NA' + if data_dict['transcript_ens_id'] in gdict: + pgeneHugo = gdict[data_dict['transcript_ens_id']] + pgeneName = pgeneHugo + elif data_dict['transcript_ens_id'] != 'NA': + print("WARNING, CANNOT FIND", data_dict['transcript_ens_id']) # doesn't happen + if data_dict['transcript_parent_id'] in gdict: + parentHugo = gdict[data_dict['transcript_parent_id']] # in some cases the transcript and gene parents are NA elif data_dict['protein_parent_id'] in gdict: - [hugo, ensg] = gdict[data_dict['protein_parent_id']] + [parentHugo, ensg] = gdict[data_dict['protein_parent_id']] # replace ensg parent with current - data_dict['gene_parent_id'] = ensg + data_dict['transcript_parent_id'] = ensg #if hugo == 'NA': # print(data_dict['gene_parent_id'], data_dict['protein_parent_id']) # 535 transcripts truly don't have parents so some remain NA - itemRgb = '255,140,0' # dark orange (pseudogene) - if data_dict['gene_type'] == 'unprocessed_pseudogene': + if data_dict['gene_type'] == 'pseudogene': + itemRgb = '255,140,0' # dark orange + data_dict['gene_type'] = 'unspecified_pseudogene' # clarify + elif data_dict['gene_type'] == 'unprocessed_pseudogene': itemRgb = '0,0,255' # blue elif data_dict['gene_type'] == 'processed_pseudogene': itemRgb = '85,107,47' # dark olive green - # drop transcript parent because we might have changed the gene parent - extrafields = tuple([hugo, data_dict['gene_type'], data_dict['gene_id'], - data_dict['gene_ens_id'], data_dict['gene_parent_id'], - data_dict['protein_parent_id']]) - return itemRgb, data_dict['transcript_id'], extrafields + extrafields = [pgeneHugo, parentHugo] + list(data_dict.values()) + #extrafields = tuple([hugo, data_dict['gene_type'], data_dict['gene_id'], + # data_dict['gene_ens_id'], data_dict['gene_parent_id'], + # data_dict['protein_parent_id']]) + return itemRgb, pgeneName, extrafields def gtf_to_bed(outputfile, gtf, gdict): geneObj = None with open(outputfile, 'wt') as outfile: writer = csv.writer(outfile, delimiter='\t', lineterminator=os.linesep) prev_transcript, blockstarts, blocksizes, prev_gene, prev_chrom, prev_strand = [None, None, None, None, None, None] blockList = [] # extract all exons from the gtf, keep exons grouped by transcript for line in open(gtf): if line.startswith('#'): continue line = line.rstrip().split('\t') chrom, ty, start, end, strand = line[0], line[2], int(line[3]) - 1, int(line[4]), line[6]