5d72bddf0393f961e00b2b354457fdcc8e62bc8d lrnassar Thu May 22 16:36:33 2025 -0700 Updating the panelApp otto job to simplify the overall script from 3 seprate ones to a single call, and also adding support for the similar (but not identical) panelApp australia resource, refs #35758 diff --git src/hg/utils/otto/panelApp/doPanelApp.py src/hg/utils/otto/panelApp/doPanelApp.py new file mode 100755 index 00000000000..9d634e6af7b --- /dev/null +++ src/hg/utils/otto/panelApp/doPanelApp.py @@ -0,0 +1,982 @@ +#!/hive/data/outside/otto/panelApp/venv/bin/python3 +from datetime import date +import os +import shutil +import requests +import time +import json +import pandas as pd +import sys +import re +import gzip +import logging + +def getArchDir(db): + " return hgwdev archive directory given db " + dateStr = date.today().strftime("%Y-%m-%d") + archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/panelApp/%s" % (db, dateStr) + if not os.path.isdir(archDir): + os.makedirs(archDir) + return archDir + +def writeBb(hg19Table, hg38Table, subTrack): + " sort the pandas tables, write to BED and convert " + for db in ["hg19", "hg38"]: + archDir = getArchDir(db) + + bedFname = "current/%s/%s.bed" % (db, subTrack) + bbFname = "current/%s/%s.bb.tmp" % (db, subTrack) + + if db=="hg19": + pdTable = hg19Table + else: + pdTable = hg38Table + + # for cnvs, one of the arguments can be None + if pdTable is None: + continue + + pdTable.sort_values(by=['chrom','chromStart'], ascending = (True, True), inplace=True) + pdTable = pdTable.applymap(lambda x: x.replace('\t', ' ') if isinstance(x, str) else x) + pdTable.to_csv(bedFname, sep='\t', index=False, header=None) + + asFname = subTrack+".as" + + # -extraIndex=geneName + cmd = "bedToBigBed -tab -as=%s -type=bed9+26 %s /hive/data/genomes/%s/chrom.sizes %s" % (asFname, bedFname, db, bbFname) + assert(os.system(cmd)==0) + + # put a copy into the archive + archBbFname = archDir+"/%s.bb" % subTrack + shutil.copyfile(bbFname, archBbFname) + +def updateGbdbSymlinks(country): + " update the symlinks in /gbdb. Not really necessary but kept this code just in case. " + if country == "Australia": + subtracks = ["genesAus", "tandRepAus", "cnvAus"] + else: + subtracks = ["genes", "tandRep", "cnv"] + for db in ["hg19", "hg38"]: + archDir = getArchDir(db) + for subTrack in subtracks: + if db=="hg19" and "cnv" in subTrack: + continue # no cnv on hg19 + cmd = "ln -sf `pwd`/current/%s/%s.bb /gbdb/%s/panelApp/%s.bb" % (db, subTrack, db, subTrack) + assert(os.system(cmd)==0) + +def flipFiles(country): + " rename the .tmp files to the final filenames " + if country == "Australia": + subtracks = ["genesAus", "tandRepAus", "cnvAus"] + else: + subtracks = ["genes", "tandRep", "cnv"] + for db in ["hg19", "hg38"]: + archDir = getArchDir(db) + for subTrack in subtracks: + if db=="hg19" and "cnv" in subTrack: + # no cnvs for hg19 yet + continue + oldFname = "current/%s/%s.bb.tmp" % (db, subTrack) + newFname = "current/%s/%s.bb" % (db, subTrack) + os.replace(oldFname, newFname) + +def getAllPages(url, results=[]): + " recursively download all pages. Stack should be big enough " + try: + myResponse = requests.get(url) + if (myResponse.ok): + jData = json.loads(myResponse.content.decode()) + # If jData is empty create, else append + if "error" in jData.keys() or not "results" in jData.keys(): + raise Exception("Error in keys when downloading %s" % url) + + if "count" in jData and not "page" in url: + print("API says that there are %d results for url %s" % (jData["count"], url)) + results.extend(jData["results"]) + + if "next" in jData and jData["next"] is not None: # need to get next URL + return getAllPages(jData["next"], results) + else: + raise Exception("Error in object when downloading %s" % url) + except: + raise Exception("HTTP Error when downloading %s" % url) + return results + +def downloadCnvs(url): + Error = True + continuous_count=0 + res = getAllPages(url, results=[]) + + num_gene_data = len(res) + print("Got %d CNVs" % num_gene_data) + count = 0 + continuous_count = 0 + hg19_dict = dict() + hg38_dict = dict() + + for geneCount, cnvData in enumerate(res): + temp_attribute_dictionary = dict() + string_dict_key = 'gene_{}'.format(geneCount) + + chromo = cnvData['chromosome'] + chromosome = 'chr' + chromo + + start_coordinates = cnvData['grch38_coordinates'][0] + end_coordinates = cnvData['grch38_coordinates'][1] + + score = '0' + strand = '.' + thickStart = start_coordinates + thickEnd = end_coordinates + blockCount = '1' + blockSizes = int(end_coordinates) - int(start_coordinates) + blockStarts = 0 + + confidence_level = cnvData['confidence_level'] + + rgb_dict = {'0' : '100,100,100', '3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} + itemRgb = rgb_dict[confidence_level] + + entity_name = cnvData['entity_name'] + entity_type = cnvData['entity_type'] + evidence = ' '.join(cnvData['evidence']) + + haploinsufficiency_score = cnvData.get('haploinsufficiency_score') + if not haploinsufficiency_score: + haploinsufficiency_score = '' + + moi = cnvData.get('mode_of_inheritance') + if not moi: + moi = '' + + disease_group = cnvData['panel'].get('disease_group') + if not disease_group: + disease_group = '' + + disease_sub_group = cnvData['panel'].get('disease_sub_group') + if not disease_sub_group: + disease_sub_group = '' + + # idd = Panel ID + idd = cnvData['panel'].get('id') + if not idd: + idd = '' + + panel_name = cnvData['panel'].get('name') + if not panel_name: + panel_name = '' + + relevant_disorders = ' '.join(cnvData['panel'].get('relevant_disorders', [])) + if not relevant_disorders: + relevant_disorders = '' + + status = cnvData['panel'].get('status') + if not status: + status = '' + + ''' + types = cnvData['panel'].get['types') + if not types: + types = '' + else: + types = str(types).replace("{","").replace("}", "").replace("'", "") + types = types[1:-1] + ''' + + types = cnvData['panel']['types'][0].get('name') + + version = cnvData['panel']['version'] + if 'genomicsengland' in url: + if float(version) < 0.99: + continue + if not version: + version = '' + else: + if not version: + version = '' + + penetrance = cnvData.get('penetrance') + if not penetrance: + penetrance = '' + + phenotypes = ' '.join(cnvData.get('phenotypes', [])) + if not phenotypes: + phenotypes = '' + + publications = ' '.join(cnvData['publications']) + if not publications: + publications = '' + + #required_overlap_percentage = cnvData['required_overlap_percentage'] + tags = cnvData['tags'] + if not tags: + tags = '' + + triplosensitivity_score = cnvData.get('triplosensitivity_score') + if not triplosensitivity_score: + triplosensitivity_score = '' + + type_of_variants = None + if "type_of_variants" in cnvData: + type_of_variants = cnvData['type_of_variants'] + if not type_of_variants: + type_of_variants = '' + + verbose_name = cnvData.get('verbose_name') + if not verbose_name: + verbose_name = '' + + # Mouse Over Field + mouseOverField = "" + try: + mof = 'Gene: ' + entity_name + ';' + ' Panel: ' + name + ';' + ' MOI: ' + moi + ';' + ' Phenotypes: ' + phenotypes + ';' + ' Confidence: ' + confidence_level + ';' + mouseOverField = mof + except: + mouseOverField = '' + + # name + name = '{} ({})'.format(entity_name, panel_name) + + hg38_dict[string_dict_key] = [chromosome, start_coordinates, end_coordinates, name, score, strand, + thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts, confidence_level, + panel_name, idd, entity_name, entity_type, evidence, haploinsufficiency_score, moi, disease_group, + disease_sub_group, relevant_disorders, status, types, version, penetrance, phenotypes, + publications, triplosensitivity_score, type_of_variants, verbose_name, mouseOverField] + + #------------------------------------------------------------------------------- + + continuous_count = continuous_count + 1 + #count = count + 1 + + # Removes new lines + for key, item in hg38_dict.items(): + strip_list = list() + for i in item: + try: + strip_list.append(i.replace('\t', ' ').strip().strip("\n").strip("\r")) + except: + strip_list.append(i) + hg38_dict[key] = strip_list + + pd_38_table = pd.DataFrame.from_dict(hg38_dict) + pd_38_table = pd_38_table.T + pd_38_table.columns = ['chrom', 'chromStart', 'End', 'name', 'Score', 'strand', 'thickStart', 'thickEnd', + 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'Confidence Level', + 'Panel Name', 'Panel ID', 'Entity Name', 'Entity Type', 'Evidence', 'ClinGen Haploinsufficiency Score', + 'Mode of Inheritance', 'Disease Group', 'Disease Sub Group', 'Relevant Disorders', + 'Status', 'Types', 'Version Created', 'Penetrance', 'Phenotypes', 'Publications', + 'CinGen Triplosensitivity Score', 'Type of Variants', 'Verbose Name', 'Mouse Over Field'] + #pd_38_table = pd_38_table.sort_values(by=['chromosome', 'Start'], ascending = (True, True)) + #pd_38_table.to_csv('hg38_region_noheader_sorted.tsv', sep='\t', index=False, header=None) + #pd_38_table.to_csv('hg38_region_header_sorted.tsv', sep='\t', index=False) + return pd_38_table + +def downloadTandReps(url): + Error = True + continuous_count=0 + res = getAllPages(url, results=[]) + + num_gene_data = len(res) + print("Got %d tandem repeats from API" % num_gene_data) + + count = 0 + continuous_count = 0 + hg19_dict = dict() + hg38_dict = dict() + + while count != num_gene_data: + #if count == 10: + # break + string_dict_key = 'gene_{}'.format(continuous_count) + + temp_attribute_dictionary = dict() + chromosome = res[count]['chromosome'] + chromosome = 'chr{}'.format(chromosome) + confidence_level = res[count]['confidence_level'] + entity_name = res[count]['entity_name'] + entity_type = res[count]['entity_type'] + evidence = ' '.join(res[count]['evidence']) + + gene_data = res[count]['gene_data'] + if gene_data: + alias = ' '.join(gene_data.get('alias', [])) + biotype = gene_data['biotype'] + else: + alias = "" + biotype = "" + try: + ensembl_id_37 = gene_data['ensembl_genes']['GRch37']['82']['ensembl_id'] + except: + ensembl_id_37 = "None" + try: + ensembl_id_38 = gene_data['ensembl_genes']['GRch38']['90']['ensembl_id'] + except: + ensembl_id_38 = "None" + + gene_name = gene_data['gene_name'] + #gene_symbol = gene_data['gene_symbol'] + #hgnc_date_symbol_changed = gene_data['hgnc_date_symbol_changed'] + hgnc_id = gene_data['hgnc_id'] + hgnc_symbol = gene_data['hgnc_symbol'] + if str(gene_data['omim_gene']) == "None": + omim_gene = "None" + else: + omim_gene = ' '.join(gene_data['omim_gene']) + grch37_coordinates = res[count]['grch37_coordinates'] + if grch37_coordinates == None: + coordinates = gene_data['ensembl_genes']['GRch37']['82']['location'] + location = coordinates.split(':') + grch37_coordinates = location[1].split('-') + chromStart_19 = int(grch37_coordinates[0]) + chromEnd_19 = int(grch37_coordinates[1]) + + # hg38 + grch38_coordinates = res[count]['grch38_coordinates'] + if grch38_coordinates == None: + coordinates = gene_data['ensembl_genes']['GRch38']['90']['location'] + location = coordinates.split(':') + grch38_coordinates = location[1].split('-') + + mode_of_inheritance = res[count]['mode_of_inheritance'] + normal_repeats = res[count]['normal_repeats'] + chromStart_38 = int(grch38_coordinates[0]) + chromEnd_38 = int(grch38_coordinates[1]) + + panel = res[count]['panel'] + disease_group = panel['disease_group'] + disease_sub_group = panel['disease_sub_group'] + hash_id = panel['hash_id'] + idd = panel['id'] + panel_name = panel['name'] + relevant_disorders = ' '.join(panel['relevant_disorders']) + #relevant_disorders = relevant_disorders[:240] + + stats = panel['stats'] + number_of_gene = stats['number_of_genes'] + number_of_regions = stats['number_of_regions'] + number_of_strs = stats['number_of_strs'] + + status = panel['status'] + #description = panel['types'][0]['description'][:240] + description = panel['types'][0]['description'] + version = panel['version'] + version_created = panel['version_created'] + pathogenic_repeats = res[count]['pathogenic_repeats'] + penetrance = res[count]['penetrance'] + phenotypes = ' '.join(res[count]['phenotypes']) + phenotypes_no_num = ''.join([i for i in phenotypes if not i.isdigit()]) + publications = ' '.join(res[count]['publications']) + repeated_sequence = res[count]['repeated_sequence'] + tags = ' '.join(res[count]['tags']) + + # Check to see if panel_name is not empty + if panel_name: + try: + panel_name = panel_name.split(' - ') + panel_name = panel_name[0] + name = '{} ({})'.format(hgnc_symbol, panel_name) + except: + name = '{} ({})'.format(hgnc_symbol, panel_name) + else: + name = hgnc_symbol + score = 0 + strand = '.' + thickStart_19 = chromStart_19 + thickEnd_19 = chromEnd_19 + thickStart_38 = chromStart_38 + thickEnd_38 = chromEnd_38 + + rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} + # If the confidence level is set to 0, set to 1 + if confidence_level == '0': + confidence_level = '1' + rgb = rgb_dict[confidence_level] + rgb = rgb.strip('"') + blockCount = 1 + + # Cases where coordinates are reads as string data types instead of ints + try: + blockSizes_19 = chromEnd_19 - chromStart_19 + except: + blockSizes_19 = int(chromEnd_19) - int(chromStart_19) + + try: + blockSizes_38 = chromEnd_38 - chromStart_38 + except: + blockSizes_39 = int(chromEnd_38) - int(chromStart_38) + + blockStarts = 0 + geneSymbol = hgnc_symbol + + #------------------------------------------------------------------------------- + + temp19_list = [chromosome, chromStart_19, chromEnd_19, name, score, strand, + thickStart_19, thickEnd_19, rgb, blockCount, blockSizes_19, chromStart_19, geneSymbol, confidence_level, + entity_type, evidence, alias, ensembl_id_37, gene_name, hgnc_id, hgnc_symbol, omim_gene, + mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, + relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, + version, version_created, pathogenic_repeats, penetrance, phenotypes, + publications, repeated_sequence] + + try: + temp19_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp19_list] + except: + hg19_dict[string_dict_key] = temp19_list + + #------------------------------------------------------------------------------- + + temp38_list = [chromosome, chromStart_38, chromEnd_38, name, score, strand, + thickStart_38, thickEnd_38, rgb, blockCount, blockSizes_38, chromStart_38, geneSymbol, confidence_level, + entity_type, evidence, alias, ensembl_id_38, gene_name, hgnc_id, hgnc_symbol, omim_gene, + mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, + relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, + version, version_created, pathogenic_repeats, penetrance, phenotypes, + publications, repeated_sequence] + + try: + temp38_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp38_list] + except: + hg38_dict[string_dict_key] = temp38_list + + #------------------------------------------------------------------------------- + + continuous_count = continuous_count + 1 + count = count + 1 + + pd_19_table = pd.DataFrame.from_dict(hg19_dict) + pd_19_table = pd_19_table.T + pd_19_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', + 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', + 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', + 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', + 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', + 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', + 'phenotypes', 'publications', 'repeated_sequence'] + #pd_19_table = pd_19_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) + #pd_19_table.to_csv('str_hg19.bed', sep='\t', index=False, header=None) + + pd_38_table = pd.DataFrame.from_dict(hg38_dict) + pd_38_table = pd_38_table.T + pd_38_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', + 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', + 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', + 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', + 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', + 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', + 'phenotypes', 'publications', 'repeated_sequence'] + #pd_38_table = pd_38_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) + #pd_38_table.to_csv('hg38_str_noheader_sorted.tsv', sep='\t', index=False, header=None) + + return pd_19_table, pd_38_table + +def getPanelIds(url): + #logging.basicConfig(level=logging.DEBUG) + logging.basicConfig(level=logging.INFO) + logging.getLogger("urllib3").propagate = False + + logging.info("Downloading panel IDs") + panelIds = [] + + gotError = False + while not gotError: + logging.debug("Getting %s" % url) + myResponse = requests.get(url) + + jsonData = myResponse.content + #jsonFh.write(jsonData) + #jsonFh.write("\n".encode()) + data = json.loads(jsonData.decode()) + + for res in data["results"]: + panelIds.append(res["id"]) + + logging.debug("Total Panel IDs downloaded: %s" % len(panelIds)) + url = data["next"] + if url is None: + break + + return panelIds + +def downloadPanels(url): + panelIds = getPanelIds(url) + panelInfos = {} + + for panelId in panelIds: + if 'england' in url: + panelUrl = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId + elif 'aus' in url: + panelUrl = "https://panelapp-aus.org/api/v1/panels/%d/?format=json" % panelId + logging.debug("Getting %s" % panelUrl) + resp = requests.get(panelUrl) + res = resp.json() + + panelInfos[panelId] = res + + return panelInfos + +def getGeneSymbols(url): + try: + panelInfos = downloadPanels(url) + except requests.exceptions.JSONDecodeError: + time.sleep(30) + panelInfos = downloadPanels(url) + + syms = set() + for panelInfo in panelInfos.values(): + for gene in panelInfo["genes"]: + sym = gene["gene_data"]["gene_symbol"] + syms.add(sym) + assert(sym!="") + logging.info("Got %d gene symbols" % len(syms)) + return list(syms) + +def getGenesLocations(jsonFh,url,onlyPanels): + hg19_dict = dict() + hg38_dict = dict() + repeat19 = list() + repeat38 = list() + continuous_count = 0 + genes_missing_info = list() + genes_no_location = list() + + syms = getGeneSymbols(url) + + for sym in syms: + if 'england' in url: + entityUrl = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?entity_name={}&format=json".format(sym) + elif 'aus' in url: + entityUrl = "https://panelapp-aus.org/api/v1/genes/?entity_name={}&format=json".format(sym) + + count = 0 + while True: + try: + myResponse = requests.get(entityUrl) + if myResponse.ok: + break + else: + logging.error("Some error on %s, retrying after 1 minute (trial %d)" % (entityUrl, count)) + except Exception: + logging.error("HTTP error on %s, retrying after 1 minute (trial %d)" % (entityUrl, count)) + + time.sleep(60) # Wait 1 minute before trying again + count += 1 # Count the number of tries before failing + if count > 10: # Quit afer 10 failed attempts + assert False, "Cannot get URL after 10 attempts" + + jsonData = myResponse.content + #jData = myResponse.json() + jData = json.loads(jsonData.decode()) + + jsonFh.write(jsonData) + jsonFh.write("\n".encode()) + + res = jData['results'] + num_gene_variant = len(res) + count = 0 + while count != num_gene_variant: + temp_attribute_dictionary = dict() + string_dict_key = 'gene_{}'.format(continuous_count) + + gene_range_37 = None + gene_range_38 = None + + try: + ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location'] + location_37 = ensembl_genes_GRch37_82_location.split(':') + chromo_37 = 'chr'+location_37[0] + gene_range_37 = location_37[1].split('-') + # on hg19, we have added a chrMT sequence later. + except: + genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19") + + try: + ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location'] + location_38 = ensembl_genes_GRch38_90_location.split(':') + chromo_38 = 'chr'+location_38[0] + # Change mitochondrial chromosomal suffix from MT -> M for hg38 only + if chromo_38 == "chrMT": + chromo_38 = "chrM" + + gene_range_38 = location_38[1].split('-') + except: + genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38") + + if gene_range_37 is None and gene_range_38 is None: + #print("gene without location on any assembly: %s" % res[count]) + genes_no_location.append(res[count]['gene_data']) + count+=1 + continue + + score = '0' + strand = '.' + blockCount = '1' + blockStarts = '0' + + #----------------------------------------------------------------------------------------------------------- + + gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id'] + for attribute in gene_data_list: + try: + temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute] + except: + temp_attribute_dictionary[attribute] = '' + + try: + temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene']) + except: + temp_attribute_dictionary['omim_gene'] = '' + + try: + temp_attribute_dictionary['gene_symbol'] = res[count]['gene_data']['gene_symbol'] + except: + temp_attribute_dictionary['gene_symbol'] = '' + + #----------------------------------------------------------------------------------------------------------- + # Need to split HGNC ID + try: + hgnc = res[count]['gene_data']['hgnc_id'] + temp_attribute_dictionary['hgnc_id'] = hgnc.split(':')[1] + except: + temp_attribute_dictionary['hgnc_id'] = '' + + #----------------------------------------------------------------------------------------------------------- + # Capitalize(title) gene name + + try: + gene_name = res[count]['gene_data']['gene_name'].title() + temp_attribute_dictionary['gene_name'] = gene_name + except: + temp_attribute_dictionary['gene_name'] = '' + #----------------------------------------------------------------------------------------------------------- + # Biotype change protein_coding to Protein Coding + + try: + biotype = res[count]['gene_data']['biotype'] + + if biotype == 'protein_coding': + biotype = 'Protein Coding' + + temp_attribute_dictionary['biotype'] = biotype + if biotype == None: + temp_attribute_dictionary['biotype'] = '' + except: + temp_attribute_dictionary['biotype'] = '' + + #----------------------------------------------------------------------------------------------------------- + + try: + ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id'] + ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id'] + + except: + ensembl_genes_GRch37_82_ensembl_id = '' + + #----------------------------------------------------------------------------------------------------------- + + gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags'] + + for attribute in gene_type_list: + try: + x = res[count][attribute] + if not x: + temp_attribute_dictionary[attribute] = '' + else: + pre = ' '.join(res[count][attribute]) + temp_attribute_dictionary[attribute] = pre.replace('\t', ' ') + except: + temp_attribute_dictionary[attribute] = '' + + #----------------------------------------------------------------------------------------------------------- + # Cannot exceed 255 characters + # Cannot have tabs + try: + x = res[count]['phenotypes'] + y = ' '.join(res[count]['phenotypes']) + + if not x: + temp_attribute_dictionary['phenotypes'] = '' + else: + temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ") + except: + temp_attribute_dictionary['phenotypes'] = '' + #----------------------------------------------------------------------------------------------------------- + # Evidence cannot exceed 255 characters + try: + x = res[count]['evidence'] + y = ' '.join(res[count]['evidence']) + if not x: + temp_attribute_dictionary['evidence'] = '' + else: + temp_attribute_dictionary['evidence'] = y + except: + temp_attribute_dictionary['evidence'] = '' + + #----------------------------------------------------------------------------------------------------------- + tags = ' '.join(res[count]['tags']).title() + try: + if not tags: + temp_attribute_dictionary['tags'] = '' + else: + temp_attribute_dictionary['tags'] = tags + except: + temp_attribute_dictionary['tags'] = '' + + #----------------------------------------------------------------------------------------------------------- + # Mode of Inheritance (fix format) + MOI = ' '.join(res[count]['mode_of_inheritance']).replace(" ", "???").replace(" ", "").replace("???", " ") + try: + if not MOI: + temp_attribute_dictionary['mode_of_inheritance'] = '' + else: + temp_attribute_dictionary['mode_of_inheritance'] = MOI + except: + temp_attribute_dictionary['mode_of_inheritance'] = '' + + #----------------------------------------------------------------------------------------------------------- + # For values with spaces + gene_type_list = ['entity_name', 'penetrance'] + + for attribute in gene_type_list: + try: + temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "") + except: + temp_attribute_dictionary[attribute] = '' + #----------------------------------------------------------------------------------------------------------- + # For values with spaces and need to be capitalized + attribute = 'entity_type' + + try: + temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize() + except: + temp_attribute_dictionary[attribute] = '' + #----------------------------------------------------------------------------------------------------------- + attribute = 'mode_of_pathogenicity' + + try: + mode = ' '.join(res[count][attribute]).replace(" ", "???").replace(" ", "").replace("???", " ") + if mode[0] == 'L' or mode[0] == 'l': + temp_attribute_dictionary[attribute] = 'Loss-of-function variants' + elif mode[0] == 'G' or mode[0] == 'g': + temp_attribute_dictionary[attribute] = 'Gain-of-function' + elif mode[0] == 'O' or mode[0] == 'o': + temp_attribute_dictionary[attribute] = 'Other' + else: + temp_attribute_dictionary[attribute] = mode + except: + temp_attribute_dictionary[attribute] = '' + + #----------------------------------------------------------------------------------------------------------- + + panel_list = ['id','name', 'disease_group', 'disease_sub_group', 'status', 'version_created'] + + for attribute in panel_list: + try: + x = res[count]['panel'][attribute] + if attribute=="id" and onlyPanels is not None: + if int(x) not in onlyPanels: + continue + if not x: + temp_attribute_dictionary[attribute] = '' + else: + temp_attribute_dictionary[attribute] = res[count]['panel'][attribute] + except: + temp_attribute_dictionary[attribute] = '' + + #----------------------------------------------------------------------------------------------------------- + + version_num = 0.0 + try: + version_num = float(res[count]['panel']['version']) + temp_attribute_dictionary['version'] = version_num + except: + temp_attribute_dictionary['version'] = '' + + #----------------------------------------------------------------------------------------------------------- + + try: + x = res[count]['panel']['relevant_disorders'] + y = ' '.join(res[count]['panel']['relevant_disorders']) + if not x: + temp_attribute_dictionary['relevant_disorders'] = '' + else: + temp_attribute_dictionary['relevant_disorders'] = y + except: + temp_attribute_dictionary['relevant_disorders'] = '' + + #----------------------------------------------------------------------------------------------------------- + # minimal effort to clean up the publication field, which is a mess of free form text + pubs = res[count]['publications'] + newPubs = [] + for pub in pubs: + pub = pub.replace("\n", "") + # replace commas with html commas as unfortunately I use commasin the browser to split fields + pub = pub.replace(",", ",") + # translate unicode chars to something the genome browser can display + pub = pub.encode('ascii', 'xmlcharrefreplace').decode("ascii") + if re.match("^[0-9]+$", pub): + #pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub) + pub = "PMID"+pub + + newPubs.append(pub) + + temp_attribute_dictionary['publications'] = ", ".join(newPubs) + + #----------------------------------------------------------------------------------------------------------- + # MouseOverField + try: + mof = 'Gene: ' + temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';' + temp_attribute_dictionary['mouseOverField'] = mof + except: + temp_attribute_dictionary['mouseOverField'] = '' + + #----------------------------------------------------------------------------------------------------------- + # Column 4 + temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')' + #----------------------------------------------------------------------------------------------------------- + + #----------------------------------------------------------------------------------------------------------- + rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} + + # If the confidence level is set to 0, set to 1 + if temp_attribute_dictionary['confidence_level'] == '0': + temp_attribute_dictionary['confidence_level'] = '1' + + rgb = rgb_dict[temp_attribute_dictionary['confidence_level']] + rgb = rgb.strip('"') + + ''' + Replace all tab in value with spaces and removes new lines + ''' + for key, item in temp_attribute_dictionary.items(): + try: + if isinstance(item, int): + pass + elif isinstance(item, float): + pass + else: + temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r") + except: + pass + + # Version Threshold = 0.99 + max_num = float(0.99) + + if version_num > max_num: + if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None: # Removes Repeats + repeat19.append(temp_attribute_dictionary['label']) + blockSizes = int(gene_range_37[1]) - int(gene_range_37[0]) + hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], + score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, + temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], + temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, + temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], + temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], + temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], + temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], + temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], + temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] + + if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None: # Remove Repeats + repeat38.append(temp_attribute_dictionary['label']) + blockSizes = int(gene_range_38[1]) - int(gene_range_38[0]) + hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], + score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, + temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], + temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, + temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], + temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], + temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], + temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], + temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], + temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] + count = count + 1 + continuous_count = continuous_count + 1 + + print('Genes with missing coordinates in one assembly (written to missing_genes.txt):') + print(genes_missing_info) + + print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):') + missSyms = [] + for miss in genes_no_location: + missSyms.append(miss["gene_symbol"]) + print(",".join(missSyms)) + + missOfh = open("missing_genes.txt", "w") + missOfh.write("* Not found in one assembly:\n") + missOfh.write("\n".join(genes_missing_info)) + missOfh.write("* No location at all:\n") + for miss in genes_no_location: + missOfh.write("\t"+str(miss)) + missOfh.write("\n") + missOfh.close() + + return(hg19_dict, hg38_dict) + +def downloadGenes(url, onlyPanels=None): + jsonFh = gzip.open("currentJson/genes.json.gz", "w") + + hg19_dict, hg38_dict = getGenesLocations(jsonFh,url, onlyPanels) + + jsonFh.close() + + pd_19_table = pd.DataFrame.from_dict(hg19_dict) + pd_38_table = pd.DataFrame.from_dict(hg38_dict) + pd_19_table = pd_19_table.T + pd_38_table = pd_38_table.T + pd_19_table.columns = ["chrom", "chromStart", + "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", + "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID", + "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level", + "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", + "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", + "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"] + pd_38_table.columns = ["chrom", "chromStart", + "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", + "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID", + "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level", + "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", + "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", + "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"] + + return pd_19_table, pd_38_table + + +def main(): + " create the 2 x three BED files and convert each to bigBed and update the archive " + + # the script uses relative pathnames, so make sure we're always in the right directory + os.chdir("/hive/data/outside/otto/panelApp") + + # First update panelApp England + # Gene panels track + hg19Bed, hg38Bed = downloadGenes("https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json", onlyPanels=[137, 126]) + writeBb(hg19Bed, hg38Bed, "genes") + # STRs track + hg19Bed, hg38Bed = downloadTandReps("https://panelapp.genomicsengland.co.uk/api/v1/strs/?format=json") + writeBb(hg19Bed, hg38Bed, "tandRep") + # CNV track + hg38Bed = downloadCnvs('https://panelapp.genomicsengland.co.uk/api/v1/regions/?format=json') + # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025 + writeBb(None, hg38Bed, "cnv") + + flipFiles('England') + updateGbdbSymlinks('England') + + # Now update panelApp Australia + # Genes track + hg19Bed, hg38Bed = downloadGenes("https://panelapp-aus.org/api/v1/panels/?format=json") + writeBb(hg19Bed, hg38Bed, "genesAus") + # STRs track + hg19Bed, hg38Bed = downloadTandReps("https://panelapp-aus.org/api/v1/strs/?format=json") + writeBb(hg19Bed, hg38Bed, "tandRepAus") + # CNVs track + hg38Bed = downloadCnvs('https://panelapp-aus.org/api/v1/regions/?format=json') + # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025 + writeBb(None, hg38Bed, "cnvAus") + + flipFiles('Australia') + updateGbdbSymlinks('Australia') + + print("PanelApp otto update: OK") + +main()