6a1db42395f283de1421bd45e9d245ba2f896005 max Wed Dec 7 03:38:42 2022 -0800 commit panelApp otto changes, refs #30212 diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py index 2e2d2db..0b94665 100755 --- src/hg/utils/otto/panelApp/genes.py +++ src/hg/utils/otto/panelApp/genes.py @@ -1,57 +1,117 @@ import os import requests import json import pandas as pd import sys import argparse import re import gzip +import logging ''' download panelApp data via its API (somewhat slow) ''' # originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py -# Written by a project student, Beagan, in 2020/2021 +# Written by a project student, Beagan, in 2020/2021, fixed up by Max + +# set to True for debugging +onlyOne = False + +def getPanelIds(): + #logging.basicConfig(level=logging.DEBUG) + logging.basicConfig(level=logging.INFO) + logging.getLogger("urllib3").propagate = False + + logging.info("Downloading panel IDs") + panelIds = [] + + gotError = False + url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json" + while not gotError: + logging.debug("Getting %s" % url) + myResponse = requests.get(url) + + jsonData = myResponse.content + #jsonFh.write(jsonData) + #jsonFh.write("\n".encode()) + data = json.loads(jsonData.decode()) + + for res in data["results"]: + panelIds.append(res["id"]) + + logging.debug("Total Panel IDs downloaded: %s" % len(panelIds)) + url = data["next"] + if url is None: + break + if onlyOne: + break + + return panelIds + +def downloadPanels(): + panelIds = getPanelIds() + panelInfos = {} + + for panelId in panelIds: + url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId + logging.debug("Getting %s" % url) + resp = requests.get(url) + res = resp.json() + panelInfos[panelId] = res + if onlyOne: + break + + return panelInfos + +def getGeneSymbols(): + panelInfos = downloadPanels() + syms = set() + for panelInfo in panelInfos.values(): + for gene in panelInfo["genes"]: + sym = gene["gene_data"]["gene_symbol"] + syms.add(sym) + assert(sym!="") + logging.info("Got %d gene symbols" % len(syms)) + return list(syms) def getGenesLocations(jsonFh): - page_count = 1 - Error = True hg19_dict = dict() hg38_dict = dict() repeat19 = list() repeat38 = list() continuous_count = 0 genes_missing_info = list() genes_no_location = list() - while Error: - url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count) + syms = getGeneSymbols() + + for sym in syms: + url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/{}?format=json".format(sym) myResponse = requests.get(url) - if (myResponse.ok): + if not (myResponse.ok): + assert(False) + jsonData = myResponse.content + #jData = myResponse.json() + jData = json.loads(jsonData.decode()) jsonFh.write(jsonData) jsonFh.write("\n".encode()) - jData = json.loads(jsonData.decode()) - - if "error" in jData.keys(): - raise Exception("{} page count is missing.".format(page_count)) - res = jData['results'] num_gene_variant = len(res) count = 0 while count != num_gene_variant: temp_attribute_dictionary = dict() string_dict_key = 'gene_{}'.format(continuous_count) gene_range_37 = None gene_range_38 = None try: ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location'] location_37 = ensembl_genes_GRch37_82_location.split(':') chromo_37 = 'chr'+location_37[0] gene_range_37 = location_37[1].split('-') @@ -65,32 +125,30 @@ chromo_38 = 'chr'+location_38[0] # Change mitochondrial chromosomal suffix from MT -> M for hg38 only if chromo_38 == "chrMT": chromo_38 = "chrM" gene_range_38 = location_38[1].split('-') except: genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38") if gene_range_37 is None and gene_range_38 is None: #print("gene without location on any assembly: %s" % res[count]) genes_no_location.append(res[count]['gene_data']) count+=1 continue - - score = '0' strand = '.' blockCount = '1' blockStarts = '0' #----------------------------------------------------------------------------------------------------------- gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id'] for attribute in gene_data_list: try: temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute] except: temp_attribute_dictionary[attribute] = '' try: @@ -159,86 +217,80 @@ temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- # Cannot exceed 255 characters # Cannot have tabs try: x = res[count]['phenotypes'] y = ' '.join(res[count]['phenotypes']) if not x: temp_attribute_dictionary['phenotypes'] = '' else: temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ") except: temp_attribute_dictionary['phenotypes'] = '' - #----------------------------------------------------------------------------------------------------------- # Evidence cannot exceed 255 characters try: x = res[count]['evidence'] y = ' '.join(res[count]['evidence']) if not x: temp_attribute_dictionary['evidence'] = '' else: temp_attribute_dictionary['evidence'] = y except: temp_attribute_dictionary['evidence'] = '' #----------------------------------------------------------------------------------------------------------- - tags = ' '.join(res[count]['tags']).title() try: if not tags: temp_attribute_dictionary['tags'] = '' else: temp_attribute_dictionary['tags'] = tags except: temp_attribute_dictionary['tags'] = '' #----------------------------------------------------------------------------------------------------------- # Mode of Inheritance (fix format) - MOI = ' '.join(res[count]['mode_of_inheritance']).replace(" ", "???").replace(" ", "").replace("???", " ") try: if not MOI: temp_attribute_dictionary['mode_of_inheritance'] = '' else: temp_attribute_dictionary['mode_of_inheritance'] = MOI except: temp_attribute_dictionary['mode_of_inheritance'] = '' #----------------------------------------------------------------------------------------------------------- # For values with spaces - gene_type_list = ['entity_name', 'penetrance'] for attribute in gene_type_list: try: temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "") except: temp_attribute_dictionary[attribute] = '' - #----------------------------------------------------------------------------------------------------------- # For values with spaces and need to be capitalized attribute = 'entity_type' try: temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize() except: temp_attribute_dictionary[attribute] = '' - #----------------------------------------------------------------------------------------------------------- attribute = 'mode_of_pathogenicity' try: mode = ' '.join(res[count][attribute]).replace(" ", "???").replace(" ", "").replace("???", " ") if mode[0] == 'L' or mode[0] == 'l': temp_attribute_dictionary[attribute] = 'Loss-of-function variants' elif mode[0] == 'G' or mode[0] == 'g': temp_attribute_dictionary[attribute] = 'Gain-of-function' elif mode[0] == 'O' or mode[0] == 'o': temp_attribute_dictionary[attribute] = 'Other' else: temp_attribute_dictionary[attribute] = mode except: temp_attribute_dictionary[attribute] = '' @@ -355,45 +407,39 @@ repeat38.append(temp_attribute_dictionary['label']) blockSizes = int(gene_range_38[1]) - int(gene_range_38[0]) hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] count = count + 1 continuous_count = continuous_count + 1 - else: - Error = False # End of all pages - - page_count = page_count + 1 - print('Genes with missing coordinates in one assembly (written to missing_genes.txt):') print(genes_missing_info) print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):') missSyms = [] for miss in genes_no_location: missSyms.append(miss["gene_symbol"]) print(",".join(missSyms)) - missOfh = open("missing_genes.txt", "w") missOfh.write("* Not found in one assembly:\n") missOfh.write("\n".join(genes_missing_info)) missOfh.write("* No location at all:\n") for miss in genes_no_location: missOfh.write("\t"+str(miss)) missOfh.write("\n") missOfh.close() return(hg19_dict, hg38_dict) def downloadGenes(): jsonFh = gzip.open("currentJson/genes.json.gz", "w") hg19_dict, hg38_dict = getGenesLocations(jsonFh)