# Provenance: kent source tree commit d6b6234e8be94844c6bea9f7d2ca73226dcd0c70
#   Author:  max, Tue Sep 3 11:31:25 2024 -0700
#   Message: small otto fix for panelapp, no redmine, monthly otto email
#   File:    src/hg/utils/otto/panelApp/genes.py (index 003ab9e..b99245d, mode 100755)
# The code below is that file with the patch applied: the '+' lines of the
# diff are kept and the '-' lines are dropped.
'''
download panelApp data via its API (somewhat slow)
'''

import os
import requests
import time
import json
import pandas as pd
import sys
import argparse
import re
import gzip
import logging

# originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py
# Written by a project student, Beagan, in 2020/2021, fixed up by Max

# set to True for debugging
onlyOne = False

# A gene URL is requested at most _MAX_RETRIES + 1 times, waiting
# _RETRY_WAIT_SECS after every failed attempt.
_MAX_RETRIES = 10
_RETRY_WAIT_SECS = 60

# itemRgb color for each confidence level
_RGB_BY_CONFIDENCE = {'3': '0,255,0', '2': '255,191,0', '1': '255,0,0'}

# Only entries whose panel version is above this threshold are written out.
_MIN_PANEL_VERSION = 0.99

# HTML character reference for a comma. Built from the code point so that the
# literal survives tools that decode HTML entities in source text.
_HTML_COMMA = "&#%d;" % ord(",")

# Column names of the tables returned by downloadGenes(), in row order.
# NOTE: "Penetranace" is misspelt but kept byte-for-byte, as code outside this
# file may refer to the column by this name.
_BED_COLUMNS = (
    "chrom", "chromStart", "chromEnd", "name", "score", "strand",
    "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes",
    "blockStarts", "Gene Symbol", "Biotype", "HGNC ID", "Gene Name",
    "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name",
    "Confidence Level", "Penetranace", "Mode of Pathogenicity",
    "Publications", "Evidence", "Phenotypes", "Mode of Inheritance", "Tags",
    "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", "Status",
    "Panel Version", "Version Created", "Relevant Disorders",
    "MouseOverField",
)


def getPanelIds():
    """Return the IDs of all panels from the paginated PanelApp panel listing.

    Side effect: configures logging at INFO level and stops urllib3 log
    records from propagating. Pages are followed through the "next" field
    until it is None; with the module-level ``onlyOne`` debugging switch set,
    only the first page is read.
    """
    #logging.basicConfig(level=logging.DEBUG)
    logging.basicConfig(level=logging.INFO)
    logging.getLogger("urllib3").propagate = False

    logging.info("Downloading panel IDs")
    panelIds = []

    url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json"
    while url is not None:
        logging.debug("Getting %s", url)
        data = json.loads(requests.get(url).content.decode())
        panelIds.extend(res["id"] for res in data["results"])
        logging.debug("Total Panel IDs downloaded: %s", len(panelIds))

        url = data["next"]
        if onlyOne:
            break

    return panelIds


def downloadPanels():
    """Download every panel and return a dict panel ID -> decoded panel JSON.

    With ``onlyOne`` set, only the first panel is downloaded.
    """
    panelInfos = {}
    for panelId in getPanelIds():
        url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId
        logging.debug("Getting %s", url)
        panelInfos[panelId] = requests.get(url).json()
        if onlyOne:
            break

    return panelInfos


def getGeneSymbols():
    """Return the list of distinct gene symbols found in all panels.

    If decoding a panel response fails, the whole panel download is retried
    once after 30 seconds. The symbols are returned sorted, so that repeated
    runs process the genes in the same order.
    """
    try:
        panelInfos = downloadPanels()
    except requests.exceptions.JSONDecodeError:
        time.sleep(30)
        panelInfos = downloadPanels()

    syms = set()
    for panelInfo in panelInfos.values():
        for gene in panelInfo["genes"]:
            sym = gene["gene_data"]["gene_symbol"]
            assert(sym != "")
            syms.add(sym)

    logging.info("Got %d gene symbols", len(syms))
    # key=str keeps the sort from failing should a symbol not be a string
    return sorted(syms, key=str)


def _getWithRetry(url):
    """Return the response for url, retrying after connection errors and non-OK replies.

    Raises RuntimeError when all _MAX_RETRIES + 1 attempts failed.
    """
    for trial in range(_MAX_RETRIES + 1):
        try:
            response = requests.get(url)
        except requests.exceptions.RequestException:
            logging.error("HTTP error on %s, retrying after 1 minute (trial %d)", url, trial)
        else:
            if response.ok:
                return response
            logging.error("Some error on %s, retrying after 1 minute (trial %d)", url, trial)
        time.sleep(_RETRY_WAIT_SECS)

    raise RuntimeError("cannot get URL %s after %d attempts" % (url, _MAX_RETRIES + 1))


def _parseLocation(geneData, assembly, release):
    """Return (chrom, [start, end]) for one assembly, or (None, None) if unavailable.

    The location string has the form "<chrom>:<start>-<end>"; start and end
    are returned as strings, chrom gets a "chr" prefix.
    """
    try:
        location = geneData['ensembl_genes'][assembly][release]['location']
        parts = location.split(':')
        chrom = 'chr' + parts[0]
        geneRange = parts[1].split('-')
        # both coordinates are converted with int() when the row is built,
        # so treat a location that cannot be converted as missing
        int(geneRange[0])
        int(geneRange[1])
    except (KeyError, IndexError, TypeError, AttributeError, ValueError):
        return None, None
    return chrom, geneRange


def _ensemblId(geneData, assembly, release):
    """Return the Ensembl gene ID stored for one assembly, or '' if there is none."""
    try:
        return geneData['ensembl_genes'][assembly][release]['ensembl_id']
    except (KeyError, TypeError):
        return ''


def _joinText(value):
    """Join value with single spaces; return '' for empty or non-joinable values.

    A list of strings becomes one space-separated string. A plain string is
    joined character by character ("ab" -> "a b"), exactly like ' '.join().
    """
    if not value:
        return ''
    try:
        return ' '.join(value)
    except TypeError:
        return ''


def _joinKeepingSpaces(value):
    """Like _joinText(), but undo the character spacing of a plain string.

    ' '.join() turns every original space of a string into three spaces.
    Protecting each pair of spaces with a marker, deleting the remaining
    single spaces and restoring the marker gives the original string back.
    NOTE(review): the first pattern must be TWO spaces; with one space the
    chain changes nothing and the value stays letter-spaced.
    """
    joined = _joinText(value)
    return joined.replace(2 * " ", "???").replace(" ", "").replace("???", " ")


def _cleanPublications(pubs):
    """Return the publication entries as one ", "-separated string.

    minimal effort to clean up the publication field, which is a mess of free
    form text
    """
    newPubs = []
    for pub in pubs or []:
        pub = pub.replace("\n", "")
        # replace commas with html commas as unfortunately I use commas in the
        # browser to split fields
        pub = pub.replace(",", _HTML_COMMA)
        # translate unicode chars to something the genome browser can display
        pub = pub.encode('ascii', 'xmlcharrefreplace').decode("ascii")
        if re.match("^[0-9]+$", pub):
            #pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub)
            pub = "PMID" + pub
        newPubs.append(pub)
    return ", ".join(newPubs)


def _collectAttributes(entry):
    """Return the cleaned-up display attributes of one gene/panel entry as a dict.

    entry is one element of the "results" list of a gene response. Missing or
    malformed optional fields become ''. The dict also holds 'label' (name
    column), 'itemRgb' and 'mouseOverField'; 'version' is a float, or '' when
    the panel version cannot be converted.

    Raises KeyError if the entry has no 'gene_data' or if its confidence level
    has no color in _RGB_BY_CONFIDENCE.
    """
    geneData = entry['gene_data']
    panel = entry.get('panel')
    if not isinstance(panel, dict):
        panel = {}

    attrs = {}

    # --- gene_data fields
    attrs['gene_symbol'] = geneData.get('gene_symbol', '')
    attrs['omim_gene'] = _joinText(geneData.get('omim_gene'))

    # Need to split HGNC ID: keep only the part after the colon
    try:
        attrs['hgnc_id'] = geneData['hgnc_id'].split(':')[1]
    except (KeyError, AttributeError, IndexError):
        attrs['hgnc_id'] = ''

    # Capitalize(title) gene name
    try:
        attrs['gene_name'] = geneData['gene_name'].title()
    except (KeyError, AttributeError):
        attrs['gene_name'] = ''

    # Biotype change protein_coding to Protein Coding
    biotype = geneData.get('biotype')
    if biotype == 'protein_coding':
        biotype = 'Protein Coding'
    attrs['biotype'] = '' if biotype is None else biotype

    # --- fields of the entry itself
    attrs['confidence_level'] = _joinText(entry.get('confidence_level')).replace('\t', ' ')
    # Cannot exceed 255 characters, cannot have tabs
    attrs['phenotypes'] = _joinText(entry.get('phenotypes')).replace('\t', ' ')
    # Evidence cannot exceed 255 characters
    attrs['evidence'] = _joinText(entry.get('evidence'))
    attrs['tags'] = _joinText(entry.get('tags')).title()
    # Mode of Inheritance (fix format)
    attrs['mode_of_inheritance'] = _joinKeepingSpaces(entry.get('mode_of_inheritance'))

    # For values with spaces: all spaces are removed
    for attribute in ('entity_name', 'penetrance'):
        attrs[attribute] = _joinText(entry.get(attribute)).replace(" ", "")
    # For values with spaces and need to be capitalized
    attrs['entity_type'] = _joinText(entry.get('entity_type')).replace(" ", "").capitalize()

    # Mode of pathogenicity is reduced to a fixed label chosen by its first letter
    mode = _joinKeepingSpaces(entry.get('mode_of_pathogenicity'))
    firstChar = mode[:1]
    if firstChar in ('L', 'l'):
        mode = 'Loss-of-function variants'
    elif firstChar in ('G', 'g'):
        mode = 'Gain-of-function'
    elif firstChar in ('O', 'o'):
        mode = 'Other'
    attrs['mode_of_pathogenicity'] = mode

    attrs['publications'] = _cleanPublications(entry.get('publications'))

    # --- panel fields
    for attribute in ('id', 'name', 'disease_group', 'disease_sub_group',
                      'status', 'version_created'):
        attrs[attribute] = panel.get(attribute) or ''

    try:
        attrs['version'] = float(panel['version'])
    except (KeyError, TypeError, ValueError):
        attrs['version'] = ''

    attrs['relevant_disorders'] = _joinText(panel.get('relevant_disorders'))

    # --- derived fields
    # MouseOverField; '' if one of the parts is not a string
    try:
        attrs['mouseOverField'] = (
            'Gene: ' + attrs['gene_symbol'] + ';'
            + ' Panel: ' + attrs['name'] + ';'
            + ' MOI: ' + attrs['mode_of_inheritance'] + ';'
            + ' Phenotypes: ' + attrs['phenotypes'] + ';'
            + ' Confidence: ' + attrs['confidence_level'] + ';')
    except TypeError:
        attrs['mouseOverField'] = ''

    # Column 4
    attrs['label'] = attrs['gene_symbol'] + ' (' + attrs['name'] + ')'

    # If the confidence level is set to 0, set to 1
    if attrs['confidence_level'] == '0':
        attrs['confidence_level'] = '1'
    attrs['itemRgb'] = _RGB_BY_CONFIDENCE[attrs['confidence_level']]

    # Replace all tabs in string values with spaces and strip surrounding
    # whitespace, including new lines. Numbers are left alone.
    for key, value in attrs.items():
        if isinstance(value, str):
            attrs[key] = value.replace('\t', ' ').strip()

    return attrs


def _makeBedRow(chrom, geneRange, ensemblId, attrs):
    """Return one output row; the values are in the order of _BED_COLUMNS."""
    start, end = geneRange[0], geneRange[1]
    score = '0'
    strand = '.'
    blockCount = '1'
    blockSizes = int(end) - int(start)
    blockStarts = '0'
    return [chrom, int(start), end, attrs['label'], score, strand, start, end,
            attrs['itemRgb'], blockCount, blockSizes, blockStarts,
            attrs['gene_symbol'], attrs['biotype'], attrs['hgnc_id'],
            attrs['gene_name'], attrs['omim_gene'], ensemblId,
            attrs['entity_type'], attrs['entity_name'],
            attrs['confidence_level'], attrs['penetrance'],
            attrs['mode_of_pathogenicity'], attrs['publications'],
            attrs['evidence'], attrs['phenotypes'],
            attrs['mode_of_inheritance'], attrs['tags'], attrs['id'],
            attrs['name'], attrs['disease_group'],
            attrs['disease_sub_group'], attrs['status'], attrs['version'],
            attrs['version_created'], attrs['relevant_disorders'],
            attrs['mouseOverField']]


def _reportMissing(genes_missing_info, genes_no_location):
    """Print the genes lacking coordinates and write them to missing_genes.txt."""
    print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
    print(genes_missing_info)
    print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):')
    print(",".join([miss["gene_symbol"] for miss in genes_no_location]))

    with open("missing_genes.txt", "w") as missOfh:
        missOfh.write("* Not found in one assembly:\n")
        # one entry per line, so that the next header starts on its own line
        for missing in genes_missing_info:
            missOfh.write(missing + "\n")
        missOfh.write("* No location at all:\n")
        for miss in genes_no_location:
            missOfh.write("\t" + str(miss))
            missOfh.write("\n")


def getGenesLocations(jsonFh):
    """Download all genes and return (hg19_dict, hg38_dict).

    Both dicts map a key 'gene_<n>' to one row, a list of values in the order
    of _BED_COLUMNS. Per assembly, an entry is written only if it has
    coordinates there, its panel version is above _MIN_PANEL_VERSION and its
    label (gene symbol plus panel name) was not written before.

    jsonFh: binary file handle; the raw response of every gene is written to
        it, followed by a newline.

    Side effects: prints the genes without coordinates and writes them to
    missing_genes.txt in the current directory.

    Raises RuntimeError if a gene URL cannot be retrieved.
    """
    hg19_dict = dict()
    hg38_dict = dict()
    seenLabels19 = set()
    seenLabels38 = set()
    continuous_count = 0
    genes_missing_info = list()
    genes_no_location = list()

    for sym in getGeneSymbols():
        url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/{}?format=json".format(sym)
        jsonData = _getWithRetry(url).content

        jData = json.loads(jsonData.decode())
        jsonFh.write(jsonData)
        jsonFh.write("\n".encode())

        for entry in jData['results']:
            geneData = entry['gene_data']

            # on hg19, we have added a chrMT sequence later.
            chromo_37, gene_range_37 = _parseLocation(geneData, 'GRch37', '82')
            if gene_range_37 is None:
                genes_missing_info.append(geneData['gene_symbol'] + "/hg19")

            chromo_38, gene_range_38 = _parseLocation(geneData, 'GRch38', '90')
            if gene_range_38 is None:
                genes_missing_info.append(geneData['gene_symbol'] + "/hg38")
            elif chromo_38 == "chrMT":
                # Change mitochondrial chromosomal suffix from MT -> M for hg38 only
                chromo_38 = "chrM"

            if gene_range_37 is None and gene_range_38 is None:
                genes_no_location.append(geneData)
                continue

            # Entries without any location do not use up a key number.
            string_dict_key = 'gene_{}'.format(continuous_count)
            continuous_count += 1

            attrs = _collectAttributes(entry)

            # Rows of both assemblies carry the GRCh38 Ensembl ID; the GRCh37
            # ID is used only when the gene has no GRCh38 ID.
            ensemblId = (_ensemblId(geneData, 'GRch38', '90')
                         or _ensemblId(geneData, 'GRch37', '82'))

            # an unparsable panel version ('') never passes the threshold
            version = attrs['version']
            if not (isinstance(version, float) and version > _MIN_PANEL_VERSION):
                continue

            label = attrs['label']
            if gene_range_37 is not None and label not in seenLabels19:  # Removes Repeats
                seenLabels19.add(label)
                hg19_dict[string_dict_key] = _makeBedRow(chromo_37, gene_range_37, ensemblId, attrs)

            if gene_range_38 is not None and label not in seenLabels38:  # Remove Repeats
                seenLabels38.add(label)
                hg38_dict[string_dict_key] = _makeBedRow(chromo_38, gene_range_38, ensemblId, attrs)

    _reportMissing(genes_missing_info, genes_no_location)

    return (hg19_dict, hg38_dict)


def _toBedTable(rowsByKey):
    """Return a DataFrame with one row per dict value and _BED_COLUMNS as columns."""
    if not rowsByKey:
        # nothing to transpose: return an empty table that still has the columns
        return pd.DataFrame(columns=list(_BED_COLUMNS))
    table = pd.DataFrame.from_dict(rowsByKey).T
    table.columns = list(_BED_COLUMNS)
    return table


def downloadGenes():
    """Download all genes and return the (hg19, hg38) tables as DataFrames.

    The raw gene responses are written to currentJson/genes.json.gz; the file
    is closed even if the download fails.
    """
    with gzip.open("currentJson/genes.json.gz", "w") as jsonFh:
        hg19_dict, hg38_dict = getGenesLocations(jsonFh)

    pd_19_table = _toBedTable(hg19_dict)
    pd_38_table = _toBedTable(hg38_dict)
    return pd_19_table, pd_38_table

    #pd_19_table.to_csv('hg19_header.tsv', sep='\t', index=False)
    #pd_38_table.to_csv('hg38_header.tsv', sep='\t', index=False)
    #pd_19_table.to_csv('hg19_noheadertem.tsv', sep='\t', index=False, header=None)
    #pd_38_table.to_csv('hg38_noheader.tsv', sep='\t', index=False, header=None)
    #/usr/local/apache/htdocs-hgdownload/goldenPath/archive/hg38/panelApp/

#if __name__ == "__main__":
    #main()