da2a89e8c4b23469cb35ec86b066bca2c4504745 lrnassar Wed Aug 7 15:39:34 2024 -0700 Adding a new check to panelApp otto that started failing, before all panelApp genes had an OMIM ID associated with it, so that field of the JSON file was a list with (usually) just one OMIM ID. But now there is a single gene with no associated ID so it returns None instead of a list. No RM. diff --git src/hg/utils/otto/panelApp/tandRep.py src/hg/utils/otto/panelApp/tandRep.py index a38fa9f..b51584f 100644 --- src/hg/utils/otto/panelApp/tandRep.py +++ src/hg/utils/otto/panelApp/tandRep.py @@ -1,223 +1,226 @@ import requests import json import pandas as pd import sys import argparse import re def get_url(url): page_count = 1 jData = dict() while True: try: new_url = "{}{}".format(url, page_count) myResponse = requests.get(new_url) if (myResponse.ok): jData = json.loads(myResponse.content.decode()) # If jData is empty create, else append if "error" in jData.keys(): raise Exception("{} page count is missing.".format(page_count)) else: print('Json data retrieved') break except: if page_count > 1: print('Json data retrieved') else: print("Unable to request URL") sys.exit() break print(page_count) page_count += 1 return jData def downloadTandReps(): Error = True continuous_count=0 url = "https://panelapp.genomicsengland.co.uk/api/v1/strs/?page=" jData = get_url(url) res = jData['results'] num_gene_data = len(res) count = 0 continuous_count = 0 hg19_dict = dict() hg38_dict = dict() while count != num_gene_data: #if count == 10: # break string_dict_key = 'gene_{}'.format(continuous_count) temp_attribute_dictionary = dict() chromosome = res[count]['chromosome'] chromosome = 'chr{}'.format(chromosome) confidence_level = res[count]['confidence_level'] entity_name = res[count]['entity_name'] entity_type = res[count]['entity_type'] evidence = ' '.join(res[count]['evidence']) gene_data = res[count]['gene_data'] alias = ' '.join(gene_data['alias']) biotype = gene_data['biotype'] ensembl_id_37 = gene_data['ensembl_genes']['GRch37']['82']['ensembl_id'] #ensembl_location_37 = gene_data['ensembl_genes']['GRch37']['82']['location'] ensembl_id_38 = gene_data['ensembl_genes']['GRch38']['90']['ensembl_id'] #ensembl_location_38 = gene_data['ensembl_genes']['GRch38']['90']['location'] gene_name = gene_data['gene_name'] #gene_symbol = gene_data['gene_symbol'] #hgnc_date_symbol_changed = gene_data['hgnc_date_symbol_changed'] hgnc_id = gene_data['hgnc_id'] hgnc_symbol = gene_data['hgnc_symbol'] + if str(gene_data['omim_gene']) == "None": + omim_gene = "None" + else: omim_gene = ' '.join(gene_data['omim_gene']) grch37_coordinates = res[count]['grch37_coordinates'] if grch37_coordinates == None: coordinates = gene_data['ensembl_genes']['GRch37']['82']['location'] location = coordinates.split(':') grch37_coordinates = location[1].split('-') chromStart_19 = int(grch37_coordinates[0]) chromEnd_19 = int(grch37_coordinates[1]) # hg38 grch38_coordinates = res[count]['grch38_coordinates'] if grch38_coordinates == None: coordinates = gene_data['ensembl_genes']['GRch38']['90']['location'] location = coordinates.split(':') grch38_coordinates = location[1].split('-') mode_of_inheritance = res[count]['mode_of_inheritance'] normal_repeats = res[count]['normal_repeats'] chromStart_38 = int(grch38_coordinates[0]) chromEnd_38 = int(grch38_coordinates[1]) panel = res[count]['panel'] disease_group = panel['disease_group'] disease_sub_group = panel['disease_sub_group'] hash_id = panel['hash_id'] idd = panel['id'] panel_name = panel['name'] relevant_disorders = ' '.join(panel['relevant_disorders']) #relevant_disorders = relevant_disorders[:240] stats = panel['stats'] number_of_gene = stats['number_of_genes'] number_of_regions = stats['number_of_regions'] number_of_strs = stats['number_of_strs'] status = panel['status'] #description = panel['types'][0]['description'][:240] description = panel['types'][0]['description'] version = panel['version'] version_created = panel['version_created'] pathogenic_repeats = res[count]['pathogenic_repeats'] penetrance = res[count]['penetrance'] phenotypes = ' '.join(res[count]['phenotypes']) phenotypes_no_num = ''.join([i for i in phenotypes if not i.isdigit()]) publications = ' '.join(res[count]['publications']) repeated_sequence = res[count]['repeated_sequence'] tags = ' '.join(res[count]['tags']) # Check to see if panel_name is not empty if panel_name: try: panel_name = panel_name.split(' - ') panel_name = panel_name[0] name = '{} ({})'.format(hgnc_symbol, panel_name) except: name = '{} ({})'.format(hgnc_symbol, panel_name) else: name = hgnc_symbol score = 0 strand = '.' thickStart_19 = chromStart_19 thickEnd_19 = chromEnd_19 thickStart_38 = chromStart_38 thickEnd_38 = chromEnd_38 rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} # If the confidence level is set to 0, set to 1 if confidence_level == '0': confidence_level = '1' rgb = rgb_dict[confidence_level] rgb = rgb.strip('"') blockCount = 1 # Cases where coordinates are reads as string data types instead of ints try: blockSizes_19 = chromEnd_19 - chromStart_19 except: blockSizes_19 = int(chromEnd_19) - int(chromStart_19) try: blockSizes_38 = chromEnd_38 - chromStart_38 except: blockSizes_39 = int(chromEnd_38) - int(chromStart_38) blockStarts = 0 geneSymbol = hgnc_symbol #------------------------------------------------------------------------------- temp19_list = [chromosome, chromStart_19, chromEnd_19, name, score, strand, thickStart_19, thickEnd_19, rgb, blockCount, blockSizes_19, chromStart_19, geneSymbol, confidence_level, entity_type, evidence, alias, ensembl_id_37, gene_name, hgnc_id, hgnc_symbol, omim_gene, mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, version, version_created, pathogenic_repeats, penetrance, phenotypes, publications, repeated_sequence] try: temp19_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp19_list] except: hg19_dict[string_dict_key] = temp19_list #------------------------------------------------------------------------------- temp38_list = [chromosome, chromStart_38, chromEnd_38, name, score, strand, thickStart_38, thickEnd_38, rgb, blockCount, blockSizes_38, chromStart_38, geneSymbol, confidence_level, entity_type, evidence, alias, ensembl_id_38, gene_name, hgnc_id, hgnc_symbol, omim_gene, mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, version, version_created, pathogenic_repeats, penetrance, phenotypes, publications, repeated_sequence] try: temp38_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp38_list] except: hg38_dict[string_dict_key] = temp38_list #------------------------------------------------------------------------------- continuous_count = continuous_count + 1 count = count + 1 pd_19_table = pd.DataFrame.from_dict(hg19_dict) pd_19_table = pd_19_table.T pd_19_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', 'phenotypes', 'publications', 'repeated_sequence'] #pd_19_table = pd_19_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) #pd_19_table.to_csv('str_hg19.bed', sep='\t', index=False, header=None) pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_38_table = pd_38_table.T pd_38_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', 'phenotypes', 'publications', 'repeated_sequence'] #pd_38_table = pd_38_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) #pd_38_table.to_csv('hg38_str_noheader_sorted.tsv', sep='\t', index=False, header=None) return pd_19_table, pd_38_table #if __name__ == "__main__": #main()