# Source: src/hg/utils/otto/panelApp/tandRep.py, added in commit
# f94190412d16ea558ace1ec9ea175db39aaa104b (max, Fri Sep 30 08:40:43 2022 -0700):
# "panelApp otto job, refs #25568"
"""Download PanelApp short tandem repeat (STR) entries as BED-style tables.

``downloadTandReps()`` fetches every page of the PanelApp ``strs`` API and
returns two pandas tables with identical columns, one using the GRCh37 (hg19)
coordinates of each entry and one using the GRCh38 (hg38) coordinates.
"""
import argparse
import json
import re
import sys

import pandas as pd
import requests

# Paginated endpoint; the 1-based page number is appended to this prefix.
_STR_API_URL = "https://panelapp.genomicsengland.co.uk/api/v1/strs/?page="

# Item colour ("r,g,b") for each confidence level.
_CONFIDENCE_RGB = {'3': '0,255,0', '2': '255,191,0', '1': '255,0,0'}

# (key of the explicit coordinates, assembly key, Ensembl release key) for the
# hg19 and hg38 tables, in the order downloadTandReps() returns them.
_ASSEMBLIES = (
    ('grch37_coordinates', 'GRch37', '82'),
    ('grch38_coordinates', 'GRch38', '90'),
)

# Column labels shared by both tables.
# NOTE(review): 'geneSymbol' appears twice (the second one labels hgnc_symbol)
# and 'ensembl_id_37' also labels the GRCh38 id in the hg38 table. The labels
# are kept exactly as they were because callers may depend on them - confirm
# before renaming.
_COLUMNS = [
    'chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
    'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts',
    'geneSymbol', 'confidence_level', 'entity_type', 'evidence', 'alias',
    'ensembl_id_37', 'gene_name', 'hgnc_id', 'geneSymbol', 'omim_gene',
    'mode_of_inheritance', 'normal_repeats', 'disease_group',
    'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders',
    'number_of_gene', 'number_of_regions', 'number_of_strs', 'description',
    'version', 'version_created', 'pathogenic_repeats', 'penetrance',
    'phenotypes', 'publications', 'repeated_sequence',
]


def get_url(url):
    """Fetch all pages of a paginated JSON endpoint and merge their results.

    Pages are requested as ``url + "1"``, ``url + "2"``, ... until a page
    cannot be retrieved (request error, non-OK status, a body that is not a
    JSON object, or a payload containing an ``"error"`` key) or a page carries
    no ``results``.

    Args:
        url: URL prefix to which the page number is appended.

    Returns:
        The decoded first page, with the ``results`` lists of all following
        pages appended to its ``results`` list. Every other key keeps the
        value it had on the first page.

    Exits the process with status 1 (after printing a message) when even the
    first page cannot be retrieved.
    """
    page_count = 1
    jData = dict()
    while True:
        new_url = "{}{}".format(url, page_count)
        page = None
        try:
            myResponse = requests.get(new_url)
            if myResponse.ok:
                page = json.loads(myResponse.content.decode())
        except (requests.RequestException, ValueError):
            # Network failure or undecodable / non-JSON body: treated like a
            # missing page below.
            page = None

        if not isinstance(page, dict) or "error" in page:
            if page_count == 1:
                # Nothing was retrieved at all; report failure to the caller's
                # shell instead of exiting with a success status.
                print("Unable to request URL")
                sys.exit(1)
            # A missing page after the first one marks the end of the data.
            break

        if page_count == 1:
            jData = page
        else:
            # Accumulate instead of overwriting, so earlier pages are kept.
            jData.setdefault('results', []).extend(page.get('results') or [])

        if not page.get('results'):
            # An empty page cannot be followed by more data; stop here rather
            # than requesting pages forever.
            break

        print(page_count)
        page_count += 1

    print('Json data retrieved')
    return jData


def _clean_field(value):
    """Return *value* with tabs replaced and surrounding whitespace removed.

    Only strings are touched; numbers and ``None`` are returned unchanged.
    """
    if isinstance(value, str):
        return value.replace('\t', ' ').strip()
    return value


def _region(record, coordinates_key, assembly, release):
    """Return ``(start, end, ensembl_id)`` of *record* for one assembly.

    The explicit coordinates of the entry are used when present; otherwise
    they are parsed from the Ensembl gene location, which is split as
    ``<chrom>:<start>-<end>``.
    """
    ensembl = record['gene_data']['ensembl_genes'][assembly][release]
    coordinates = record[coordinates_key]
    if coordinates is None:
        coordinates = ensembl['location'].split(':')[1].split('-')
    return int(coordinates[0]), int(coordinates[1]), ensembl['ensembl_id']


def _record_to_rows(record):
    """Convert one API entry into its ``(hg19_row, hg38_row)`` pair.

    Both rows follow the order of ``_COLUMNS`` and differ only in the
    coordinates and the Ensembl gene id.
    """
    gene_data = record['gene_data']
    panel = record['panel']
    stats = panel['stats']
    hgnc_symbol = gene_data['hgnc_symbol']
    chromosome = 'chr{}'.format(record['chromosome'])

    # Level '0' has no colour of its own and is reported as level '1'.
    confidence_level = record['confidence_level']
    if confidence_level == '0':
        confidence_level = '1'
    rgb = _CONFIDENCE_RGB[confidence_level]

    # Label the item "<symbol> (<panel>)", using only the part of the panel
    # name in front of the first ' - '.
    panel_name = panel['name']
    if panel_name:
        panel_name = panel_name.split(' - ')[0]
        name = '{} ({})'.format(hgnc_symbol, panel_name)
    else:
        name = hgnc_symbol

    before_ensembl_id = [
        hgnc_symbol,
        confidence_level,
        record['entity_type'],
        ' '.join(record['evidence']),
        ' '.join(gene_data['alias']),
    ]
    after_ensembl_id = [
        gene_data['gene_name'],
        gene_data['hgnc_id'],
        hgnc_symbol,
        ' '.join(gene_data['omim_gene']),
        record['mode_of_inheritance'],
        record['normal_repeats'],
        panel['disease_group'],
        panel['disease_sub_group'],
        panel['id'],
        panel_name,
        ' '.join(panel['relevant_disorders']),
        stats['number_of_genes'],
        stats['number_of_regions'],
        stats['number_of_strs'],
        panel['types'][0]['description'],
        panel['version'],
        panel['version_created'],
        record['pathogenic_repeats'],
        record['penetrance'],
        ' '.join(record['phenotypes']),
        ' '.join(record['publications']),
        record['repeated_sequence'],
    ]

    rows = []
    for coordinates_key, assembly, release in _ASSEMBLIES:
        start, end, ensembl_id = _region(record, coordinates_key, assembly, release)
        row = [
            chromosome, start, end, name,
            0,            # score
            '.',          # strand
            start, end,   # thickStart, thickEnd
            rgb,
            1,            # blockCount
            end - start,  # blockSizes
            # NOTE(review): the 'blockStarts' column receives chromStart, as
            # it always has; the previous code also computed an unused
            # blockStarts = 0. Confirm which value consumers expect.
            start,
        ] + before_ensembl_id + [ensembl_id] + after_ensembl_id
        # Tabs inside a cell would break tab-separated output.
        rows.append([_clean_field(value) for value in row])
    return rows[0], rows[1]


def _rows_to_table(rows_by_key):
    """Build a table with one row per dict entry, indexed by the dict keys.

    ``dtype=object`` keeps every cell as the Python value it was given, so
    integer columns that contain ``None`` are not converted to floats.
    """
    return pd.DataFrame(
        list(rows_by_key.values()),
        index=list(rows_by_key.keys()),
        columns=_COLUMNS,
        dtype=object,
    )


def downloadTandReps():
    """Download the PanelApp STR entries and tabulate them per assembly.

    Returns:
        ``(pd_19_table, pd_38_table)``: two DataFrames with the columns of
        ``_COLUMNS`` and one row per entry, indexed ``gene_0``, ``gene_1``, ...
        in download order. The first uses GRCh37 coordinates and Ensembl ids,
        the second GRCh38. Both are empty (but carry the columns) when the API
        returns no entries.
    """
    jData = get_url(_STR_API_URL)

    hg19_dict = dict()
    hg38_dict = dict()
    for count, record in enumerate(jData['results']):
        string_dict_key = 'gene_{}'.format(count)
        hg19_dict[string_dict_key], hg38_dict[string_dict_key] = _record_to_rows(record)

    return _rows_to_table(hg19_dict), _rows_to_table(hg38_dict)

#if __name__ == "__main__":
    #main()