# src/hg/utils/otto/panelApp/cnv.py
# Added in commit f94190412d16ea558ace1ec9ea175db39aaa104b
# (max, Fri Sep 30 08:40:43 2022 -0700): panelApp otto job, refs #25568
"""Download PanelApp region (CNV) records and convert them to BED-style rows."""

import json
import pandas as pd
import sys
import argparse
import re

# Paginated endpoint; the 1-based page number is appended to this prefix.
_REGIONS_URL = "https://panelapp.genomicsengland.co.uk/api/v1/regions/?page="

# Regions whose panel version is below this threshold are left out.
# NOTE(review): presumably panels below 1.0 are not yet signed off - confirm.
_MIN_PANEL_VERSION = 0.99

# itemRgb per confidence level: 3 -> green, 2 -> amber, 1 -> red.
_CONFIDENCE_RGB = {'3': '0,255,0', '2': '255,191,0', '1': '255,0,0'}

# Output column labels, in the same order as the rows built by _region_to_row.
# NOTE(review): 'CinGen' looks like a typo for 'ClinGen'; it is left untouched
# because code outside this file may select the column by this exact label.
_BED_COLUMNS = [
    'chrom', 'chromStart', 'End', 'name', 'Score', 'strand', 'thickStart', 'thickEnd',
    'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'Confidence Level',
    'Panel Name', 'Panel ID', 'Entity Name', 'Entity Type', 'Evidence',
    'ClinGen Haploinsufficiency Score', 'Mode of Inheritance', 'Disease Group',
    'Disease Sub Group', 'Relevant Disorders', 'Status', 'Types', 'Version Created',
    'Penetrance', 'Phenotypes', 'Publications', 'CinGen Triplosensitivity Score',
    'Type of Variants', 'Verbose Name', 'Mouse Over Field',
]

# Characters that would break a tab-separated line when left inside a field.
_FIELD_BREAKERS = str.maketrans({'\t': ' ', '\n': ' ', '\r': ' '})


def get_url(url):
    """Fetch every page of a paginated JSON endpoint and merge the results.

    Pages are requested as ``url + "1"``, ``url + "2"``, ... until a page
    cannot be retrieved (non-OK status, request failure, undecodable body or
    a payload containing an ``"error"`` key), or until a payload carries an
    empty ``"next"`` entry.

    Args:
        url: URL prefix to which the page number is appended.

    Returns:
        A dict holding the keys of the first page, with ``"results"`` replaced
        by the concatenation of the ``"results"`` lists of all fetched pages.

    Raises:
        SystemExit: with status 1 when not even the first page is available.
    """
    # Imported here rather than at module level so that the conversion code in
    # this file stays importable on machines without the HTTP client installed.
    import requests

    merged = dict()
    page_count = 1
    while True:
        try:
            response = requests.get("{}{}".format(url, page_count))
            payload = json.loads(response.content.decode()) if response.ok else None
        except (requests.RequestException, ValueError):
            # ValueError covers both JSON and unicode decoding problems.
            payload = None

        if not isinstance(payload, dict) or "error" in payload:
            if page_count == 1:
                print("Unable to request URL")
                # Non-zero status so that a calling job notices the failure.
                sys.exit(1)
            # Running past the last page is the expected way for the loop to end.
            print('Json data retrieved')
            break

        if page_count == 1:
            merged = dict(payload)
            merged['results'] = []
        merged['results'].extend(payload.get('results') or [])

        print(page_count)
        # NOTE(review): the payload appears to carry a 'next' link; when the key
        # is present but empty there is nothing left, which saves one request.
        if "next" in payload and not payload["next"]:
            print('Json data retrieved')
            break
        page_count += 1
    return merged


def _text(value):
    """Return *value*, or '' when the field is empty (None, '', [], 0)."""
    return value if value else ''


def _joined(values):
    """Join a list of strings with single spaces; a missing list gives ''."""
    return ' '.join(values or [])


def _clean(value):
    """Make a string safe for one TSV cell; non-strings pass through unchanged."""
    if not isinstance(value, str):
        return value
    return value.translate(_FIELD_BREAKERS).strip()


def _region_to_row(region):
    """Convert one region record into a list ordered like ``_BED_COLUMNS``.

    Returns None when the record's panel version is below
    ``_MIN_PANEL_VERSION``. Raises KeyError for a confidence level that has
    no colour in ``_CONFIDENCE_RGB``.
    """
    panel = region['panel']

    # Decide about skipping first; an empty version is kept and blanked.
    version = _text(panel['version'])
    if version and float(version) < _MIN_PANEL_VERSION:
        return None

    chromosome = 'chr' + region['chromosome']
    start_coordinates = region['grch38_coordinates'][0]
    end_coordinates = region['grch38_coordinates'][1]

    # A single block spanning the whole region.
    score = '0'
    strand = '.'
    blockCount = '1'
    blockSizes = int(end_coordinates) - int(start_coordinates)
    blockStarts = 0

    confidence_level = region['confidence_level']
    itemRgb = _CONFIDENCE_RGB[confidence_level]

    entity_name = region['entity_name']
    panel_name = _text(panel['name'])
    moi = _text(region['mode_of_inheritance'])
    phenotypes = _joined(region['phenotypes'])

    panel_types = panel['types']
    types = panel_types[0]['name'] if panel_types else ''

    name = '{} ({})'.format(entity_name, panel_name)
    mouseOverField = 'Gene: {}; Panel: {}; MOI: {}; Phenotypes: {}; Confidence: {};'.format(
        entity_name, panel_name, moi, phenotypes, confidence_level)

    return [
        chromosome, start_coordinates, end_coordinates, name, score, strand,
        start_coordinates, end_coordinates, itemRgb, blockCount, blockSizes,
        blockStarts, confidence_level, panel_name, _text(panel['id']),
        entity_name, region['entity_type'], _joined(region['evidence']),
        _text(region['haploinsufficiency_score']), moi,
        _text(panel['disease_group']), _text(panel['disease_sub_group']),
        _joined(panel['relevant_disorders']), _text(panel['status']), types,
        version, _text(region['penetrance']), phenotypes,
        _joined(region['publications']),
        _text(region['triplosensitivity_score']),
        _text(region['type_of_variants']), _text(region['verbose_name']),
        mouseOverField,
    ]


def downloadCnvs(regions=None):
    """Build the table of PanelApp regions on GRCh38 coordinates.

    Args:
        regions: optional list of region records (dicts shaped like the
            entries of the endpoint's ``"results"`` list). When omitted, the
            records are downloaded with ``get_url``.

    Returns:
        A pandas DataFrame with the columns of ``_BED_COLUMNS`` and one row
        per retained region, indexed ``gene_0``, ``gene_1``, ... Tabs and
        line breaks inside string fields are replaced by spaces. An empty
        input yields an empty frame that still has all columns.
    """
    if regions is None:
        regions = get_url(_REGIONS_URL)['results']

    hg38_dict = dict()
    for region in regions:
        row = _region_to_row(region)
        if row is None:
            continue
        # Keys number only the retained rows, so they stay consecutive.
        hg38_dict['gene_{}'.format(len(hg38_dict))] = [_clean(field) for field in row]

    if not hg38_dict:
        return pd.DataFrame(columns=_BED_COLUMNS)

    pd_38_table = pd.DataFrame.from_dict(hg38_dict).T
    pd_38_table.columns = _BED_COLUMNS
    return pd_38_table


#if __name__ == "__main__":
    #main()