src/hg/utils/otto/panelApp/tandRep.py da2a89e8c4b23469cb35ec86b066bca2c4504745

da2a89e8c4b23469cb35ec86b066bca2c4504745
lrnassar
  Wed Aug 7 15:39:34 2024 -0700
Adding a new check to the panelApp otto job, which had started failing. Previously every panelApp gene had an OMIM ID associated with it, so that field of the JSON file was always a list with (usually) just one OMIM ID. Now there is a single gene with no associated ID, so the field is None instead of a list. No RM.

diff --git src/hg/utils/otto/panelApp/tandRep.py src/hg/utils/otto/panelApp/tandRep.py
index a38fa9f..b51584f 100644
--- src/hg/utils/otto/panelApp/tandRep.py
+++ src/hg/utils/otto/panelApp/tandRep.py
@@ -1,223 +1,226 @@
 import requests
 import json 
 import pandas as pd 
 import sys 
 import argparse
 import re
 
 def get_url(url):
     """Fetch every page of a paginated JSON endpoint and merge the results.
 
     ``url`` must end with the page query prefix (for example ``...?page=``);
     page numbers starting at 1 are appended to it until a page can no longer
     be fetched.
 
     Returns the first page's JSON object with the ``results`` lists of all
     later pages appended to its own ``results`` list.  If the first page
     itself cannot be fetched, "Unable to request URL" is printed and the
     process exits via ``sys.exit()``.
     """
     page_count = 1
     jData = dict()
     while True:
         page = None
         try:
             myResponse = requests.get("{}{}".format(url, page_count))
             if myResponse.ok:
                 candidate = json.loads(myResponse.content.decode())
                 # A payload carrying an "error" key is not data; treat it
                 # like a missing page instead of handing it to the caller.
                 if isinstance(candidate, dict) and "error" not in candidate:
                     page = candidate
         except (requests.exceptions.RequestException, ValueError):
             # Network failure or undecodable/invalid JSON: same handling as
             # a missing page (ValueError covers JSON and Unicode decoding).
             page = None
 
         if page is None:
             if page_count == 1:
                 # Nothing was retrieved at all, so there is no data to return.
                 print("Unable to request URL")
                 sys.exit()
             # Any failure after the first page is taken as "past the last page".
             print('Json data retrieved')
             break
 
         if page_count == 1:
             jData = page
         else:
             # Accumulate instead of overwriting, so earlier pages are kept.
             jData.setdefault('results', []).extend(page.get('results', []))
 
         print(page_count)
         page_count += 1
     return jData
         
 
 def _clean_fields(fields):
     """Return a copy of ``fields`` with string values made safe for one tab-separated row.
 
     Tabs inside strings become spaces and surrounding whitespace is stripped;
     non-string values (ints, None, ...) are passed through untouched.
     """
     return [
         field.replace('\t', ' ').strip() if isinstance(field, str) else field
         for field in fields
     ]
 
 
 def _resolve_coordinates(coordinates, ensembl_record):
     """Return ``(start, end)`` as ints for one genome build.
 
     ``coordinates`` is the entry's own coordinate pair.  When it is None the
     pair is parsed from ``ensembl_record['location']`` by taking the part
     after the first ':' and splitting it on '-'.
     """
     if coordinates is None:
         coordinates = ensembl_record['location'].split(':')[1].split('-')
     return int(coordinates[0]), int(coordinates[1])
 
 
 def downloadTandReps():
     """Download the PanelApp STR entries and build hg19 and hg38 tables.
 
     Calls ``get_url`` on the PanelApp ``strs`` endpoint and turns every item
     of the returned ``results`` list into one row per genome build.
 
     Returns:
         ``(pd_19_table, pd_38_table)``: two pandas DataFrames with identical
         column labels, one row per entry, indexed ``gene_0``, ``gene_1``, ...
 
     Raises:
         KeyError: if the payload or an entry lacks one of the keys read here
             (including a confidence level other than '0'-'3').
 
     NOTE(review): the column labels are kept exactly as they were, including
     the repeated 'geneSymbol' label and the 'ensembl_id_37' label on the hg38
     table, and the 'blockStarts' column is filled with chromStart.  Callers
     may depend on this layout -- confirm before changing any of it.
     """
     url = "https://panelapp.genomicsengland.co.uk/api/v1/strs/?page="
     jData = get_url(url)
     res = jData['results']
 
     columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
         'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol',
         'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name',
         'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group',
         'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions',
         'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance',
         'phenotypes', 'publications', 'repeated_sequence']
 
     # Item colour keyed by confidence level.
     rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1': '255,0,0'}
     score = 0
     strand = '.'
     blockCount = 1
 
     hg19_dict = dict()
     hg38_dict = dict()
 
     for continuous_count, entry in enumerate(res):
         string_dict_key = 'gene_{}'.format(continuous_count)
 
         gene_data = entry['gene_data']
         panel = entry['panel']
         stats = panel['stats']
         ensembl_37 = gene_data['ensembl_genes']['GRch37']['82']
         ensembl_38 = gene_data['ensembl_genes']['GRch38']['90']
 
         chromosome = 'chr{}'.format(entry['chromosome'])
         chromStart_19, chromEnd_19 = _resolve_coordinates(
             entry['grch37_coordinates'], ensembl_37)
         chromStart_38, chromEnd_38 = _resolve_coordinates(
             entry['grch38_coordinates'], ensembl_38)
 
         hgnc_symbol = gene_data['hgnc_symbol']
 
         # omim_gene is normally a list of IDs but is None for a gene that has
         # no OMIM ID; keep the literal "None" placeholder in that case.
         omim_ids = gene_data['omim_gene']
         omim_gene = "None" if omim_ids is None else ' '.join(omim_ids)
 
         # Item name is "<symbol> (<panel name up to the first ' - '>)", or
         # just the symbol when the panel has no name.
         panel_name = panel['name']
         if panel_name:
             try:
                 panel_name = panel_name.split(' - ')[0]
             except AttributeError:
                 # Not a string: use the value unchanged.
                 pass
             name = '{} ({})'.format(hgnc_symbol, panel_name)
         else:
             name = hgnc_symbol
 
         # If the confidence level is set to 0, set to 1
         confidence_level = entry['confidence_level']
         if confidence_level == '0':
             confidence_level = '1'
         rgb = rgb_dict[confidence_level]
 
         # Fields shared by both builds, split around the Ensembl ID column.
         before_ensembl_id = [
             confidence_level,
             entry['entity_type'],
             ' '.join(entry['evidence']),
             ' '.join(gene_data['alias']),
         ]
         after_ensembl_id = [
             gene_data['gene_name'],
             gene_data['hgnc_id'],
             hgnc_symbol,
             omim_gene,
             entry['mode_of_inheritance'],
             entry['normal_repeats'],
             panel['disease_group'],
             panel['disease_sub_group'],
             panel['id'],
             panel_name,
             ' '.join(panel['relevant_disorders']),
             stats['number_of_genes'],
             stats['number_of_regions'],
             stats['number_of_strs'],
             panel['types'][0]['description'],
             panel['version'],
             panel['version_created'],
             entry['pathogenic_repeats'],
             entry['penetrance'],
             ' '.join(entry['phenotypes']),
             ' '.join(entry['publications']),
             entry['repeated_sequence'],
         ]
 
         per_build = (
             (hg19_dict, chromStart_19, chromEnd_19, ensembl_37['ensembl_id']),
             (hg38_dict, chromStart_38, chromEnd_38, ensembl_38['ensembl_id']),
         )
         for table, chromStart, chromEnd, ensembl_id in per_build:
             row = [chromosome, chromStart, chromEnd, name, score, strand,
                 chromStart, chromEnd, rgb, blockCount, chromEnd - chromStart,
                 chromStart, hgnc_symbol]
             row += before_ensembl_id + [ensembl_id] + after_ensembl_id
             # Always store the row, with its string fields sanitised.
             table[string_dict_key] = _clean_fields(row)
 
     # Each dict maps 'gene_N' -> row values, so the frame is built
     # column-wise and then transposed to get one row per entry.
     pd_19_table = pd.DataFrame.from_dict(hg19_dict)
     pd_19_table = pd_19_table.T
     pd_19_table.columns = list(columns)
 
     pd_38_table = pd.DataFrame.from_dict(hg38_dict)
     pd_38_table = pd_38_table.T
     pd_38_table.columns = list(columns)
 
     return pd_19_table, pd_38_table
 
 #if __name__ == "__main__":
     #main()