src/hg/utils/otto/panelApp/genes.py 6a1db42395f283de1421bd45e9d245ba2f896005

6a1db42395f283de1421bd45e9d245ba2f896005
max
  Wed Dec 7 03:38:42 2022 -0800
commit panelApp otto changes, refs #30212

diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py
index 2e2d2db..0b94665 100755
--- src/hg/utils/otto/panelApp/genes.py
+++ src/hg/utils/otto/panelApp/genes.py
@@ -1,57 +1,117 @@
 import os
 import requests
 import json 
 import pandas as pd 
 import sys 
 import argparse
 import re
 import gzip
+import logging
 
 '''
 download panelApp data via its API (somewhat slow)
 '''
 
 # originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py
-# Written by a project student, Beagan, in 2020/2021
+# Written by a project student, Beagan, in 2020/2021, fixed up by Max
+
+# set to True for debugging: fetch only the first page of panel IDs and the first panel
+onlyOne = False
+
+def getPanelIds():
+    """Return the IDs of all panels listed by the PanelApp API.
+
+    Follows the "next" link of each result page until it is None; when the
+    module-level onlyOne flag is set, stops after the first page.
+    Raises requests.HTTPError if a page request fails.
+    """
+    # configure logging; keep urllib3's records away from the root handlers
+    logging.basicConfig(level=logging.INFO)
+    logging.getLogger("urllib3").propagate = False
+
+    logging.info("Downloading panel IDs")
+    panelIds = []
+
+    url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json"
+    while url is not None:
+        logging.debug("Getting %s", url)
+        myResponse = requests.get(url)
+        # fail with the HTTP status instead of an opaque JSON/KeyError below
+        myResponse.raise_for_status()
+        data = myResponse.json()
+
+        panelIds.extend(res["id"] for res in data["results"])
+        logging.debug("Total Panel IDs downloaded:  %s", len(panelIds))
+
+        url = data["next"]
+        if onlyOne:
+            break
+
+    return panelIds
+
+def downloadPanels():
+    """Return a dict mapping each panel ID to its parsed JSON document.
+    Stops after one panel if onlyOne is set; a failed request raises
+    requests.HTTPError instead of being stored as an error response."""
+    panelInfos = {}
+    for panelId in getPanelIds():
+        url = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId
+        logging.debug("Getting %s", url)
+        resp = requests.get(url)
+        resp.raise_for_status()
+        panelInfos[panelId] = resp.json()
+        if onlyOne:
+            break
+    return panelInfos
+
+def getGeneSymbols():
+    """Return the sorted unique gene symbols of all panels (ValueError if one is empty)."""
+    syms = {gene["gene_data"]["gene_symbol"]
+            for panelInfo in downloadPanels().values()
+            for gene in panelInfo["genes"]}
+    # explicit check instead of assert, which is stripped under python -O
+    if "" in syms:
+        raise ValueError("empty gene symbol in panel data")
+    logging.info("Got %d gene symbols", len(syms))
+    return sorted(syms)
 
 def getGenesLocations(jsonFh):
-    page_count = 1
-    Error = True
     hg19_dict = dict()
     hg38_dict = dict()
     repeat19 = list()
     repeat38 = list()
     continuous_count = 0
     genes_missing_info = list()
     genes_no_location = list()
 
-    while Error: 
-        url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count)
+    syms = getGeneSymbols()
+
+    for sym in syms:
+        url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/{}?format=json".format(sym)
         myResponse = requests.get(url)
 
-        if (myResponse.ok):
+        if not (myResponse.ok):
+            assert(False)
+
         jsonData = myResponse.content
+        #jData = myResponse.json()
+        jData = json.loads(jsonData.decode())
 
         jsonFh.write(jsonData)
         jsonFh.write("\n".encode())
 
-            jData = json.loads(jsonData.decode())
-
-            if "error" in jData.keys():
-                raise Exception("{} page count is missing.".format(page_count))
-            
         res = jData['results']
         num_gene_variant = len(res)
         count = 0
         while count != num_gene_variant:
             temp_attribute_dictionary = dict()
             string_dict_key = 'gene_{}'.format(continuous_count)
 
             gene_range_37 = None
             gene_range_38 = None
 
             try:
                 ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
                 location_37 = ensembl_genes_GRch37_82_location.split(':')
                 chromo_37 = 'chr'+location_37[0]
                 gene_range_37 = location_37[1].split('-')
@@ -65,32 +125,30 @@
                 chromo_38 = 'chr'+location_38[0]
                 # Change mitochondrial chromosomal suffix from MT -> M for hg38 only
                 if chromo_38 == "chrMT":
                     chromo_38 = "chrM"
 
                 gene_range_38 = location_38[1].split('-')
             except:
                 genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38")
 
             if gene_range_37 is None and gene_range_38 is None:
                 #print("gene without location on any assembly: %s" % res[count])
                 genes_no_location.append(res[count]['gene_data'])
                 count+=1
                 continue
 
-
-
             score = '0'
             strand = '.'
             blockCount = '1'
             blockStarts = '0'
 
             #-----------------------------------------------------------------------------------------------------------
 
             gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id']
             for attribute in gene_data_list:
                 try:
                     temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute]
                 except:
                     temp_attribute_dictionary[attribute] = ''
 
             try:
@@ -159,86 +217,80 @@
                     temp_attribute_dictionary[attribute] = ''
 
             #-----------------------------------------------------------------------------------------------------------
             # Cannot exceed 255 characters
             # Cannot have tabs
             try:
                 x = res[count]['phenotypes']
                 y = ' '.join(res[count]['phenotypes'])
 
                 if not x:
                     temp_attribute_dictionary['phenotypes'] = ''
                 else:
                     temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ")
             except:
                 temp_attribute_dictionary['phenotypes'] = ''
-                
             #-----------------------------------------------------------------------------------------------------------
             # Evidence cannot exceed 255 characters
             try:
                 x = res[count]['evidence']
                 y = ' '.join(res[count]['evidence'])
                 if not x:
                     temp_attribute_dictionary['evidence'] = ''
                 else:
                     temp_attribute_dictionary['evidence'] = y
             except:
                 temp_attribute_dictionary['evidence'] = ''
 
             #-----------------------------------------------------------------------------------------------------------
-                
             tags = ' '.join(res[count]['tags']).title()
             try:
                 if not tags:
                     temp_attribute_dictionary['tags'] = ''
                 else:
                     temp_attribute_dictionary['tags'] = tags
             except:
                 temp_attribute_dictionary['tags'] = ''
 
             #-----------------------------------------------------------------------------------------------------------
             # Mode of Inheritance (fix format)
-                
             MOI = ' '.join(res[count]['mode_of_inheritance']).replace("  ", "???").replace(" ", "").replace("???", " ")
             try:
                 if not MOI:
                     temp_attribute_dictionary['mode_of_inheritance'] = ''
                 else:
                     temp_attribute_dictionary['mode_of_inheritance'] = MOI
             except:
                 temp_attribute_dictionary['mode_of_inheritance'] = ''
 
             #-----------------------------------------------------------------------------------------------------------
             # For values with spaces 
-
             gene_type_list = ['entity_name', 'penetrance']
 
             for attribute in gene_type_list:
                 try:
                     temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "")
                 except:
                     temp_attribute_dictionary[attribute] = ''
-
             #-----------------------------------------------------------------------------------------------------------
             # For values with spaces and need to be capitalized
             attribute = 'entity_type'
 
             try:
                 temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize()
             except:
                 temp_attribute_dictionary[attribute] = ''
-
             #-----------------------------------------------------------------------------------------------------------
             attribute = 'mode_of_pathogenicity'
 
             try:
                 mode = ' '.join(res[count][attribute]).replace("  ", "???").replace(" ", "").replace("???", " ")
                 if mode[0] == 'L' or mode[0] == 'l':
                     temp_attribute_dictionary[attribute] = 'Loss-of-function variants'
                 elif mode[0] == 'G' or mode[0] == 'g':
                     temp_attribute_dictionary[attribute] = 'Gain-of-function'
                 elif mode[0] == 'O' or mode[0] == 'o':
                     temp_attribute_dictionary[attribute] = 'Other'
                 else:
                     temp_attribute_dictionary[attribute] = mode
             except:
                 temp_attribute_dictionary[attribute] = ''
@@ -355,45 +407,39 @@
                     repeat38.append(temp_attribute_dictionary['label'])
                     blockSizes = int(gene_range_38[1]) - int(gene_range_38[0])
                     hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], 
                                             score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, 
                                             temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                             temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                             temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                             temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                             temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                             temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                             temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                             temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
             count = count + 1
             continuous_count = continuous_count + 1
 
-        else:
-            Error = False        # End of all pages
-
-        page_count = page_count + 1
-
     print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
     print(genes_missing_info)
 
     print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):')
     missSyms = []
     for miss in genes_no_location:
         missSyms.append(miss["gene_symbol"])
     print(",".join(missSyms))
 
-
     missOfh = open("missing_genes.txt", "w")
     missOfh.write("* Not found in one assembly:\n")
     missOfh.write("\n".join(genes_missing_info))
     missOfh.write("* No location at all:\n")
     for miss in genes_no_location:
         missOfh.write("\t"+str(miss))
         missOfh.write("\n")
     missOfh.close()
 
     return(hg19_dict, hg38_dict)
 
 def downloadGenes():
     jsonFh = gzip.open("currentJson/genes.json.gz", "w")
 
     hg19_dict, hg38_dict = getGenesLocations(jsonFh)