src/hg/utils/otto/panelApp/doPanelApp.py f8702ef92125ff1776b67fb09b7917bd346c443e

f8702ef92125ff1776b67fb09b7917bd346c443e
lrnassar
  Thu Oct 23 09:03:18 2025 -0700
PanelApp England began to heavily throttle its API access, which was causing this otto to error out because we were not checking that the response was JSON in certain places. Added a new central function with retry logic and invoked it at all previous request locations. Refs otto ML.

diff --git src/hg/utils/otto/panelApp/doPanelApp.py src/hg/utils/otto/panelApp/doPanelApp.py
index fadfdd96535..31eefe3b906 100755
--- src/hg/utils/otto/panelApp/doPanelApp.py
+++ src/hg/utils/otto/panelApp/doPanelApp.py
@@ -1,20 +1,21 @@
 #!/hive/data/outside/otto/panelApp/venv/bin/python3
 
 from datetime import date
-import pandas as pd 
+import pandas as pd,time 
 import gzip, logging, re, sys, json, time, requests, shutil, os, subprocess
+from requests.exceptions import RequestException
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def getArchDir(db):
     " return hgwdev archive directory given db "
     dateStr = date.today().strftime("%Y-%m-%d")
     archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/panelApp/%s" % (db, dateStr)
@@ -92,47 +93,42 @@
         for subTrack in subtracks:
             if db=="hg19" and "cnv" in subTrack:
                 # no cnvs for hg19 yet
                 continue
             oldFname = "current/%s/%s.bb.tmp" % (db, subTrack) #New data just generated
             newFname = "current/%s/%s.bb" % (db, subTrack) #Existing .bb
 
             #Check if files are more than 10% different
             checkIfFilesTooDifferent(newFname,oldFname)
             
             os.replace(oldFname, newFname)
 
 def getAllPages(url, results=[]):
     " recursively download all pages. Stack should be big enough "
     try:
-        myResponse = requests.get(url)
-        if (myResponse.ok):
-            jData = json.loads(myResponse.content.decode())
+        jData = requests_get_with_retry(url)
+        
         # If jData is empty create, else append
         if "error" in jData.keys() or not "results" in jData.keys():
             raise Exception("Error in keys when downloading %s" % url)
-
         if "count" in jData and not "page" in url:
             print("API says that there are %d results for url %s" % (jData["count"], url))
         results.extend(jData["results"])
-
         if "next" in jData and jData["next"] is not None: # need to get next URL
             return getAllPages(jData["next"], results)
-        else:
-            raise Exception("Error in object when downloading %s" % url)
-    except:
-        raise Exception("HTTP Error when downloading %s" % url)
+    except Exception as e:
+        raise Exception("Error when downloading %s: %s" % (url, str(e)))
     return results
 
 def downloadCnvs(url):
     Error = True
     continuous_count=0
     res = getAllPages(url, results=[])
 
     num_gene_data = len(res)
     print("Got %d CNVs" % num_gene_data)
     count = 0
     continuous_count = 0
     hg19_dict = dict()
     hg38_dict = dict()
 
     for geneCount, cnvData in enumerate(res):
@@ -478,128 +474,146 @@
 
     pd_38_table = pd.DataFrame.from_dict(hg38_dict)
     pd_38_table = pd_38_table.T
     pd_38_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand',
         'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', 
         'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', 
         'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', 
         'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', 
         'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', 
         'phenotypes', 'publications', 'repeated_sequence']
     #pd_38_table = pd_38_table.sort_values(by=['chrom','chromStart'], ascending = (True, True))
     #pd_38_table.to_csv('hg38_str_noheader_sorted.tsv', sep='\t', index=False, header=None)
 
     return pd_19_table, pd_38_table
 
+def requests_get_with_retry(url, max_retries=10, retry_delay=60, timeout=120):
+    """
+    GET url and return its parsed JSON, retrying when the API throttles us.
+
+    A try fails on a network error or timeout, a non-2xx status, a non-JSON
+    Content-Type or an unparsable body. Failed tries are logged and followed by
+    retry_delay seconds of sleep; timeout (seconds) is handed to requests.get()
+    so a stalled connection cannot hang the run. Raises Exception with the last
+    status code and/or error once max_retries tries have all failed.
+    """
+    headers = {
+        'User-Agent': 'UCSC-Genome-Browser-PanelApp/1.0 (contact: genome-www@soe.ucsc.edu)'
+    }
+    last_status_code = None
+    last_error = None
+
+    for attempt in range(max_retries):
+        tries = "attempt %d/%d" % (attempt + 1, max_retries)
+        logging.debug("Getting %s (%s)" % (url, tries))
+        try:
+            myResponse = requests.get(url, headers=headers, timeout=timeout)
+            last_status_code = myResponse.status_code
+            content_type = myResponse.headers.get('Content-Type', '')
+            if not myResponse.ok:
+                logging.warning(f"HTTP status {last_status_code} on {tries}")
+            elif 'application/json' not in content_type:
+                logging.warning(f"Non-JSON response received on {tries}. Content-Type: {content_type}")
+            else:
+                # ValueError is the base of every JSON decode error .json() can raise.
+                return myResponse.json()
+        except (RequestException, ValueError) as e:
+            last_error = str(e)
+            logging.warning(f"Request failed on {tries}: {last_error}")
+        # Wait before retrying (except on the last attempt)
+        if attempt < max_retries - 1:
+            time.sleep(retry_delay)
+
+    # If all retries failed
+    error_msg = f"Failed to get valid JSON response after {max_retries} attempts for URL: {url}"
+    if last_status_code:
+        error_msg += f", last status code: {last_status_code}"
+    if last_error:
+        error_msg += f", last error: {last_error}"
+    raise Exception(error_msg)
+
 def getPanelIds(url):
     #logging.basicConfig(level=logging.DEBUG)
     logging.basicConfig(level=logging.INFO)
     logging.getLogger("urllib3").propagate = False
-
     logging.info("Downloading panel IDs")
     panelIds = []
-
     gotError = False
-    while not gotError:
-        logging.debug("Getting %s" % url)
-        myResponse = requests.get(url)
     
-        jsonData = myResponse.content
-        #jsonFh.write(jsonData)
-        #jsonFh.write("\n".encode())
-        data = json.loads(jsonData.decode())
+    while not gotError:
+        data = requests_get_with_retry(url)
         
         for res in data["results"]:
             panelIds.append(res["id"])
-
         logging.debug("Total Panel IDs downloaded:  %s" % len(panelIds))
+        
         url = data["next"]
         if url is None:
             break
     
     return panelIds
 
+
 def downloadPanels(url):
     panelIds = getPanelIds(url)
     panelInfos = {}
-
     for panelId in panelIds:
         if 'england' in url:
             panelUrl = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId
         elif 'aus' in url:
             panelUrl = "https://panelapp-aus.org/api/v1/panels/%d/?format=json" % panelId
-        logging.debug("Getting %s" % panelUrl)
-        resp = requests.get(panelUrl)
-        res  = resp.json()
         
+        res = requests_get_with_retry(panelUrl)
         panelInfos[panelId] = res
     
     return panelInfos
 
 def getGeneSymbols(url):
     try:
         panelInfos = downloadPanels(url)
     except requests.exceptions.JSONDecodeError:
         time.sleep(30)
         panelInfos = downloadPanels(url)
 
     syms = set()
     for panelInfo in panelInfos.values():
         for gene in panelInfo["genes"]:
             sym = gene["gene_data"]["gene_symbol"]
             syms.add(sym)
             assert(sym!="")
     logging.info("Got %d gene symbols" % len(syms))
     return list(syms)
 
 def getGenesLocations(jsonFh,url,onlyPanels):
     hg19_dict = dict()
     hg38_dict = dict()
     repeat19 = list()
     repeat38 = list()
     continuous_count = 0
     genes_missing_info = list()
     genes_no_location = list()
-
     syms = getGeneSymbols(url)
-
     for sym in syms:
         if 'england' in url:
             entityUrl = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?entity_name={}&format=json".format(sym)
         elif 'aus' in url:
             entityUrl = "https://panelapp-aus.org/api/v1/genes/?entity_name={}&format=json".format(sym)
         
-        count = 0
-        while True:
-            try:
-                myResponse = requests.get(entityUrl)
-                if myResponse.ok:
-                    break
-                else:
-                    logging.error("Some error on %s, retrying after 1 minute (trial %d)" % (entityUrl, count))
-            except Exception:
-                logging.error("HTTP error on %s, retrying after 1 minute (trial %d)" % (entityUrl, count))
-
-            time.sleep(60)    # Wait 1 minute before trying again
-            count += 1        # Count the number of tries before failing
-            if count > 10:    # Quit afer 10 failed attempts
-                assert False, "Cannot get URL after 10 attempts"
-
-        jsonData = myResponse.content
-        #jData = myResponse.json()
-        jData = json.loads(jsonData.decode())
+        jData = requests_get_with_retry(entityUrl)
         
+        # Write to file (converting back to JSON bytes for file writing)
+        jsonData = json.dumps(jData).encode()
         jsonFh.write(jsonData)
         jsonFh.write("\n".encode())
         
         res = jData['results']
         
         #filter by onlyPanels early, if specified
         if onlyPanels is not None:
             res = [entry for entry in res if entry.get('panel', {}).get('id') in onlyPanels]
 
         num_gene_variant = len(res)
         count = 0
         while count != num_gene_variant:
             temp_attribute_dictionary = dict()
             string_dict_key = 'gene_{}'.format(continuous_count)