src/hg/utils/otto/panelApp/doPanelApp.py 735f7be7f5d6854830d5ef7cc5387fc29027778d

735f7be7f5d6854830d5ef7cc5387fc29027778d
lrnassar
  Thu Jun 19 18:21:58 2025 -0700
Making improvements to the panelApp otto job, primarily in support of the new panelApp Australia track. 1. Adding our standard check that will cause an exit if the item count is more than 10% different. 2. Tweak Max's code to fix an issue when specifying the specific panels to extract, this is used for the Australia genes track. 3. Remove the check that only made track items if the versions was >.99, we now make track items out of everything. 4. Max had accidentally specified the two 'onlyPanels' for Genomics England, but it should be Australia. Refs #35758

diff --git src/hg/utils/otto/panelApp/doPanelApp.py src/hg/utils/otto/panelApp/doPanelApp.py
index 9d634e6af7b..07ff72860f3 100755
--- src/hg/utils/otto/panelApp/doPanelApp.py
+++ src/hg/utils/otto/panelApp/doPanelApp.py
@@ -1,27 +1,30 @@
 #!/hive/data/outside/otto/panelApp/venv/bin/python3
+
 from datetime import date
-import os
-import shutil
-import requests
-import time
-import json 
 import pandas as pd 
-import sys 
-import re
-import gzip
-import logging
+import gzip, logging, re, sys, json, time, requests, shutil, os, subprocess
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
 
 def getArchDir(db):
     " return hgwdev archive directory given db "
     dateStr = date.today().strftime("%Y-%m-%d")
     archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/panelApp/%s" % (db, dateStr)
     if not os.path.isdir(archDir):
         os.makedirs(archDir)
     return archDir
 
 def writeBb(hg19Table, hg38Table, subTrack):
     " sort the pandas tables, write to BED and convert "
     for db in ["hg19", "hg38"]:
         archDir = getArchDir(db)
 
         bedFname = "current/%s/%s.bed" % (db, subTrack)
@@ -52,44 +55,62 @@
 
 def updateGbdbSymlinks(country):
     " update the symlinks in /gbdb. Not really necessary but kept this code just in case. "
     if country == "Australia":
         subtracks = ["genesAus", "tandRepAus", "cnvAus"]
     else:
         subtracks = ["genes", "tandRep", "cnv"]
     for db in ["hg19", "hg38"]:
         archDir = getArchDir(db)
         for subTrack in subtracks:
             if db=="hg19" and "cnv" in subTrack:
                 continue # no cnv on hg19
             cmd = "ln -sf `pwd`/current/%s/%s.bb /gbdb/%s/panelApp/%s.bb" % (db, subTrack, db, subTrack)
             assert(os.system(cmd)==0)
 
+def checkIfFilesTooDifferent(oldFname,newFname):
+    # Exit if the difference is more than 10%
+
+    oldItemCount = bash('bigBedInfo '+oldFname+' | grep "itemCount"')
+    oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
+    
+    newItemCount = bash('bigBedInfo '+newFname+' | grep "itemCount"')
+    newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
+
+    if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount):
+        sys.exit(f"Difference between itemCounts greater than 10%: {newItemCount}, {oldItemCount}")
+    else:
+        print(oldFname+" vs. new count: "+str(oldItemCount)+" - "+str(newItemCount))
+
 def flipFiles(country):
     " rename the .tmp files to the final filenames "
     if country == "Australia":
         subtracks = ["genesAus", "tandRepAus", "cnvAus"]
     else:
         subtracks = ["genes", "tandRep", "cnv"]
     for db in ["hg19", "hg38"]:
         archDir = getArchDir(db)
         for subTrack in subtracks:
             if db=="hg19" and "cnv" in subTrack:
                 # no cnvs for hg19 yet
                 continue
             oldFname = "current/%s/%s.bb.tmp" % (db, subTrack)
             newFname = "current/%s/%s.bb" % (db, subTrack)
+
+            #Check if files are more than 10% different
+            checkIfFilesTooDifferent(oldFname,newFname)
+            
             os.replace(oldFname, newFname)
 
 def getAllPages(url, results=[]):
     " recursively download all pages. Stack should be big enough "
     try:
         myResponse = requests.get(url)
         if (myResponse.ok):
             jData = json.loads(myResponse.content.decode())
             # If jData is empty create, else append
             if "error" in jData.keys() or not "results" in jData.keys():
                 raise Exception("Error in keys when downloading %s" % url)
 
             if "count" in jData and not "page" in url:
                 print("API says that there are %d results for url %s" % (jData["count"], url))
             results.extend(jData["results"])
@@ -559,30 +580,35 @@
                 logging.error("HTTP error on %s, retrying after 1 minute (trial %d)" % (entityUrl, count))
 
             time.sleep(60)    # Wait 1 minute before trying again
             count += 1        # Count the number of tries before failing
             if count > 10:    # Quit afer 10 failed attempts
                 assert False, "Cannot get URL after 10 attempts"
 
         jsonData = myResponse.content
         #jData = myResponse.json()
         jData = json.loads(jsonData.decode())
 
         jsonFh.write(jsonData)
         jsonFh.write("\n".encode())
 
         res = jData['results']
+        
+        #filter by onlyPanels early, if specified
+        if onlyPanels is not None:
+            res = [entry for entry in res if entry.get('panel', {}).get('id') in onlyPanels]
+
         num_gene_variant = len(res)
         count = 0
         while count != num_gene_variant:
             temp_attribute_dictionary = dict()
             string_dict_key = 'gene_{}'.format(continuous_count)
 
             gene_range_37 = None
             gene_range_38 = None
 
             try:
                 ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
                 location_37 = ensembl_genes_GRch37_82_location.split(':')
                 chromo_37 = 'chr'+location_37[0]
                 gene_range_37 = location_37[1].split('-')
                 # on hg19, we have added a chrMT sequence later.
@@ -760,37 +786,34 @@
                     temp_attribute_dictionary[attribute] = 'Gain-of-function'
                 elif mode[0] == 'O' or mode[0] == 'o':
                     temp_attribute_dictionary[attribute] = 'Other'
                 else:
                     temp_attribute_dictionary[attribute] = mode
             except:
                 temp_attribute_dictionary[attribute] = ''
 
             #-----------------------------------------------------------------------------------------------------------
 
             panel_list = ['id','name', 'disease_group', 'disease_sub_group', 'status', 'version_created']
 
             for attribute in panel_list:
                 try:
                     x = res[count]['panel'][attribute]
-                    if attribute=="id" and onlyPanels is not None:
-                        if int(x) not in onlyPanels:
-                            continue
                     if not x:
                         temp_attribute_dictionary[attribute] = ''
                     else:
-                        temp_attribute_dictionary[attribute] = res[count]['panel'][attribute]
+                        temp_attribute_dictionary[attribute] = x
                 except:
                     temp_attribute_dictionary[attribute] = ''
 
             #-----------------------------------------------------------------------------------------------------------
             
             version_num = 0.0
             try:
                 version_num = float(res[count]['panel']['version'])
                 temp_attribute_dictionary['version'] = version_num
             except:
                 temp_attribute_dictionary['version'] = ''
 
             #-----------------------------------------------------------------------------------------------------------
             
             try:
@@ -846,61 +869,59 @@
 
             '''
             Replace all tab in value with spaces and removes new lines
             '''
             for key, item in temp_attribute_dictionary.items():
                 try:
                     if isinstance(item, int):
                         pass
                     elif isinstance(item, float):
                         pass
                     else:
                         temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r")
                 except:
                     pass
 
-            # Version Threshold = 0.99
-            max_num = float(0.99)
-            
-            if version_num > max_num: 
             if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None:    # Removes Repeats
                 repeat19.append(temp_attribute_dictionary['label'])
                 blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
                 hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], 
                                         score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, 
                                         temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                         temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                         temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                         temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                         temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                         temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                         temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                         temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
             
             if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None:    # Remove Repeats
                 repeat38.append(temp_attribute_dictionary['label'])
                 blockSizes = int(gene_range_38[1]) - int(gene_range_38[0])
                 hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], 
                                         score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, 
                                         temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                         temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                         temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                         temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                         temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                         temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                         temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                         temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
+                    
+            
             count = count + 1
             continuous_count = continuous_count + 1
 
     print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
     print(genes_missing_info)
 
     print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):')
     missSyms = []
     for miss in genes_no_location:
         missSyms.append(miss["gene_symbol"])
     print(",".join(missSyms))
 
     missOfh = open("missing_genes.txt", "w")
     missOfh.write("* Not found in one assembly:\n")
     missOfh.write("\n".join(genes_missing_info))
@@ -937,46 +958,46 @@
         "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", 
         "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", 
         "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"]
 
     return pd_19_table, pd_38_table
 
 
 def main():
     " create the 2 x three BED files and convert each to bigBed and update the archive "
 
     # the script uses relative pathnames, so make sure we're always in the right directory
     os.chdir("/hive/data/outside/otto/panelApp")
     
     # First update panelApp England
     # Gene panels track
-    hg19Bed, hg38Bed = downloadGenes("https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json", onlyPanels=[137, 126])
+    hg19Bed, hg38Bed = downloadGenes("https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json")
     writeBb(hg19Bed, hg38Bed, "genes")
     # STRs track
     hg19Bed, hg38Bed = downloadTandReps("https://panelapp.genomicsengland.co.uk/api/v1/strs/?format=json")
     writeBb(hg19Bed, hg38Bed, "tandRep")
     # CNV track
     hg38Bed = downloadCnvs('https://panelapp.genomicsengland.co.uk/api/v1/regions/?format=json')
     # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025
     writeBb(None, hg38Bed, "cnv")
 
     flipFiles('England')
     updateGbdbSymlinks('England')
 
     # Now update panelApp Australia
     # Genes track
-    hg19Bed, hg38Bed = downloadGenes("https://panelapp-aus.org/api/v1/panels/?format=json")
+    hg19Bed, hg38Bed = downloadGenes("https://panelapp-aus.org/api/v1/panels/?format=json", onlyPanels=[137, 126])
     writeBb(hg19Bed, hg38Bed, "genesAus")
     # STRs track
     hg19Bed, hg38Bed = downloadTandReps("https://panelapp-aus.org/api/v1/strs/?format=json")
     writeBb(hg19Bed, hg38Bed, "tandRepAus")
     # CNVs track
     hg38Bed = downloadCnvs('https://panelapp-aus.org/api/v1/regions/?format=json')
     # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025
     writeBb(None, hg38Bed, "cnvAus")
 
     flipFiles('Australia')
     updateGbdbSymlinks('Australia')
 
     print("PanelApp otto update: OK")
 
 main()