f2ef97844c0973a30cbbc1afcdd01b4096143a69 lrnassar Wed Jun 3 10:56:07 2026 -0700 Add date-based dataVersion to composite/superTrack otto containers. refs #36455 Composite and superTrack container pages do not display 'Data last updated at UCSC' (printUpdateTime returns early for them), so a dataVersion file is the only freshness signal a user sees there. Add one to the dbVar (dbVarSv), panelApp, clinGen (clinGenComp) and decipher (decipherContainer) containers. Each otto build script writes a per-assembly 'Last updated ' file when it actually updates the data, and the container stanza points to it via dataVersion. clinGen's container date is written by its three displayed feeds (makeDosage, makeGeneValidity, makeClinGenCspec); decipher writes hg38 only (hg19 is frozen). diff --git src/hg/utils/otto/panelApp/doPanelApp.py src/hg/utils/otto/panelApp/doPanelApp.py index 31eefe3b906..98794cade96 100755 --- src/hg/utils/otto/panelApp/doPanelApp.py +++ src/hg/utils/otto/panelApp/doPanelApp.py @@ -1,1017 +1,1024 @@ #!/hive/data/outside/otto/panelApp/venv/bin/python3 from datetime import date import pandas as pd,time import gzip, logging, re, sys, json, time, requests, shutil, os, subprocess from requests.exceptions import RequestException def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) def getArchDir(db): " return hgwdev archive directory given db " dateStr = date.today().strftime("%Y-%m-%d") archDir = "/usr/local/apache/htdocs-hgdownload/goldenPath/archive/%s/panelApp/%s" % (db, dateStr) if not os.path.isdir(archDir): os.makedirs(archDir) return archDir def writeBb(hg19Table, hg38Table, subTrack): " sort the pandas tables, write to BED and convert " for db in ["hg19", "hg38"]: archDir = getArchDir(db) bedFname = "current/%s/%s.bed" % (db, subTrack) bbFname = "current/%s/%s.bb.tmp" % (db, subTrack) if db=="hg19": pdTable = hg19Table else: pdTable = hg38Table # for cnvs, one of the arguments can be None if pdTable is None: continue pdTable.sort_values(by=['chrom','chromStart'], ascending = (True, True), inplace=True) pdTable = pdTable.applymap(lambda x: x.replace('\t', ' ') if isinstance(x, str) else x) pdTable.to_csv(bedFname, sep='\t', index=False, header=None) asFname = subTrack+".as" # -extraIndex=geneName cmd = "bedToBigBed -tab -as=%s -type=bed9+26 %s /hive/data/genomes/%s/chrom.sizes %s" % (asFname, bedFname, db, bbFname) assert(os.system(cmd)==0) # put a copy into the archive archBbFname = archDir+"/%s.bb" % subTrack shutil.copyfile(bbFname, archBbFname) def updateGbdbSymlinks(country): " update the symlinks in /gbdb. Not really necessary but kept this code just in case. " if country == "Australia": subtracks = ["genesAus", "tandRepAus", "cnvAus"] else: subtracks = ["genes", "tandRep", "cnv"] for db in ["hg19", "hg38"]: archDir = getArchDir(db) for subTrack in subtracks: if db=="hg19" and "cnv" in subTrack: continue # no cnv on hg19 cmd = "ln -sf `pwd`/current/%s/%s.bb /gbdb/%s/panelApp/%s.bb" % (db, subTrack, db, subTrack) assert(os.system(cmd)==0) def checkIfFilesTooDifferent(oldFname,newFname): # Exit if the difference is more than 10% oldItemCount = bash('bigBedInfo '+oldFname+' | grep "itemCount"') oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",","")) newItemCount = bash('bigBedInfo '+newFname+' | grep "itemCount"') newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",","")) if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount): sys.exit(f"Difference between itemCounts greater than 10%: {newItemCount}, {oldItemCount}") else: print(oldFname+" vs. new count: "+str(oldItemCount)+" - "+str(newItemCount)) def flipFiles(country): " rename the .tmp files to the final filenames " if country == "Australia": subtracks = ["genesAus", "tandRepAus", "cnvAus"] else: subtracks = ["genes", "tandRep", "cnv"] for db in ["hg19", "hg38"]: archDir = getArchDir(db) for subTrack in subtracks: if db=="hg19" and "cnv" in subTrack: # no cnvs for hg19 yet continue oldFname = "current/%s/%s.bb.tmp" % (db, subTrack) #New data just generated newFname = "current/%s/%s.bb" % (db, subTrack) #Existing .bb #Check if files are more than 10% different checkIfFilesTooDifferent(newFname,oldFname) os.replace(oldFname, newFname) def getAllPages(url, results=[]): " recursively download all pages. Stack should be big enough " try: jData = requests_get_with_retry(url) # If jData is empty create, else append if "error" in jData.keys() or not "results" in jData.keys(): raise Exception("Error in keys when downloading %s" % url) if "count" in jData and not "page" in url: print("API says that there are %d results for url %s" % (jData["count"], url)) results.extend(jData["results"]) if "next" in jData and jData["next"] is not None: # need to get next URL return getAllPages(jData["next"], results) except Exception as e: raise Exception("Error when downloading %s: %s" % (url, str(e))) return results def downloadCnvs(url): Error = True continuous_count=0 res = getAllPages(url, results=[]) num_gene_data = len(res) print("Got %d CNVs" % num_gene_data) count = 0 continuous_count = 0 hg19_dict = dict() hg38_dict = dict() for geneCount, cnvData in enumerate(res): temp_attribute_dictionary = dict() string_dict_key = 'gene_{}'.format(geneCount) chromo = cnvData['chromosome'] chromosome = 'chr' + chromo start_coordinates = cnvData['grch38_coordinates'][0] end_coordinates = cnvData['grch38_coordinates'][1] score = '0' strand = '.' thickStart = start_coordinates thickEnd = end_coordinates blockCount = '1' blockSizes = int(end_coordinates) - int(start_coordinates) blockStarts = 0 confidence_level = cnvData['confidence_level'] rgb_dict = {'0' : '100,100,100', '3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} itemRgb = rgb_dict[confidence_level] entity_name = cnvData['entity_name'] entity_type = cnvData['entity_type'] evidence = ' '.join(cnvData['evidence']) haploinsufficiency_score = cnvData.get('haploinsufficiency_score') if not haploinsufficiency_score: haploinsufficiency_score = '' moi = cnvData.get('mode_of_inheritance') if not moi: moi = '' disease_group = cnvData['panel'].get('disease_group') if not disease_group: disease_group = '' disease_sub_group = cnvData['panel'].get('disease_sub_group') if not disease_sub_group: disease_sub_group = '' # idd = Panel ID idd = cnvData['panel'].get('id') if not idd: idd = '' panel_name = cnvData['panel'].get('name') if not panel_name: panel_name = '' relevant_disorders = ' '.join(cnvData['panel'].get('relevant_disorders', [])) if not relevant_disorders: relevant_disorders = '' status = cnvData['panel'].get('status') if not status: status = '' ''' types = cnvData['panel'].get['types') if not types: types = '' else: types = str(types).replace("{","").replace("}", "").replace("'", "") types = types[1:-1] ''' types = cnvData['panel']['types'][0].get('name') version = cnvData['panel']['version'] if 'genomicsengland' in url: if float(version) < 0.99: continue if not version: version = '' else: if not version: version = '' penetrance = cnvData.get('penetrance') if not penetrance: penetrance = '' phenotypes = ' '.join(cnvData.get('phenotypes', [])) if not phenotypes: phenotypes = '' publications = ' '.join(cnvData['publications']) if not publications: publications = '' #required_overlap_percentage = cnvData['required_overlap_percentage'] tags = cnvData['tags'] if not tags: tags = '' triplosensitivity_score = cnvData.get('triplosensitivity_score') if not triplosensitivity_score: triplosensitivity_score = '' type_of_variants = None if "type_of_variants" in cnvData: type_of_variants = cnvData['type_of_variants'] if not type_of_variants: type_of_variants = '' verbose_name = cnvData.get('verbose_name') if not verbose_name: verbose_name = '' # Mouse Over Field mouseOverField = "" try: mof = 'Gene: ' + entity_name + ';' + ' Panel: ' + name + ';' + ' MOI: ' + moi + ';' + ' Phenotypes: ' + phenotypes + ';' + ' Confidence: ' + confidence_level + ';' mouseOverField = mof except: mouseOverField = '' # name name = '{} ({})'.format(entity_name, panel_name) hg38_dict[string_dict_key] = [chromosome, start_coordinates, end_coordinates, name, score, strand, thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts, confidence_level, panel_name, idd, entity_name, entity_type, evidence, haploinsufficiency_score, moi, disease_group, disease_sub_group, relevant_disorders, status, types, version, penetrance, phenotypes, publications, triplosensitivity_score, type_of_variants, verbose_name, mouseOverField] #------------------------------------------------------------------------------- continuous_count = continuous_count + 1 #count = count + 1 # Removes new lines for key, item in hg38_dict.items(): strip_list = list() for i in item: try: strip_list.append(i.replace('\t', ' ').strip().strip("\n").strip("\r")) except: strip_list.append(i) hg38_dict[key] = strip_list pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_38_table = pd_38_table.T pd_38_table.columns = ['chrom', 'chromStart', 'End', 'name', 'Score', 'strand', 'thickStart', 'thickEnd', 'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'Confidence Level', 'Panel Name', 'Panel ID', 'Entity Name', 'Entity Type', 'Evidence', 'ClinGen Haploinsufficiency Score', 'Mode of Inheritance', 'Disease Group', 'Disease Sub Group', 'Relevant Disorders', 'Status', 'Types', 'Version Created', 'Penetrance', 'Phenotypes', 'Publications', 'CinGen Triplosensitivity Score', 'Type of Variants', 'Verbose Name', 'Mouse Over Field'] #pd_38_table = pd_38_table.sort_values(by=['chromosome', 'Start'], ascending = (True, True)) #pd_38_table.to_csv('hg38_region_noheader_sorted.tsv', sep='\t', index=False, header=None) #pd_38_table.to_csv('hg38_region_header_sorted.tsv', sep='\t', index=False) return pd_38_table def downloadTandReps(url): Error = True continuous_count=0 res = getAllPages(url, results=[]) num_gene_data = len(res) print("Got %d tandem repeats from API" % num_gene_data) count = 0 continuous_count = 0 hg19_dict = dict() hg38_dict = dict() while count != num_gene_data: #if count == 10: # break string_dict_key = 'gene_{}'.format(continuous_count) temp_attribute_dictionary = dict() chromosome = res[count]['chromosome'] chromosome = 'chr{}'.format(chromosome) confidence_level = res[count]['confidence_level'] entity_name = res[count]['entity_name'] entity_type = res[count]['entity_type'] evidence = ' '.join(res[count]['evidence']) gene_data = res[count]['gene_data'] if gene_data: alias = ' '.join(gene_data.get('alias', [])) biotype = gene_data['biotype'] else: alias = "" biotype = "" try: ensembl_id_37 = gene_data['ensembl_genes']['GRch37']['82']['ensembl_id'] except: ensembl_id_37 = "None" try: ensembl_id_38 = gene_data['ensembl_genes']['GRch38']['90']['ensembl_id'] except: ensembl_id_38 = "None" gene_name = gene_data['gene_name'] #gene_symbol = gene_data['gene_symbol'] #hgnc_date_symbol_changed = gene_data['hgnc_date_symbol_changed'] hgnc_id = gene_data['hgnc_id'] hgnc_symbol = gene_data['hgnc_symbol'] if str(gene_data['omim_gene']) == "None": omim_gene = "None" else: omim_gene = ' '.join(gene_data['omim_gene']) grch37_coordinates = res[count]['grch37_coordinates'] if grch37_coordinates == None: coordinates = gene_data['ensembl_genes']['GRch37']['82']['location'] location = coordinates.split(':') grch37_coordinates = location[1].split('-') chromStart_19 = int(grch37_coordinates[0]) chromEnd_19 = int(grch37_coordinates[1]) # hg38 grch38_coordinates = res[count]['grch38_coordinates'] if grch38_coordinates == None: coordinates = gene_data['ensembl_genes']['GRch38']['90']['location'] location = coordinates.split(':') grch38_coordinates = location[1].split('-') mode_of_inheritance = res[count]['mode_of_inheritance'] normal_repeats = res[count]['normal_repeats'] chromStart_38 = int(grch38_coordinates[0]) chromEnd_38 = int(grch38_coordinates[1]) panel = res[count]['panel'] disease_group = panel['disease_group'] disease_sub_group = panel['disease_sub_group'] hash_id = panel['hash_id'] idd = panel['id'] panel_name = panel['name'] relevant_disorders = ' '.join(panel['relevant_disorders']) #relevant_disorders = relevant_disorders[:240] stats = panel['stats'] number_of_gene = stats['number_of_genes'] number_of_regions = stats['number_of_regions'] number_of_strs = stats['number_of_strs'] status = panel['status'] #description = panel['types'][0]['description'][:240] description = panel['types'][0]['description'] version = panel['version'] version_created = panel['version_created'] pathogenic_repeats = res[count]['pathogenic_repeats'] penetrance = res[count]['penetrance'] phenotypes = ' '.join(res[count]['phenotypes']) phenotypes_no_num = ''.join([i for i in phenotypes if not i.isdigit()]) publications = ' '.join(res[count]['publications']) repeated_sequence = res[count]['repeated_sequence'] tags = ' '.join(res[count]['tags']) # Check to see if panel_name is not empty if panel_name: try: panel_name = panel_name.split(' - ') panel_name = panel_name[0] name = '{} ({})'.format(hgnc_symbol, panel_name) except: name = '{} ({})'.format(hgnc_symbol, panel_name) else: name = hgnc_symbol score = 0 strand = '.' thickStart_19 = chromStart_19 thickEnd_19 = chromEnd_19 thickStart_38 = chromStart_38 thickEnd_38 = chromEnd_38 rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} # If the confidence level is set to 0, set to 1 if confidence_level == '0': confidence_level = '1' rgb = rgb_dict[confidence_level] rgb = rgb.strip('"') blockCount = 1 # Cases where coordinates are reads as string data types instead of ints try: blockSizes_19 = chromEnd_19 - chromStart_19 except: blockSizes_19 = int(chromEnd_19) - int(chromStart_19) try: blockSizes_38 = chromEnd_38 - chromStart_38 except: blockSizes_39 = int(chromEnd_38) - int(chromStart_38) blockStarts = 0 geneSymbol = hgnc_symbol #------------------------------------------------------------------------------- temp19_list = [chromosome, chromStart_19, chromEnd_19, name, score, strand, thickStart_19, thickEnd_19, rgb, blockCount, blockSizes_19, chromStart_19, geneSymbol, confidence_level, entity_type, evidence, alias, ensembl_id_37, gene_name, hgnc_id, hgnc_symbol, omim_gene, mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, version, version_created, pathogenic_repeats, penetrance, phenotypes, publications, repeated_sequence] try: temp19_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp19_list] except: hg19_dict[string_dict_key] = temp19_list #------------------------------------------------------------------------------- temp38_list = [chromosome, chromStart_38, chromEnd_38, name, score, strand, thickStart_38, thickEnd_38, rgb, blockCount, blockSizes_38, chromStart_38, geneSymbol, confidence_level, entity_type, evidence, alias, ensembl_id_38, gene_name, hgnc_id, hgnc_symbol, omim_gene, mode_of_inheritance, normal_repeats, disease_group, disease_sub_group, idd, panel_name, relevant_disorders, number_of_gene, number_of_regions, number_of_strs, description, version, version_created, pathogenic_repeats, penetrance, phenotypes, publications, repeated_sequence] try: temp38_list = [i.replace('\t', ' ').strip().strip("\n").strip("\r") for i in temp38_list] except: hg38_dict[string_dict_key] = temp38_list #------------------------------------------------------------------------------- continuous_count = continuous_count + 1 count = count + 1 pd_19_table = pd.DataFrame.from_dict(hg19_dict) pd_19_table = pd_19_table.T pd_19_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', 'phenotypes', 'publications', 'repeated_sequence'] #pd_19_table = pd_19_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) #pd_19_table.to_csv('str_hg19.bed', sep='\t', index=False, header=None) pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_38_table = pd_38_table.T pd_38_table.columns = ['chrom', 'chromStart', 'chromEnd', 'name', 'score', 'strand', 'thickStart', 'thickEnd', 'rgb', 'blockCount', 'blockSizes', 'blockStarts', 'geneSymbol', 'confidence_level', 'entity_type', 'evidence', 'alias', 'ensembl_id_37', 'gene_name', 'hgnc_id', 'geneSymbol', 'omim_gene', 'mode_of_inheritance', 'normal_repeats', 'disease_group', 'disease_sub_group', 'idd', 'panel_name', 'relevant_disorders', 'number_of_gene', 'number_of_regions', 'number_of_strs', 'description', 'version', 'version_created', 'pathogenic_repeats', 'penetrance', 'phenotypes', 'publications', 'repeated_sequence'] #pd_38_table = pd_38_table.sort_values(by=['chrom','chromStart'], ascending = (True, True)) #pd_38_table.to_csv('hg38_str_noheader_sorted.tsv', sep='\t', index=False, header=None) return pd_19_table, pd_38_table def requests_get_with_retry(url, max_retries=10, retry_delay=60): """ Make a GET request with retry logic for handling API throttling. Returns the parsed JSON data. """ headers = { 'User-Agent': 'UCSC-Genome-Browser-PanelApp/1.0 (contact: genome-www@soe.ucsc.edu)' } last_status_code = None last_error = None for attempt in range(max_retries): logging.debug("Getting %s (attempt %d/%d)" % (url, attempt + 1, max_retries)) try: myResponse = requests.get(url, headers=headers) last_status_code = myResponse.status_code if myResponse.ok: # Check if response is JSON content_type = myResponse.headers.get('Content-Type', '') if 'application/json' in content_type: try: data = myResponse.json() return data except json.JSONDecodeError: logging.warning(f"Failed to decode JSON on attempt {attempt + 1}/{max_retries}") else: logging.warning(f"Non-JSON response received on attempt {attempt + 1}/{max_retries}. Content-Type: {content_type}") except RequestException as e: # Catch all request-related exceptions (ConnectionError, Timeout, etc.) last_error = str(e) # Wait before retrying (except on the last attempt) if attempt < max_retries - 1: time.sleep(retry_delay) # If all retries failed error_msg = f"Failed to get valid JSON response after {max_retries} attempts for URL: {url}" if last_status_code: error_msg += f", last status code: {last_status_code}" if last_error: error_msg += f", last error: {last_error}" raise Exception(error_msg) def getPanelIds(url): #logging.basicConfig(level=logging.DEBUG) logging.basicConfig(level=logging.INFO) logging.getLogger("urllib3").propagate = False logging.info("Downloading panel IDs") panelIds = [] gotError = False while not gotError: data = requests_get_with_retry(url) for res in data["results"]: panelIds.append(res["id"]) logging.debug("Total Panel IDs downloaded: %s" % len(panelIds)) url = data["next"] if url is None: break return panelIds def downloadPanels(url): panelIds = getPanelIds(url) panelInfos = {} for panelId in panelIds: if 'england' in url: panelUrl = "https://panelapp.genomicsengland.co.uk/api/v1/panels/%d?format=json" % panelId elif 'aus' in url: panelUrl = "https://panelapp-aus.org/api/v1/panels/%d/?format=json" % panelId res = requests_get_with_retry(panelUrl) panelInfos[panelId] = res return panelInfos def getGeneSymbols(url): try: panelInfos = downloadPanels(url) except requests.exceptions.JSONDecodeError: time.sleep(30) panelInfos = downloadPanels(url) syms = set() for panelInfo in panelInfos.values(): for gene in panelInfo["genes"]: sym = gene["gene_data"]["gene_symbol"] syms.add(sym) assert(sym!="") logging.info("Got %d gene symbols" % len(syms)) return list(syms) def getGenesLocations(jsonFh,url,onlyPanels): hg19_dict = dict() hg38_dict = dict() repeat19 = list() repeat38 = list() continuous_count = 0 genes_missing_info = list() genes_no_location = list() syms = getGeneSymbols(url) for sym in syms: if 'england' in url: entityUrl = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?entity_name={}&format=json".format(sym) elif 'aus' in url: entityUrl = "https://panelapp-aus.org/api/v1/genes/?entity_name={}&format=json".format(sym) jData = requests_get_with_retry(entityUrl) # Write to file (converting back to JSON bytes for file writing) jsonData = json.dumps(jData).encode() jsonFh.write(jsonData) jsonFh.write("\n".encode()) res = jData['results'] #filter by onlyPanels early, if specified if onlyPanels is not None: res = [entry for entry in res if entry.get('panel', {}).get('id') in onlyPanels] num_gene_variant = len(res) count = 0 while count != num_gene_variant: temp_attribute_dictionary = dict() string_dict_key = 'gene_{}'.format(continuous_count) gene_range_37 = None gene_range_38 = None try: ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location'] location_37 = ensembl_genes_GRch37_82_location.split(':') chromo_37 = 'chr'+location_37[0] gene_range_37 = location_37[1].split('-') # on hg19, we have added a chrMT sequence later. except: genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19") try: ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location'] location_38 = ensembl_genes_GRch38_90_location.split(':') chromo_38 = 'chr'+location_38[0] # Change mitochondrial chromosomal suffix from MT -> M for hg38 only if chromo_38 == "chrMT": chromo_38 = "chrM" gene_range_38 = location_38[1].split('-') except: genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38") if gene_range_37 is None and gene_range_38 is None: #print("gene without location on any assembly: %s" % res[count]) genes_no_location.append(res[count]['gene_data']) count+=1 continue score = '0' strand = '.' blockCount = '1' blockStarts = '0' #----------------------------------------------------------------------------------------------------------- gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id'] for attribute in gene_data_list: try: temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute] except: temp_attribute_dictionary[attribute] = '' try: temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene']) except: temp_attribute_dictionary['omim_gene'] = '' try: temp_attribute_dictionary['gene_symbol'] = res[count]['gene_data']['gene_symbol'] except: temp_attribute_dictionary['gene_symbol'] = '' #----------------------------------------------------------------------------------------------------------- # Need to split HGNC ID try: hgnc = res[count]['gene_data']['hgnc_id'] temp_attribute_dictionary['hgnc_id'] = hgnc.split(':')[1] except: temp_attribute_dictionary['hgnc_id'] = '' #----------------------------------------------------------------------------------------------------------- # Capitalize(title) gene name try: gene_name = res[count]['gene_data']['gene_name'].title() temp_attribute_dictionary['gene_name'] = gene_name except: temp_attribute_dictionary['gene_name'] = '' #----------------------------------------------------------------------------------------------------------- # Biotype change protein_coding to Protein Coding try: biotype = res[count]['gene_data']['biotype'] if biotype == 'protein_coding': biotype = 'Protein Coding' temp_attribute_dictionary['biotype'] = biotype if biotype == None: temp_attribute_dictionary['biotype'] = '' except: temp_attribute_dictionary['biotype'] = '' #----------------------------------------------------------------------------------------------------------- try: ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id'] ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id'] except: ensembl_genes_GRch37_82_ensembl_id = '' #----------------------------------------------------------------------------------------------------------- gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags'] for attribute in gene_type_list: try: x = res[count][attribute] if not x: temp_attribute_dictionary[attribute] = '' else: pre = ' '.join(res[count][attribute]) temp_attribute_dictionary[attribute] = pre.replace('\t', ' ') except: temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- # Cannot exceed 255 characters # Cannot have tabs try: x = res[count]['phenotypes'] y = ' '.join(res[count]['phenotypes']) if not x: temp_attribute_dictionary['phenotypes'] = '' else: temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ") except: temp_attribute_dictionary['phenotypes'] = '' #----------------------------------------------------------------------------------------------------------- # Evidence cannot exceed 255 characters try: x = res[count]['evidence'] y = ' '.join(res[count]['evidence']) if not x: temp_attribute_dictionary['evidence'] = '' else: temp_attribute_dictionary['evidence'] = y except: temp_attribute_dictionary['evidence'] = '' #----------------------------------------------------------------------------------------------------------- tags = ' '.join(res[count]['tags']).title() try: if not tags: temp_attribute_dictionary['tags'] = '' else: temp_attribute_dictionary['tags'] = tags except: temp_attribute_dictionary['tags'] = '' #----------------------------------------------------------------------------------------------------------- # Mode of Inheritance (fix format) MOI = ' '.join(res[count]['mode_of_inheritance']).replace(" ", "???").replace(" ", "").replace("???", " ") try: if not MOI: temp_attribute_dictionary['mode_of_inheritance'] = '' else: temp_attribute_dictionary['mode_of_inheritance'] = MOI except: temp_attribute_dictionary['mode_of_inheritance'] = '' #----------------------------------------------------------------------------------------------------------- # For values with spaces gene_type_list = ['entity_name', 'penetrance'] for attribute in gene_type_list: try: temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "") except: temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- # For values with spaces and need to be capitalized attribute = 'entity_type' try: temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize() except: temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- attribute = 'mode_of_pathogenicity' try: mode = ' '.join(res[count][attribute]).replace(" ", "???").replace(" ", "").replace("???", " ") if mode[0] == 'L' or mode[0] == 'l': temp_attribute_dictionary[attribute] = 'Loss-of-function variants' elif mode[0] == 'G' or mode[0] == 'g': temp_attribute_dictionary[attribute] = 'Gain-of-function' elif mode[0] == 'O' or mode[0] == 'o': temp_attribute_dictionary[attribute] = 'Other' else: temp_attribute_dictionary[attribute] = mode except: temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- panel_list = ['id','name', 'disease_group', 'disease_sub_group', 'status', 'version_created'] for attribute in panel_list: try: x = res[count]['panel'][attribute] if not x: temp_attribute_dictionary[attribute] = '' else: temp_attribute_dictionary[attribute] = x except: temp_attribute_dictionary[attribute] = '' #----------------------------------------------------------------------------------------------------------- version_num = 0.0 try: version_num = float(res[count]['panel']['version']) temp_attribute_dictionary['version'] = version_num except: temp_attribute_dictionary['version'] = '' #----------------------------------------------------------------------------------------------------------- try: x = res[count]['panel']['relevant_disorders'] y = ' '.join(res[count]['panel']['relevant_disorders']) if not x: temp_attribute_dictionary['relevant_disorders'] = '' else: temp_attribute_dictionary['relevant_disorders'] = y except: temp_attribute_dictionary['relevant_disorders'] = '' #----------------------------------------------------------------------------------------------------------- # minimal effort to clean up the publication field, which is a mess of free form text pubs = res[count]['publications'] newPubs = [] for pub in pubs: pub = pub.replace("\n", "") # replace commas with html commas as unfortunately I use commasin the browser to split fields pub = pub.replace(",", ",") # translate unicode chars to something the genome browser can display pub = pub.encode('ascii', 'xmlcharrefreplace').decode("ascii") if re.match("^[0-9]+$", pub): #pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub) pub = "PMID"+pub newPubs.append(pub) temp_attribute_dictionary['publications'] = ", ".join(newPubs) #----------------------------------------------------------------------------------------------------------- # MouseOverField try: mof = 'Gene: ' + temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';' temp_attribute_dictionary['mouseOverField'] = mof except: temp_attribute_dictionary['mouseOverField'] = '' #----------------------------------------------------------------------------------------------------------- # Column 4 temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')' #----------------------------------------------------------------------------------------------------------- #----------------------------------------------------------------------------------------------------------- rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'} # If the confidence level is set to 0, set to 1 if temp_attribute_dictionary['confidence_level'] == '0': temp_attribute_dictionary['confidence_level'] = '1' rgb = rgb_dict[temp_attribute_dictionary['confidence_level']] rgb = rgb.strip('"') ''' Replace all tab in value with spaces and removes new lines ''' for key, item in temp_attribute_dictionary.items(): try: if isinstance(item, int): pass elif isinstance(item, float): pass else: temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r") except: pass if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None: # Removes Repeats repeat19.append(temp_attribute_dictionary['label']) blockSizes = int(gene_range_37[1]) - int(gene_range_37[0]) hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None: # Remove Repeats repeat38.append(temp_attribute_dictionary['label']) blockSizes = int(gene_range_38[1]) - int(gene_range_38[0]) hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] count = count + 1 continuous_count = continuous_count + 1 print('Genes with missing coordinates in one assembly (written to missing_genes.txt):') print(genes_missing_info) print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):') missSyms = [] for miss in genes_no_location: missSyms.append(miss["gene_symbol"]) print(",".join(missSyms)) missOfh = open("missing_genes.txt", "w") missOfh.write("* Not found in one assembly:\n") missOfh.write("\n".join(genes_missing_info)) missOfh.write("* No location at all:\n") for miss in genes_no_location: missOfh.write("\t"+str(miss)) missOfh.write("\n") missOfh.close() return(hg19_dict, hg38_dict) def downloadGenes(url, onlyPanels=None): jsonFh = gzip.open("currentJson/genes.json.gz", "w") hg19_dict, hg38_dict = getGenesLocations(jsonFh,url, onlyPanels) jsonFh.close() pd_19_table = pd.DataFrame.from_dict(hg19_dict) pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_19_table = pd_19_table.T pd_38_table = pd_38_table.T pd_19_table.columns = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID", "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level", "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"] pd_38_table.columns = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb", "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID", "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level", "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"] return pd_19_table, pd_38_table def main(): " create the 2 x three BED files and convert each to bigBed and update the archive " # the script uses relative pathnames, so make sure we're always in the right directory os.chdir("/hive/data/outside/otto/panelApp") # First update panelApp England # Gene panels track hg19Bed, hg38Bed = downloadGenes("https://panelapp.genomicsengland.co.uk/api/v1/panels/?format=json") writeBb(hg19Bed, hg38Bed, "genes") # STRs track hg19Bed, hg38Bed = downloadTandReps("https://panelapp.genomicsengland.co.uk/api/v1/strs/?format=json") writeBb(hg19Bed, hg38Bed, "tandRep") # CNV track hg38Bed = downloadCnvs('https://panelapp.genomicsengland.co.uk/api/v1/regions/?format=json') # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025 writeBb(None, hg38Bed, "cnv") flipFiles('England') updateGbdbSymlinks('England') # Now update panelApp Australia # Genes track hg19Bed, hg38Bed = downloadGenes("https://panelapp-aus.org/api/v1/panels/?format=json") writeBb(hg19Bed, hg38Bed, "genesAus") # STRs track hg19Bed, hg38Bed = downloadTandReps("https://panelapp-aus.org/api/v1/strs/?format=json") writeBb(hg19Bed, hg38Bed, "tandRepAus") # CNVs track hg38Bed = downloadCnvs('https://panelapp-aus.org/api/v1/regions/?format=json') # no hg19 CNV data yet from PanelApp - still true as of 5/20/2025 writeBb(None, hg38Bed, "cnvAus") flipFiles('Australia') updateGbdbSymlinks('Australia') + # panelApp is a compositeTrack whose container page shows no "Data last updated"; + # write a dataVersion file (the build/run date) so the container shows a freshness date. + dateStr = date.today().strftime("%Y-%m-%d") + for db in ["hg19", "hg38"]: + with open("current/%s/version.txt" % db, "w") as vf: + vf.write("Last updated %s\n" % dateStr) + print("PanelApp otto update: OK") main()