88d86eb7a11bad9d37c6908d3ab5e5c2e1fc1273 max Mon Oct 3 03:39:03 2022 -0700 updating panelApp otto, refs #25568 diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py index 6a70f44..b1be4ec 100755 --- src/hg/utils/otto/panelApp/genes.py +++ src/hg/utils/otto/panelApp/genes.py @@ -11,86 +11,88 @@ download panelApp data via its API (somewhat slow) ''' # originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py # Written by a project student, Beagan, in 2020/2021 def getGenesLocations(jsonFh): page_count = 1 Error = True hg19_dict = dict() hg38_dict = dict() repeat19 = list() repeat38 = list() continuous_count = 0 genes_missing_info = list() + genes_no_location = list() while Error: url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count) myResponse = requests.get(url) if (myResponse.ok): jsonData = myResponse.content jsonFh.write(jsonData) jsonFh.write("\n".encode()) jData = json.loads(jsonData.decode()) if "error" in jData.keys(): raise Exception("{} page count is missing.".format(page_count)) res = jData['results'] num_gene_variant = len(res) count = 0 while count != num_gene_variant: temp_attribute_dictionary = dict() string_dict_key = 'gene_{}'.format(continuous_count) + gene_range_37 = None + gene_range_38 = None + try: ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location'] + location_37 = ensembl_genes_GRch37_82_location.split(':') + chromo_37 = 'chr'+location_37[0] + gene_range_37 = location_37[1].split('-') + # on hg19, we have added a chrMT sequence later. except: - print(count) genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19") - count = count + 1 try: ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location'] + location_38 = ensembl_genes_GRch38_90_location.split(':') + chromo_38 = 'chr'+location_38[0] + # Change mitochondrial chromosomal suffix from MT -> M for hg38 only + if chromo_38 == "MT": + chromo_38 = "chrM" + gene_range_38 = location_38[1].split('-') except: - print(count) genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38") - count = count + 1 - - location_37 = ensembl_genes_GRch37_82_location.split(':') - chromo_37 = 'chr'+location_37[0] - gene_range_37 = location_37[1].split('-') - location_38 = ensembl_genes_GRch38_90_location.split(':') + if gene_range_37 is None and gene_range_38 is None: + print("gene without location on any assembly: %s" % res[count]) + genes_no_location.append(res[count]['gene_data']) + count+=1 + continue - # Change mitochondrial chromosomal suffix from MT -> M, fetchrom recognize only chrM - chr_num = location_38[0] - if chr_num == "MT": - chr_num = "M" - chromo_38 = 'chr'+chr_num - - gene_range_38 = location_38[1].split('-') score = '0' strand = '.' blockCount = '1' - blockSizes = int(gene_range_37[1]) - int(gene_range_37[0]) blockStarts = '0' #----------------------------------------------------------------------------------------------------------- gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id'] for attribute in gene_data_list: try: temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute] except: temp_attribute_dictionary[attribute] = '' try: temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene']) except: temp_attribute_dictionary['omim_gene'] = '' @@ -118,31 +120,30 @@ temp_attribute_dictionary['gene_name'] = '' #----------------------------------------------------------------------------------------------------------- # Biotype change protein_coding to Protein Coding try: biotype = res[count]['gene_data']['biotype'] if biotype == 'protein_coding': biotype = 'Protein Coding' temp_attribute_dictionary['biotype'] = biotype if biotype == None: temp_attribute_dictionary['biotype'] = '' except: temp_attribute_dictionary['biotype'] = '' - print(res[count]) #----------------------------------------------------------------------------------------------------------- try: ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id'] ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id'] except: ensembl_genes_GRch37_82_ensembl_id = '' #----------------------------------------------------------------------------------------------------------- gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags'] for attribute in gene_type_list: @@ -286,31 +287,31 @@ else: if re.match("^[0-9 ]+$", publications): temp_attribute_dictionary['publications'] = publications.replace(' ', ', ') else: temp_attribute_dictionary['publications'] = publications # Remove new lines temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "") # make everything a URL, as we have not only PMIDs in here # convert numbers to Pubmed URLs pubs = temp_attribute_dictionary['publications'].split(", ") pubUrls = [] for pub in pubs: if re.match("^[0-9 ]+$", pub): - pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub) + pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub) else: pubUrls.append(pub) temp_attribute_dictionary['publications'] = ", ".join(pubUrls) #----------------------------------------------------------------------------------------------------------- # MouseOverField try: mof = 'Gene: ' + temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';' temp_attribute_dictionary['mouseOverField'] = mof except: temp_attribute_dictionary['mouseOverField'] = '' #----------------------------------------------------------------------------------------------------------- # Column 4 @@ -333,66 +334,76 @@ for key, item in temp_attribute_dictionary.items(): try: if isinstance(item, int): pass elif isinstance(item, float): pass else: temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r") except: pass # Version Threshold = 0.99 max_num = float(0.99) if version_num > max_num: - if temp_attribute_dictionary['label'] not in repeat19: # Removes Repeats + if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None: # Removes Repeats repeat19.append(temp_attribute_dictionary['label']) + blockSizes = int(gene_range_37[1]) - int(gene_range_37[0]) hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] - if temp_attribute_dictionary['label'] not in repeat38: # Remove Repeats + if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None: # Remove Repeats repeat38.append(temp_attribute_dictionary['label']) + blockSizes = int(gene_range_38[1]) - int(gene_range_38[0]) hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id, temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'], temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] count = count + 1 continuous_count = continuous_count + 1 else: Error = False # End of all pages page_count = page_count + 1 - print(page_count) - print('Genes with missing coordinates (written to missing_genes.txt):') + + print('Genes with missing coordinates in one assembly (written to missing_genes.txt):') print(genes_missing_info) - open("missing_genes.txt", "w").write("\n".join(genes_missing_info)) + + missOfh = open("missing_genes.txt", "w") + missOfh.write("* Not found in one assembly:\n") + missOfh.write("\n".join(genes_missing_info)) + missOfh.write("* No location at all:\n") + for miss in genes_no_location: + missOfh.write("\t"+str(miss)) + missOfh.close() + return(hg19_dict, hg38_dict) def downloadGenes(): jsonFh = gzip.open("currentJson/genes.json.gz", "w") hg19_dict, hg38_dict = getGenesLocations(jsonFh) jsonFh.close() pd_19_table = pd.DataFrame.from_dict(hg19_dict) pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_19_table = pd_19_table.T pd_38_table = pd_38_table.T pd_19_table.columns = ["chrom", "chromStart", "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",