src/hg/utils/otto/panelApp/genes.py 88d86eb7a11bad9d37c6908d3ab5e5c2e1fc1273

88d86eb7a11bad9d37c6908d3ab5e5c2e1fc1273
max
  Mon Oct 3 03:39:03 2022 -0700
updating panelApp otto, refs #25568

diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py
index 6a70f44..b1be4ec 100755
--- src/hg/utils/otto/panelApp/genes.py
+++ src/hg/utils/otto/panelApp/genes.py
@@ -1,425 +1,436 @@
 import os
 import requests
 import json 
 import pandas as pd 
 import sys 
 import argparse
 import re
 import gzip
 
 '''
 download panelApp data via its API (somewhat slow)
 '''
 
 # originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py
 # Written by a project student, Beagan, in 2020/2021
 
 def getGenesLocations(jsonFh):
+    """Download every page of the PanelApp genes API and build per-assembly BED rows.
+
+    Pages are requested one after another until a request comes back with a
+    non-OK status. The raw bytes of each page, followed by a newline, are
+    written to jsonFh.
+
+    An entry is kept only if its panel version is greater than 0.99 and its
+    label ("symbol (panel name)") was not seen before for that assembly.
+    Entries without a location on either assembly are skipped entirely;
+    entries located on only one assembly produce a row for that assembly only.
+
+    Side effect: writes missing_genes.txt into the current directory.
+
+    Args:
+        jsonFh: binary-mode writable file handle that receives the raw pages.
+
+    Returns:
+        (hg19_dict, hg38_dict): each maps 'gene_<n>' to a list of 37 values,
+        the twelve BED fields followed by the PanelApp annotation fields.
+
+    Raises:
+        Exception: if a page parses as JSON containing an "error" key.
+    """
     page_count = 1
     Error = True
     hg19_dict = dict()
     hg38_dict = dict()
     repeat19 = list()
     repeat38 = list()
     continuous_count = 0
     genes_missing_info = list()
+    genes_no_location = list()
 
     while Error: 
         url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count)
         myResponse = requests.get(url)
 
         if (myResponse.ok):
             jsonData = myResponse.content
 
             jsonFh.write(jsonData)
             jsonFh.write("\n".encode())
 
             jData = json.loads(jsonData.decode())
 
             if "error" in jData.keys():
                 raise Exception("{} page count is missing.".format(page_count))
             
             res = jData['results']
             num_gene_variant = len(res)
             count = 0
             while count != num_gene_variant:
                 temp_attribute_dictionary = dict()
                 string_dict_key = 'gene_{}'.format(continuous_count)
 
+                gene_range_37 = None
+                gene_range_38 = None
+
                 try:
                     ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
+                    location_37 = ensembl_genes_GRch37_82_location.split(':')
+                    chromo_37 = 'chr'+location_37[0]
+                    gene_range_37 = location_37[1].split('-')
+                    # on hg19, we have added a chrMT sequence later.
                 except:
-                    print(count)
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19")
-                    count = count + 1
 
                 try:
                     ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location']
+                    location_38 = ensembl_genes_GRch38_90_location.split(':')
+                    chromo_38 = 'chr'+location_38[0]
+                    # Change mitochondrial chromosomal suffix from MT -> M for hg38 only
+                    # ('chr' is already prepended here, so the name to test for is "chrMT")
+                    if chromo_38 == "chrMT":
+                        chromo_38 = "chrM"
+                    gene_range_38 = location_38[1].split('-')
                 except:
-                    print(count)
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38")
-                    count = count + 1
-
-                location_37 = ensembl_genes_GRch37_82_location.split(':')
-                chromo_37 = 'chr'+location_37[0]
-                gene_range_37 = location_37[1].split('-')
 
-                location_38 = ensembl_genes_GRch38_90_location.split(':')
+                if gene_range_37 is None and gene_range_38 is None:
+                    print("gene without location on any assembly: %s" % res[count])
+                    genes_no_location.append(res[count]['gene_data'])
+                    count+=1
+                    continue
 
-                # Change mitochondrial chromosomal suffix from MT -> M, fetchrom recognize only chrM
-                chr_num = location_38[0]
 
-                if chr_num == "MT":
-                    chr_num = "M"
-                chromo_38 = 'chr'+chr_num
-
-                gene_range_38 = location_38[1].split('-')
 
                 score = '0'
                 strand = '.'
                 blockCount = '1'
-                blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
                 blockStarts = '0'
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id']
                 for attribute in gene_data_list:
                     try:
                         temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute]
                     except:
                         temp_attribute_dictionary[attribute] = ''
 
                 try:
                     temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene'])
                 except:
                     temp_attribute_dictionary['omim_gene'] = ''
 
                 try: 
                     temp_attribute_dictionary['gene_symbol'] = res[count]['gene_data']['gene_symbol']
                 except:
                     temp_attribute_dictionary['gene_symbol'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # Need to split HGNC ID
                 try:
                     hgnc = res[count]['gene_data']['hgnc_id']
                     temp_attribute_dictionary['hgnc_id'] = hgnc.split(':')[1]
                 except:
                     temp_attribute_dictionary['hgnc_id'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # Capitalize(title) gene name
 
                 try:
                     gene_name = res[count]['gene_data']['gene_name'].title()
                     temp_attribute_dictionary['gene_name'] = gene_name
                 except:
                     temp_attribute_dictionary['gene_name'] = ''
                 #-----------------------------------------------------------------------------------------------------------
                 # Biotype change protein_coding to Protein Coding
 
                 try:
                     biotype = res[count]['gene_data']['biotype']
 
                     if biotype == 'protein_coding':
                         biotype = 'Protein Coding'
                     
                     temp_attribute_dictionary['biotype'] = biotype
                     if biotype == None:
                         temp_attribute_dictionary['biotype'] = ''
                 except:
                     temp_attribute_dictionary['biotype'] = ''
-                    print(res[count])
 
                 #-----------------------------------------------------------------------------------------------------------    
 
-                try:
-                    ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id']
-                    ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id']
-
-                except:
-                    ensembl_genes_GRch37_82_ensembl_id = ''
+                # Look the two IDs up independently: a gene that is missing on one
+                # assembly must not leave the other assembly's ID unset or carrying
+                # the value of the previously processed gene.
+                try:
+                    ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id']
+                except (LookupError, TypeError):
+                    ensembl_genes_GRch37_82_ensembl_id = ''
+
+                try:
+                    ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id']
+                except (LookupError, TypeError):
+                    ensembl_genes_GRch38_90_ensembl_id = ''
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags']
 
                 for attribute in gene_type_list:
                     try:
                         x = res[count][attribute]
                         if not x:
                             temp_attribute_dictionary[attribute] = ''    
                         else:
                             pre = ' '.join(res[count][attribute])
                             temp_attribute_dictionary[attribute] = pre.replace('\t', ' ')
                     except:
                         temp_attribute_dictionary[attribute] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # Cannot exceed 255 characters
                 # Cannot have tabs
                 try:
                     x = res[count]['phenotypes']
                     y = ' '.join(res[count]['phenotypes'])
 
                     if not x:
                         temp_attribute_dictionary['phenotypes'] = ''
                     else:
                         temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ")
                 except:
                     temp_attribute_dictionary['phenotypes'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
                 # Evidence cannot exceed 255 characters
                 try:
                     x = res[count]['evidence']
                     y = ' '.join(res[count]['evidence'])
                     if not x:
                         temp_attribute_dictionary['evidence'] = ''
                     else:
                         temp_attribute_dictionary['evidence'] = y
                 except:
                     temp_attribute_dictionary['evidence'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 
                 tags = ' '.join(res[count]['tags']).title()
                 try:
                     if not tags:
                         temp_attribute_dictionary['tags'] = ''
                     else:
                         temp_attribute_dictionary['tags'] = tags
                 except:
                     temp_attribute_dictionary['tags'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # Mode of Inheritance (fix format)
                 
                 MOI = ' '.join(res[count]['mode_of_inheritance']).replace("  ", "???").replace(" ", "").replace("???", " ")
                 try:
                     if not MOI:
                         temp_attribute_dictionary['mode_of_inheritance'] = ''
                     else:
                         temp_attribute_dictionary['mode_of_inheritance'] = MOI
                 except:
                     temp_attribute_dictionary['mode_of_inheritance'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # For values with spaces 
 
                 gene_type_list = ['entity_name', 'penetrance']
 
                 for attribute in gene_type_list:
                     try:
                         temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "")
                     except:
                         temp_attribute_dictionary[attribute] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 # For values with spaces and need to be capitalized
                 attribute = 'entity_type'
 
                 try:
                     temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize()
                 except:
                     temp_attribute_dictionary[attribute] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 attribute = 'mode_of_pathogenicity'
 
                 try:
                     mode = ' '.join(res[count][attribute]).replace("  ", "???").replace(" ", "").replace("???", " ")
                     if mode[0] == 'L' or mode[0] == 'l':
                         temp_attribute_dictionary[attribute] = 'Loss-of-function variants'
                     elif mode[0] == 'G' or mode[0] == 'g':
                         temp_attribute_dictionary[attribute] = 'Gain-of-function'
                     elif mode[0] == 'O' or mode[0] == 'o':
                         temp_attribute_dictionary[attribute] = 'Other'
                     else:
                         temp_attribute_dictionary[attribute] = mode
                 except:
                     temp_attribute_dictionary[attribute] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 panel_list = ['id','name', 'disease_group', 'disease_sub_group', 'status', 'version_created']
 
                 for attribute in panel_list:
                     try:
                         x = res[count]['panel'][attribute]
                         if not x:
                             temp_attribute_dictionary[attribute] = ''
                         else:
                             temp_attribute_dictionary[attribute] = res[count]['panel'][attribute]
                     except:
                         temp_attribute_dictionary[attribute] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 
                 version_num = 0.0
                 try:
                     version_num = float(res[count]['panel']['version'])
                     temp_attribute_dictionary['version'] = version_num
                 except:
                     temp_attribute_dictionary['version'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 
                 try:
                     x = res[count]['panel']['relevant_disorders']
                     y = ' '.join(res[count]['panel']['relevant_disorders'])
                     if not x:
                         temp_attribute_dictionary['relevant_disorders'] = ''
                     else:
                         temp_attribute_dictionary['relevant_disorders'] = y
                 except:
                     temp_attribute_dictionary['relevant_disorders'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
                 # Add comma separated to list of pub id
 
                 publications = ' '.join(res[count]['publications'])
 
                 if not publications:
                     temp_attribute_dictionary['publications'] = ''
                 else:
                     if re.match("^[0-9 ]+$", publications):
                         temp_attribute_dictionary['publications'] = publications.replace(' ', ', ')
                     else:
                         temp_attribute_dictionary['publications'] = publications
 
                 # Remove new lines
                 temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "")
 
                 # make everything a URL, as we have not only PMIDs in here
                 # convert numbers to Pubmed URLs
                 pubs = temp_attribute_dictionary['publications'].split(", ")
                 pubUrls = []
                 for pub in pubs:
                     if re.match("^[0-9 ]+$", pub):
-                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub)
+                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub)
                     else:
                         pubUrls.append(pub)
 
                 temp_attribute_dictionary['publications'] = ", ".join(pubUrls)
 
                 #-----------------------------------------------------------------------------------------------------------
                 # MouseOverField
                 try:
                     mof = 'Gene: ' +  temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';'
                     temp_attribute_dictionary['mouseOverField'] = mof
                 except:
                     temp_attribute_dictionary['mouseOverField'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
                 # Column 4
                 temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')'
                 #-----------------------------------------------------------------------------------------------------------
 
                 #-----------------------------------------------------------------------------------------------------------
                 rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'}
 
                 # If the confidence level is set to 0, set to 1
                 if temp_attribute_dictionary['confidence_level'] == '0':
                     temp_attribute_dictionary['confidence_level'] = '1'
 
                 rgb = rgb_dict[temp_attribute_dictionary['confidence_level']]
                 rgb = rgb.strip('"')
 
                 '''
                 Replace all tab in value with spaces and removes new lines
                 '''
                 for key, item in temp_attribute_dictionary.items():
                     try:
                         if isinstance(item, int):
                             pass
                         elif isinstance(item, float):
                             pass
                         else:
                             temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r")
                     except:
                         pass
 
                 # Version Threshold = 0.99
                 max_num = float(0.99)
                 
                 if version_num > max_num: 
-                    if temp_attribute_dictionary['label'] not in repeat19:    # Removes Repeats
+                    if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None:    # Removes Repeats
                         repeat19.append(temp_attribute_dictionary['label'])
+                        blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
+                        # NOTE(review): the hg19 row stores the GRch38 Ensembl ID; the GRch37 ID
+                        # fetched above is not used anywhere in this function - confirm this is intended.
                         hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], 
                                                 score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, 
                                                 temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                                 temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                                 temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                                 temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                                 temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                                 temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                                 temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                                 temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
                     
-                    if temp_attribute_dictionary['label'] not in repeat38:    # Remove Repeats
+                    if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None:    # Remove Repeats
                         repeat38.append(temp_attribute_dictionary['label'])
+                        blockSizes = int(gene_range_38[1]) - int(gene_range_38[0])
                         hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], 
                                                 score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, 
                                                 temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                                 temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                                 temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                                 temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                                 temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                                 temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                                 temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                                 temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
                 count = count + 1
                 continuous_count = continuous_count + 1
     
         else:
             Error = False        # End of all pages
 
         page_count = page_count + 1
-        print(page_count)
-    print('Genes with missing coordinates (written to missing_genes.txt):')
+
+    print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
     print(genes_missing_info)
-    open("missing_genes.txt", "w").write("\n".join(genes_missing_info))
+
+    # One record per line, so the second section header never ends up glued
+    # to the last gene of the first section; 'with' closes the file on error too.
+    with open("missing_genes.txt", "w") as missOfh:
+        missOfh.write("* Not found in one assembly:\n")
+        for miss in genes_missing_info:
+            missOfh.write(miss+"\n")
+        missOfh.write("* No location at all:\n")
+        for miss in genes_no_location:
+            missOfh.write("\t"+str(miss)+"\n")
+
     return(hg19_dict, hg38_dict)
 
 def downloadGenes():
-    jsonFh = gzip.open("currentJson/genes.json.gz", "w")
-
-    hg19_dict, hg38_dict = getGenesLocations(jsonFh)
-
-    jsonFh.close()
+    """Download the PanelApp genes and return them as two pandas tables.
+
+    The raw API pages are archived to currentJson/genes.json.gz while
+    getGenesLocations() builds the rows.
+
+    Returns:
+        (pd_19_table, pd_38_table): one DataFrame per assembly (hg19, hg38),
+        one row per kept gene entry, with the 37 named columns set below.
+    """
+    # 'with' closes the gzip handle even if the download raises part-way
+    # (HTTP or JSON error), so the archive is not left open.
+    with gzip.open("currentJson/genes.json.gz", "w") as jsonFh:
+        hg19_dict, hg38_dict = getGenesLocations(jsonFh)
     
     pd_19_table = pd.DataFrame.from_dict(hg19_dict)
     pd_38_table = pd.DataFrame.from_dict(hg38_dict)
     pd_19_table = pd_19_table.T
     pd_38_table = pd_38_table.T
     pd_19_table.columns = ["chrom", "chromStart", 
         "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",
         "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID",
         "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level",
         "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", 
         "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", 
         "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"]
     pd_38_table.columns = ["chrom", "chromStart", 
         "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",
         "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID",
         "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level",
         "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", 
         "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", 
         "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"]
 
     return pd_19_table, pd_38_table
     
     #pd_19_table.to_csv('hg19_header.tsv', sep='\t', index=False)
     #pd_38_table.to_csv('hg38_header.tsv', sep='\t', index=False)
 
     #pd_19_table.to_csv('hg19_noheadertem.tsv', sep='\t', index=False, header=None) 
     #pd_38_table.to_csv('hg38_noheader.tsv', sep='\t', index=False, header=None) 
 
     #/usr/local/apache/htdocs-hgdownload/goldenPath/archive/hg38/panelApp/
 
 #if __name__ == "__main__":
     #main()