src/hg/utils/otto/panelApp/genes.py 88d86eb7a11bad9d37c6908d3ab5e5c2e1fc1273

88d86eb7a11bad9d37c6908d3ab5e5c2e1fc1273
max
  Mon Oct 3 03:39:03 2022 -0700
updating panelApp otto, refs #25568

diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py
index 6a70f44..b1be4ec 100755
--- src/hg/utils/otto/panelApp/genes.py
+++ src/hg/utils/otto/panelApp/genes.py
@@ -11,86 +11,88 @@
 download panelApp data via its API (somewhat slow)
 '''
 
 # originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py
 # Written by a project student, Beagan, in 2020/2021
 
 def getGenesLocations(jsonFh):
     page_count = 1
     Error = True
     hg19_dict = dict()
     hg38_dict = dict()
     repeat19 = list()
     repeat38 = list()
     continuous_count = 0
     genes_missing_info = list()
+    genes_no_location = list()
 
     while Error: 
         url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count)
         myResponse = requests.get(url)
 
         if (myResponse.ok):
             jsonData = myResponse.content
 
             jsonFh.write(jsonData)
             jsonFh.write("\n".encode())
 
             jData = json.loads(jsonData.decode())
 
             if "error" in jData.keys():
                 raise Exception("{} page count is missing.".format(page_count))
             
             res = jData['results']
             num_gene_variant = len(res)
             count = 0
             while count != num_gene_variant:
                 temp_attribute_dictionary = dict()
                 string_dict_key = 'gene_{}'.format(continuous_count)
 
+                gene_range_37 = None
+                gene_range_38 = None
+
                 try:
                     ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
+                    location_37 = ensembl_genes_GRch37_82_location.split(':')
+                    chromo_37 = 'chr'+location_37[0]
+                    gene_range_37 = location_37[1].split('-')
+                    # on hg19, we have added a chrMT sequence later.
                 except:
-                    print(count)
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19")
-                    count = count + 1
 
                 try:
                     ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location']
+                    location_38 = ensembl_genes_GRch38_90_location.split(':')
+                    chromo_38 = 'chr'+location_38[0]
+                    # hg38 only: rename chrMT -> chrM. Compare the name *with* its 'chr' prefix, a bare "MT" can never match here
+                    if chromo_38 == "chrMT":
+                        chromo_38 = "chrM"
+                    gene_range_38 = location_38[1].split('-')
                 except:
-                    print(count)
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38")
-                    count = count + 1
-
-                location_37 = ensembl_genes_GRch37_82_location.split(':')
-                chromo_37 = 'chr'+location_37[0]
-                gene_range_37 = location_37[1].split('-')
 
-                location_38 = ensembl_genes_GRch38_90_location.split(':')
+                if gene_range_37 is None and gene_range_38 is None:
+                    print("gene without location on any assembly: %s" % res[count])
+                    genes_no_location.append(res[count]['gene_data'])
+                    count+=1
+                    continue
 
-                # Change mitochondrial chromosomal suffix from MT -> M, fetchrom recognize only chrM
-                chr_num = location_38[0]
 
-                if chr_num == "MT":
-                    chr_num = "M"
-                chromo_38 = 'chr'+chr_num
-
-                gene_range_38 = location_38[1].split('-')
 
                 score = '0'
                 strand = '.'
                 blockCount = '1'
-                blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
                 blockStarts = '0'
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id']
                 for attribute in gene_data_list:
                     try:
                         temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute]
                     except:
                         temp_attribute_dictionary[attribute] = ''
 
                 try:
                     temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene'])
                 except:
                     temp_attribute_dictionary['omim_gene'] = ''
@@ -118,31 +120,30 @@
                     temp_attribute_dictionary['gene_name'] = ''
                 #-----------------------------------------------------------------------------------------------------------
                 # Biotype change protein_coding to Protein Coding
 
                 try:
                     biotype = res[count]['gene_data']['biotype']
 
                     if biotype == 'protein_coding':
                         biotype = 'Protein Coding'
                     
                     temp_attribute_dictionary['biotype'] = biotype
                     if biotype == None:
                         temp_attribute_dictionary['biotype'] = ''
                 except:
                     temp_attribute_dictionary['biotype'] = ''
-                    print(res[count])
 
                 #-----------------------------------------------------------------------------------------------------------    
 
                 try:
                     ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id']
                     ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id']
 
                 except:
                     ensembl_genes_GRch37_82_ensembl_id = ''
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags']
 
                 for attribute in gene_type_list:
@@ -286,31 +287,31 @@
                 else:
                     if re.match("^[0-9 ]+$", publications):
                         temp_attribute_dictionary['publications'] = publications.replace(' ', ', ')
                     else:
                         temp_attribute_dictionary['publications'] = publications
 
                 # Remove new lines
                 temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "")
 
                 # make everything a URL, as we have not only PMIDs in here
                 # convert numbers to Pubmed URLs
                 pubs = temp_attribute_dictionary['publications'].split(", ")
                 pubUrls = []
                 for pub in pubs:
                     if re.match("^[0-9 ]+$", pub):
-                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub)
+                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub)
                     else:
                         pubUrls.append(pub)
 
                 temp_attribute_dictionary['publications'] = ", ".join(pubUrls)
 
                 #-----------------------------------------------------------------------------------------------------------
                 # MouseOverField
                 try:
                     mof = 'Gene: ' +  temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';'
                     temp_attribute_dictionary['mouseOverField'] = mof
                 except:
                     temp_attribute_dictionary['mouseOverField'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
                 # Column 4
@@ -333,66 +334,76 @@
                 for key, item in temp_attribute_dictionary.items():
                     try:
                         if isinstance(item, int):
                             pass
                         elif isinstance(item, float):
                             pass
                         else:
                             temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r")
                     except:
                         pass
 
                 # Version Threshold = 0.99
                 max_num = float(0.99)
                 
                 if version_num > max_num: 
-                    if temp_attribute_dictionary['label'] not in repeat19:    # Removes Repeats
+                    if temp_attribute_dictionary['label'] not in repeat19 and gene_range_37 is not None:    # Removes Repeats
                         repeat19.append(temp_attribute_dictionary['label'])
+                        blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
                         hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], 
                                                 score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, 
                                                 temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                                 temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                                 temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                                 temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                                 temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                                 temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                                 temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                                 temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
                     
-                    if temp_attribute_dictionary['label'] not in repeat38:    # Remove Repeats
+                    if temp_attribute_dictionary['label'] not in repeat38 and gene_range_38 is not None:    # Remove Repeats
                         repeat38.append(temp_attribute_dictionary['label'])
+                        blockSizes = int(gene_range_38[1]) - int(gene_range_38[0])
                         hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], 
                                                 score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, 
                                                 temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
                                                 temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
                                                 temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
                                                 temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
                                                 temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                                 temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                                 temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                                 temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
                 count = count + 1
                 continuous_count = continuous_count + 1
     
         else:
             Error = False        # End of all pages
 
         page_count = page_count + 1
-        print(page_count)
-    print('Genes with missing coordinates (written to missing_genes.txt):')
+
+    print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
     print(genes_missing_info)
-    open("missing_genes.txt", "w").write("\n".join(genes_missing_info))
+
+    # Two sections, one per failure mode; every entry ends with a newline so a section header never runs into the previous entry
+    with open("missing_genes.txt", "w") as missOfh:
+        missOfh.write("* Not found in one assembly:\n")
+        missOfh.write("\n".join(genes_missing_info)+"\n")
+        missOfh.write("* No location at all:\n")
+        for miss in genes_no_location:
+            missOfh.write("\t"+str(miss)+"\n")
+
     return(hg19_dict, hg38_dict)
 
 def downloadGenes():
     jsonFh = gzip.open("currentJson/genes.json.gz", "w")
 
     hg19_dict, hg38_dict = getGenesLocations(jsonFh)
 
     jsonFh.close()
     
     pd_19_table = pd.DataFrame.from_dict(hg19_dict)
     pd_38_table = pd.DataFrame.from_dict(hg38_dict)
     pd_19_table = pd_19_table.T
     pd_38_table = pd_38_table.T
     pd_19_table.columns = ["chrom", "chromStart", 
         "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",