da377fdc8a33c352274795c5817d8d117e03ae10
max
  Tue Oct 4 03:30:08 2022 -0700
more panelApp fixes, partially found by QA, refs #25568

diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py
index b1be4ec..2e2d2db 100755
--- src/hg/utils/otto/panelApp/genes.py
+++ src/hg/utils/otto/panelApp/genes.py
@@ -52,38 +52,39 @@
 
                 try:
                     ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
                     location_37 = ensembl_genes_GRch37_82_location.split(':')
                     chromo_37 = 'chr'+location_37[0]
                     gene_range_37 = location_37[1].split('-')
                     # on hg19, we have added a chrMT sequence later.
                 except:
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19")
 
                 try:
                     ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location']
                     location_38 = ensembl_genes_GRch38_90_location.split(':')
                     chromo_38 = 'chr'+location_38[0]
                     # Change mitochondrial chromosomal suffix from MT -> M for hg38 only
-                    if chromo_38 == "MT":
+                    if chromo_38 == "chrMT":
                         chromo_38 = "chrM"
+
                     gene_range_38 = location_38[1].split('-')
                 except:
                     genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38")
 
                 if gene_range_37 is None and gene_range_38 is None:
-                    print("gene without location on any assembly: %s" % res[count])
+                    #print("gene without location on any assembly: %s" % res[count])
                     genes_no_location.append(res[count]['gene_data'])
                     count+=1
                     continue
 
 
 
                 score = '0'
                 strand = '.'
                 blockCount = '1'
                 blockStarts = '0'
 
                 #-----------------------------------------------------------------------------------------------------------
 
                 gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id']
                 for attribute in gene_data_list:
@@ -266,56 +267,46 @@
                     temp_attribute_dictionary['version'] = ''
 
                 #-----------------------------------------------------------------------------------------------------------
                 
                 try:
                     x = res[count]['panel']['relevant_disorders']
                     y = ' '.join(res[count]['panel']['relevant_disorders'])
                     if not x:
                         temp_attribute_dictionary['relevant_disorders'] = ''
                     else:
                         temp_attribute_dictionary['relevant_disorders'] = y
                 except:
                     temp_attribute_dictionary['relevant_disorders'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
-                # Add comma separated to list of pub id
-
-                publications = ' '.join(res[count]['publications'])
-
-                if not publications:
-                    temp_attribute_dictionary['publications'] = ''
-                else:
-                    if re.match("^[0-9 ]+$", publications):
-                        temp_attribute_dictionary['publications'] = publications.replace(' ', ', ')
-                    else:
-                        temp_attribute_dictionary['publications'] = publications
-
-                # Remove new lines
-                temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "")
-
-                # make everything a URL, as we have not only PMIDs in here
-                # convert numbers to Pubmed URLs
-                pubs = temp_attribute_dictionary['publications'].split(", ")
-                pubUrls = []
+                # minimal effort to clean up the publication field, which is a mess of free form text
+                pubs = res[count]['publications']
+                newPubs = []
                 for pub in pubs:
+                    pub = pub.replace("\n", "")
+                    # replace commas with html commas as unfortunately I use commas in the browser to split fields
+                    pub = pub.replace(",", ",")
+                    # translate unicode chars to something the genome browser can display
+                    pub = pub.encode('ascii', 'xmlcharrefreplace').decode("ascii")
                     if re.match("^[0-9]+$", pub):
-                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub)
-                    else:
-                        pubUrls.append(pub)
+                        #pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub)
+                        pub = "PMID"+pub
+
+                    newPubs.append(pub)
 
-                temp_attribute_dictionary['publications'] = ", ".join(pubUrls)
+                temp_attribute_dictionary['publications'] = ", ".join(newPubs)
 
                 #-----------------------------------------------------------------------------------------------------------
                 # MouseOverField
                 try:
                     mof = 'Gene: ' +  temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';'
                     temp_attribute_dictionary['mouseOverField'] = mof
                 except:
                     temp_attribute_dictionary['mouseOverField'] = ''
                 
                 #-----------------------------------------------------------------------------------------------------------
                 # Column 4
                 temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')'
                 #-----------------------------------------------------------------------------------------------------------
 
                 #-----------------------------------------------------------------------------------------------------------
@@ -372,36 +363,44 @@
                                                 temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
                                                 temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
                                                 temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
                                                 temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
                 count = count + 1
                 continuous_count = continuous_count + 1
     
         else:
             Error = False        # End of all pages
 
         page_count = page_count + 1
 
     print('Genes with missing coordinates in one assembly (written to missing_genes.txt):')
     print(genes_missing_info)
 
+    print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):')
+    missSyms = []
+    for miss in genes_no_location:
+        missSyms.append(miss["gene_symbol"])
+    print(",".join(missSyms))
+
+
     missOfh = open("missing_genes.txt", "w")
     missOfh.write("* Not found in one assembly:\n")
     missOfh.write("\n".join(genes_missing_info))
     missOfh.write("* No location at all:\n")
     for miss in genes_no_location:
         missOfh.write("\t"+str(miss))
+        missOfh.write("\n")
     missOfh.close()
 
     return(hg19_dict, hg38_dict)
 
 def downloadGenes():
     jsonFh = gzip.open("currentJson/genes.json.gz", "w")
 
     hg19_dict, hg38_dict = getGenesLocations(jsonFh)
 
     jsonFh.close()
     
     pd_19_table = pd.DataFrame.from_dict(hg19_dict)
     pd_38_table = pd.DataFrame.from_dict(hg38_dict)
     pd_19_table = pd_19_table.T
     pd_38_table = pd_38_table.T