da377fdc8a33c352274795c5817d8d117e03ae10 max Tue Oct 4 03:30:08 2022 -0700 more panelApp fixes, partially found by QA, refs #25568 diff --git src/hg/utils/otto/panelApp/genes.py src/hg/utils/otto/panelApp/genes.py index b1be4ec..2e2d2db 100755 --- src/hg/utils/otto/panelApp/genes.py +++ src/hg/utils/otto/panelApp/genes.py @@ -52,38 +52,39 @@ try: ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location'] location_37 = ensembl_genes_GRch37_82_location.split(':') chromo_37 = 'chr'+location_37[0] gene_range_37 = location_37[1].split('-') # on hg19, we have added a chrMT sequence later. except: genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19") try: ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location'] location_38 = ensembl_genes_GRch38_90_location.split(':') chromo_38 = 'chr'+location_38[0] # Change mitochondrial chromosomal suffix from MT -> M for hg38 only - if chromo_38 == "MT": + if chromo_38 == "chrMT": chromo_38 = "chrM" + gene_range_38 = location_38[1].split('-') except: genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38") if gene_range_37 is None and gene_range_38 is None: - print("gene without location on any assembly: %s" % res[count]) + #print("gene without location on any assembly: %s" % res[count]) genes_no_location.append(res[count]['gene_data']) count+=1 continue score = '0' strand = '.' 
blockCount = '1' blockStarts = '0' #----------------------------------------------------------------------------------------------------------- gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id'] for attribute in gene_data_list: @@ -266,56 +267,46 @@ temp_attribute_dictionary['version'] = '' #----------------------------------------------------------------------------------------------------------- try: x = res[count]['panel']['relevant_disorders'] y = ' '.join(res[count]['panel']['relevant_disorders']) if not x: temp_attribute_dictionary['relevant_disorders'] = '' else: temp_attribute_dictionary['relevant_disorders'] = y except: temp_attribute_dictionary['relevant_disorders'] = '' #----------------------------------------------------------------------------------------------------------- - # Add comma separated to list of pub id - - publications = ' '.join(res[count]['publications']) - - if not publications: - temp_attribute_dictionary['publications'] = '' - else: - if re.match("^[0-9 ]+$", publications): - temp_attribute_dictionary['publications'] = publications.replace(' ', ', ') - else: - temp_attribute_dictionary['publications'] = publications - - # Remove new lines - temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "") - - # make everything a URL, as we have not only PMIDs in here - # convert numbers to Pubmed URLs - pubs = temp_attribute_dictionary['publications'].split(", ") - pubUrls = [] + # minimal effort to clean up the publication field, which is a mess of free form text + pubs = res[count]['publications'] + newPubs = [] for pub in pubs: + pub = pub.replace("\n", "") + # replace commas with html commas as unfortunately I use commas in the browser to split fields + pub = pub.replace(",", "&#44;") + # translate unicode chars to something the genome browser can display + pub = pub.encode('ascii', 'xmlcharrefreplace').decode("ascii") if re.match("^[0-9]+$", pub): - 
pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub) - else: - pubUrls.append(pub) + #pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub+"|PMID"+pub) + pub = "PMID"+pub + + newPubs.append(pub) - temp_attribute_dictionary['publications'] = ", ".join(pubUrls) + temp_attribute_dictionary['publications'] = ", ".join(newPubs) #----------------------------------------------------------------------------------------------------------- # MouseOverField try: mof = 'Gene: ' + temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';' temp_attribute_dictionary['mouseOverField'] = mof except: temp_attribute_dictionary['mouseOverField'] = '' #----------------------------------------------------------------------------------------------------------- # Column 4 temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')' #----------------------------------------------------------------------------------------------------------- #----------------------------------------------------------------------------------------------------------- @@ -372,36 +363,44 @@ temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'], temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']] count = count + 1 continuous_count = continuous_count + 1 else: Error = False # End of all pages page_count = 
page_count + 1 print('Genes with missing coordinates in one assembly (written to missing_genes.txt):') print(genes_missing_info) + print('Genes with missing coordinates in both assemblies (written to missing_genes.txt):') + missSyms = [] + for miss in genes_no_location: + missSyms.append(miss["gene_symbol"]) + print(",".join(missSyms)) + + missOfh = open("missing_genes.txt", "w") missOfh.write("* Not found in one assembly:\n") missOfh.write("\n".join(genes_missing_info)) missOfh.write("* No location at all:\n") for miss in genes_no_location: missOfh.write("\t"+str(miss)) + missOfh.write("\n") missOfh.close() return(hg19_dict, hg38_dict) def downloadGenes(): jsonFh = gzip.open("currentJson/genes.json.gz", "w") hg19_dict, hg38_dict = getGenesLocations(jsonFh) jsonFh.close() pd_19_table = pd.DataFrame.from_dict(hg19_dict) pd_38_table = pd.DataFrame.from_dict(hg38_dict) pd_19_table = pd_19_table.T pd_38_table = pd_38_table.T