src/hg/utils/otto/panelApp/test.py c4e4f05177a63a6d7a4e42c87c54d676f2d54b73

c4e4f05177a63a6d7a4e42c87c54d676f2d54b73
max
  Fri Sep 30 08:41:25 2022 -0700
removing test file committed by mistake, no redmine

diff --git src/hg/utils/otto/panelApp/test.py src/hg/utils/otto/panelApp/test.py
deleted file mode 100644
index c9162a0..0000000
--- src/hg/utils/otto/panelApp/test.py
+++ /dev/null
@@ -1,447 +0,0 @@
-#!/usr/bin/env python3
-import os
-import requests
-import json 
-import pandas as pd 
-import sys 
-import argparse
-import re
-from datetime import date
-
-'''
-download panelApp data via its API (somewhat slow) and convert to two bigBed files into the byDay/ directory.
-Then create symlinks to them.
-'''
-
-# originally from /cluster/home/bnguy/trackhub/panel/bigBedConversion/final_version/panel_app.py
-# Written by a project student, Beagan, in 2020/2021
-
-def getGenesLocations():
-    page_count = 1
-    Error = True
-    hg19_dict = dict()
-    hg38_dict = dict()
-    repeat19 = list()
-    repeat38 = list()
-    continuous_count = 0
-    genes_missing_info = list()
-    while Error: 
-        url = "https://panelapp.genomicsengland.co.uk/api/v1/genes/?format=json&page={}".format(page_count)
-        myResponse = requests.get(url)
-
-        if (myResponse.ok):
-            jsonData = myResponse.content.decode()
-            jData = json.loads(jsonData)
-
-            if "error" in jData.keys():
-                raise Exception("{} page count is missing.".format(page_count))
-            
-            res = jData['results']
-            num_gene_variant = len(res)
-            count = 0
-            while count != num_gene_variant:
-                temp_attribute_dictionary = dict()
-                string_dict_key = 'gene_{}'.format(continuous_count)
-
-                try:
-                    ensembl_genes_GRch37_82_location = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['location']
-                except:
-                    print(count)
-                    genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg19")
-                    count = count + 1
-
-                try:
-                    ensembl_genes_GRch38_90_location = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['location']
-                except:
-                    print(count)
-                    genes_missing_info.append(res[count]['gene_data']['gene_symbol']+"/hg38")
-                    count = count + 1
-
-                location_37 = ensembl_genes_GRch37_82_location.split(':')
-                chromo_37 = 'chr'+location_37[0]
-                gene_range_37 = location_37[1].split('-')
-        
-                location_38 = ensembl_genes_GRch38_90_location.split(':')
-
-                # Change mitochondrial chromosomal suffix from MT -> M, fetchrom recognize only chrM
-                chr_num = location_38[0]
-
-                if chr_num == "MT":
-                    chr_num = "M"
-                chromo_38 = 'chr'+chr_num
-
-                gene_range_38 = location_38[1].split('-')
-
-                score = '0'
-                strand = '.'
-                blockCount = '1'
-                blockSizes = int(gene_range_37[1]) - int(gene_range_37[0])
-                blockStarts = '0'
-
-                #-----------------------------------------------------------------------------------------------------------
-
-                gene_data_list = ['gene_name', 'hgnc_symbol', 'hgnc_id']
-                for attribute in gene_data_list:
-                    try:
-                        temp_attribute_dictionary[attribute] = res[count]['gene_data'][attribute]
-                    except:
-                        temp_attribute_dictionary[attribute] = ''
-
-                try:
-                    temp_attribute_dictionary['omim_gene'] = ' '.join(res[count]['gene_data']['omim_gene'])
-                except:
-                    temp_attribute_dictionary['omim_gene'] = ''
-
-                try: 
-                    temp_attribute_dictionary['gene_symbol'] = res[count]['gene_data']['gene_symbol']
-                except:
-                    temp_attribute_dictionary['gene_symbol'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # Need to split HGNC ID
-                try:
-                    hgnc = res[count]['gene_data']['hgnc_id']
-                    temp_attribute_dictionary['hgnc_id'] = hgnc.split(':')[1]
-                except:
-                    temp_attribute_dictionary['hgnc_id'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # Capitalize(title) gene name
-
-                try:
-                    gene_name = res[count]['gene_data']['gene_name'].title()
-                    temp_attribute_dictionary['gene_name'] = gene_name
-                except:
-                    temp_attribute_dictionary['gene_name'] = ''
-                #-----------------------------------------------------------------------------------------------------------
-                # Biotype change protein_coding to Protein Coding
-
-                try:
-                    biotype = res[count]['gene_data']['biotype']
-
-                    if biotype == 'protein_coding':
-                        biotype = 'Protein Coding'
-                    
-                    temp_attribute_dictionary['biotype'] = biotype
-                    if biotype == None:
-                        temp_attribute_dictionary['biotype'] = ''
-                except:
-                    temp_attribute_dictionary['biotype'] = ''
-                    print(res[count])
-
-                #-----------------------------------------------------------------------------------------------------------    
-
-                try:
-                    ensembl_genes_GRch37_82_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch37']['82']['ensembl_id']
-                    ensembl_genes_GRch38_90_ensembl_id = res[count]['gene_data']['ensembl_genes']['GRch38']['90']['ensembl_id']
-
-                except:
-                    ensembl_genes_GRch37_82_ensembl_id = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-
-                gene_type_list = ['confidence_level', 'phenotypes', 'mode_of_inheritance', 'tags']
-
-                for attribute in gene_type_list:
-                    try:
-                        x = res[count][attribute]
-                        if not x:
-                            temp_attribute_dictionary[attribute] = ''    
-                        else:
-                            pre = ' '.join(res[count][attribute])
-                            temp_attribute_dictionary[attribute] = pre.replace('\t', ' ')
-                    except:
-                        temp_attribute_dictionary[attribute] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # Cannot exceed 255 characters
-                # Cannot have tabs
-                try:
-                    x = res[count]['phenotypes']
-                    y = ' '.join(res[count]['phenotypes'])
-
-                    if not x:
-                        temp_attribute_dictionary['phenotypes'] = ''
-                    else:
-                        temp_attribute_dictionary['phenotypes'] = y.replace("\t", " ")
-                except:
-                    temp_attribute_dictionary['phenotypes'] = ''
-                
-                #-----------------------------------------------------------------------------------------------------------
-                # Evidence cannot exceed 255 characters
-                try:
-                    x = res[count]['evidence']
-                    y = ' '.join(res[count]['evidence'])
-                    if not x:
-                        temp_attribute_dictionary['evidence'] = ''
-                    else:
-                        temp_attribute_dictionary['evidence'] = y
-                except:
-                    temp_attribute_dictionary['evidence'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                
-                tags = ' '.join(res[count]['tags']).title()
-                try:
-                    if not tags:
-                        temp_attribute_dictionary['tags'] = ''
-                    else:
-                        temp_attribute_dictionary['tags'] = tags
-                except:
-                    temp_attribute_dictionary['tags'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # Mode of Inheritance (fix format)
-                
-                MOI = ' '.join(res[count]['mode_of_inheritance']).replace("  ", "???").replace(" ", "").replace("???", " ")
-                try:
-                    if not MOI:
-                        temp_attribute_dictionary['mode_of_inheritance'] = ''
-                    else:
-                        temp_attribute_dictionary['mode_of_inheritance'] = MOI
-                except:
-                    temp_attribute_dictionary['mode_of_inheritance'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # For values with spaces 
-
-                gene_type_list = ['entity_name', 'penetrance']
-
-                for attribute in gene_type_list:
-                    try:
-                        temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "")
-                    except:
-                        temp_attribute_dictionary[attribute] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                # For values with spaces and need to be capitalized
-                attribute = 'entity_type'
-
-                try:
-                    temp_attribute_dictionary[attribute] = ' '.join(res[count][attribute]).replace(" ", "").capitalize()
-                except:
-                    temp_attribute_dictionary[attribute] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                attribute = 'mode_of_pathogenicity'
-
-                try:
-                    mode = ' '.join(res[count][attribute]).replace("  ", "???").replace(" ", "").replace("???", " ")
-                    if mode[0] == 'L' or mode[0] == 'l':
-                        temp_attribute_dictionary[attribute] = 'Loss-of-function variants'
-                    elif mode[0] == 'G' or mode[0] == 'g':
-                        temp_attribute_dictionary[attribute] = 'Gain-of-function'
-                    elif mode[0] == 'O' or mode[0] == 'o':
-                        temp_attribute_dictionary[attribute] = 'Other'
-                    else:
-                        temp_attribute_dictionary[attribute] = mode
-                except:
-                    temp_attribute_dictionary[attribute] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-
-                panel_list = ['id','name', 'disease_group', 'disease_sub_group', 'status', 'version_created']
-
-                for attribute in panel_list:
-                    try:
-                        x = res[count]['panel'][attribute]
-                        if not x:
-                            temp_attribute_dictionary[attribute] = ''
-                        else:
-                            temp_attribute_dictionary[attribute] = res[count]['panel'][attribute]
-                    except:
-                        temp_attribute_dictionary[attribute] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                
-                version_num = 0.0
-                try:
-                    version_num = float(res[count]['panel']['version'])
-                    temp_attribute_dictionary['version'] = version_num
-                except:
-                    temp_attribute_dictionary['version'] = ''
-
-                #-----------------------------------------------------------------------------------------------------------
-                
-                try:
-                    x = res[count]['panel']['relevant_disorders']
-                    y = ' '.join(res[count]['panel']['relevant_disorders'])
-                    if not x:
-                        temp_attribute_dictionary['relevant_disorders'] = ''
-                    else:
-                        temp_attribute_dictionary['relevant_disorders'] = y
-                except:
-                    temp_attribute_dictionary['relevant_disorders'] = ''
-                
-                #-----------------------------------------------------------------------------------------------------------
-                # Add comma separated to list of pub id
-
-                publications = ' '.join(res[count]['publications'])
-
-                if not publications:
-                    temp_attribute_dictionary['publications'] = ''
-                else:
-                    if re.match("^[0-9 ]+$", publications):
-                        temp_attribute_dictionary['publications'] = publications.replace(' ', ', ')
-                    else:
-                        temp_attribute_dictionary['publications'] = publications
-
-                # Remove new lines
-                temp_attribute_dictionary['publications'] = temp_attribute_dictionary['publications'].replace("\n", "")
-
-                # make everything a URL, as we have not only PMIDs in here
-                # convert numbers to Pubmed URLs
-                pubs = temp_attribute_dictionary['publications'].split(", ")
-                pubUrls = []
-                for pub in pubs:
-                    if re.match("^[0-9 ]+$", pub):
-                        pubUrls.append("https://pubmed.ncbi.nlm.nih.gov/"+pub)
-                    else:
-                        pubUrls.append(pub)
-
-                temp_attribute_dictionary['publications'] = ", ".join(pubUrls)
-
-                #-----------------------------------------------------------------------------------------------------------
-                # MouseOverField
-                try:
-                    mof = 'Gene: ' +  temp_attribute_dictionary['gene_symbol'] + ';' + ' Panel: ' + temp_attribute_dictionary['name'] + ';' + ' MOI: ' + MOI + ';' + ' Phenotypes: ' + temp_attribute_dictionary['phenotypes'] + ';' + ' Confidence: ' + temp_attribute_dictionary['confidence_level'] + ';'
-                    temp_attribute_dictionary['mouseOverField'] = mof
-                except:
-                    temp_attribute_dictionary['mouseOverField'] = ''
-                
-                #-----------------------------------------------------------------------------------------------------------
-                # Column 4
-                temp_attribute_dictionary['label'] = temp_attribute_dictionary['gene_symbol'] + ' (' + temp_attribute_dictionary['name'] + ')'
-                #-----------------------------------------------------------------------------------------------------------
-
-                #-----------------------------------------------------------------------------------------------------------
-                rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'}
-
-                # If the confidence level is set to 0, set to 1
-                if temp_attribute_dictionary['confidence_level'] == '0':
-                    temp_attribute_dictionary['confidence_level'] = '1'
-
-                rgb = rgb_dict[temp_attribute_dictionary['confidence_level']]
-                rgb = rgb.strip('"')
-
-                '''
-                Replace all tab in value with spaces and removes new lines
-                '''
-                for key, item in temp_attribute_dictionary.items():
-                    try:
-                        if isinstance(item, int):
-                            pass
-                        elif isinstance(item, float):
-                            pass
-                        else:
-                            temp_attribute_dictionary[key] = item.replace('\t', ' ').strip().strip("\n").strip("\r")
-                    except:
-                        pass
-
-                # Version Threshold = 0.99
-                max_num = float(0.99)
-                
-                if version_num > max_num: 
-                    if temp_attribute_dictionary['label'] not in repeat19:    # Removes Repeats
-                        repeat19.append(temp_attribute_dictionary['label'])
-                        hg19_dict[string_dict_key] = [chromo_37, int(gene_range_37[0]), gene_range_37[1], temp_attribute_dictionary['label'], 
-                                                score, strand, gene_range_37[0], gene_range_37[1], rgb, blockCount, blockSizes, blockStarts, 
-                                                temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
-                                                temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
-                                                temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
-                                                temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
-                                                temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
-                                                temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
-                                                temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
-                                                temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
-                    
-                    if temp_attribute_dictionary['label'] not in repeat38:    # Remove Repeats
-                        repeat38.append(temp_attribute_dictionary['label'])
-                        hg38_dict[string_dict_key] = [chromo_38, int(gene_range_38[0]), gene_range_38[1], temp_attribute_dictionary['label'], 
-                                                score, strand, gene_range_38[0], gene_range_38[1], rgb, blockCount, blockSizes, blockStarts, 
-                                                temp_attribute_dictionary['gene_symbol'], temp_attribute_dictionary['biotype'], temp_attribute_dictionary['hgnc_id'], 
-                                                temp_attribute_dictionary['gene_name'], temp_attribute_dictionary['omim_gene'], ensembl_genes_GRch38_90_ensembl_id,
-                                                temp_attribute_dictionary['entity_type'], temp_attribute_dictionary['entity_name'], temp_attribute_dictionary['confidence_level'],    
-                                                temp_attribute_dictionary['penetrance'], temp_attribute_dictionary['mode_of_pathogenicity'], temp_attribute_dictionary['publications'], 
-                                                temp_attribute_dictionary['evidence'], temp_attribute_dictionary['phenotypes'], temp_attribute_dictionary['mode_of_inheritance'], 
-                                                temp_attribute_dictionary['tags'], temp_attribute_dictionary['id'], temp_attribute_dictionary['name'],
-                                                temp_attribute_dictionary['disease_group'], temp_attribute_dictionary['disease_sub_group'], temp_attribute_dictionary['status'], 
-                                                temp_attribute_dictionary['version'], temp_attribute_dictionary['version_created'], temp_attribute_dictionary['relevant_disorders'], temp_attribute_dictionary['mouseOverField']]
-                count = count + 1
-                continuous_count = continuous_count + 1
-    
-        else:
-            Error = False        # End of all pages
-
-        page_count = page_count + 1
-        print(page_count)
-    print('Genes with missing coordinates (written to missing_genes.txt):')
-    print(genes_missing_info)
-    open("missing_genes.txt", "w").write("\n".join(genes_missing_info))
-    return(hg19_dict, hg38_dict)
-
-def main():
-    hg19_dict, hg38_dict = getGenesLocations()
-    
-    pd_19_table = pd.DataFrame.from_dict(hg19_dict)
-    pd_38_table = pd.DataFrame.from_dict(hg38_dict)
-    pd_19_table = pd_19_table.T
-    pd_38_table = pd_38_table.T
-    pd_19_table.columns = ["chrom", "chromStart", 
-        "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",
-        "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID",
-        "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level",
-        "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", 
-        "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", 
-        "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"]
-    pd_38_table.columns = ["chrom", "chromStart", 
-        "chromEnd", "name", "score", "strand", "thickStart", "thickEnd", "itemRgb",
-        "blockCount", "blockSizes", "blockStarts", "Gene Symbol", "Biotype", "HGNC ID",
-        "Gene Name", "OMIM Gene", "Ensembl Genes", "Entity Type", "Entity Name", "Confidence Level",
-        "Penetranace", "Mode of Pathogenicity", "Publications", "Evidence", "Phenotypes", 
-        "Mode of Inheritance", "Tags", "Panel ID", "Panel Name", "Disease Group", "Disease Subgroup", 
-        "Status", "Panel Version", "Version Created", "Relevant Disorders", "MouseOverField"]
-    
-    #pd_19_table.to_csv('hg19_header.tsv', sep='\t', index=False)
-    #pd_38_table.to_csv('hg38_header.tsv', sep='\t', index=False)
-
-    #pd_19_table.to_csv('hg19_noheadertem.tsv', sep='\t', index=False, header=None) 
-    #pd_38_table.to_csv('hg38_noheader.tsv', sep='\t', index=False, header=None) 
-
-    outDir = date.today().strftime("byDay/%Y-%m-%d")
-    os.makedirs(outDir)
-
-    outFnames = {}
-    outFnames["19"] = outDir+'/hg19_sorted_noheader.tsv'
-    outFnames["38"] = outDir+'/hg38_sorted_noheader.tsv'
-
-    outBbs = {}
-    outBbs["19"] = outDir+"/panelapp_hg19.bb"
-    outBbs["38"] = outDir+"/panelapp_hg38.bb"
-
-    ''' Sort '''
-    pd_19_table = pd_19_table.sort_values(by=['chrom','chromStart'], ascending = (True, True))
-    pd_19_table.to_csv(outFnames["19"], sep='\t', index=False, header=None) 
-    
-    pd_38_table = pd_38_table.sort_values(by=['chrom','chromStart'], ascending = (True, True))
-    pd_38_table.to_csv(outFnames["38"], sep='\t', index=False, header=None) 
-
-    for db in ["19", "38"]:
-        cmd = "bedToBigBed -tab -as=panelapp.as -type=bed9+26 -extraIndex=geneName %s /hive/data/genomes/hg%s/chrom.sizes %s" % (outFnames[db], db, outBbs[db])
-        assert(os.system(cmd)==0)
-
-    # make sure that we never end up with a only one updated bb file
-    #for db in ["19", "38"]:
-        #os.rename("%s.tmp" % outBbs[db], outBbs[db])
-
-    # update the symlinks
-    for db in ["19", "38"]:
-        cmd = "ln -s %s /gbdb/hg%s/panelApp/genesPanel.bb" % (outBbs[db], db)
-        assert(os.system(cmd)==0)
-
-    print("PanelApp otto update: OP")
-
-if __name__ == "__main__":
-    main()
-
-