src/hg/utils/otto/panelApp/cnv.py f94190412d16ea558ace1ec9ea175db39aaa104b

f94190412d16ea558ace1ec9ea175db39aaa104b
max
  Fri Sep 30 08:40:43 2022 -0700
panelApp otto job, refs #25568

diff --git src/hg/utils/otto/panelApp/cnv.py src/hg/utils/otto/panelApp/cnv.py
new file mode 100644
index 0000000..75a3304
--- /dev/null
+++ src/hg/utils/otto/panelApp/cnv.py
@@ -0,0 +1,200 @@
+import requests
+import json 
+import pandas as pd 
+import sys 
+import argparse
+import re
+
+def get_url(url):
+    page_count = 1
+    jData = dict()
+    while True:
+        try:
+            new_url = "{}{}".format(url, page_count)
+            myResponse = requests.get(new_url)
+            if (myResponse.ok):
+                jData = json.loads(myResponse.content.decode())
+
+                # If jData is empty create, else append
+                if "error" in jData.keys():
+                    raise Exception("{} page count is missing.".format(page_count))
+            else:
+                print('Json data retrieved')
+                break
+        except:
+            if page_count > 1:
+                print('Json data retrieved')
+            else:
+                print("Unable to request URL")
+                sys.exit()
+            break
+        print(page_count)
+        page_count += 1
+    return jData
+
+def downloadCnvs():
+    Error = True
+    continuous_count=0
+    url = "https://panelapp.genomicsengland.co.uk/api/v1/regions/?page="
+    jData = get_url(url)
+
+    res = jData['results']
+    num_gene_data = len(res)
+    count = 0
+    continuous_count = 0
+    hg19_dict = dict()
+    hg38_dict = dict()
+
+    while count != num_gene_data:
+        temp_attribute_dictionary = dict()
+        string_dict_key = 'gene_{}'.format(continuous_count)
+        
+        chromo = res[count]['chromosome']
+        chromosome = 'chr' + chromo
+
+        start_coordinates = res[count]['grch38_coordinates'][0]
+        end_coordinates = res[count]['grch38_coordinates'][1]
+
+        score = '0'
+        strand = '.'
+        thickStart = start_coordinates
+        thickEnd = end_coordinates
+        blockCount = '1'
+        blockSizes = int(end_coordinates) - int(start_coordinates)
+        blockStarts = 0
+        
+        confidence_level = res[count]['confidence_level']
+
+        rgb_dict = {'3': '0,255,0', '2': '255,191,0', '1':'255,0,0'}
+        itemRgb = rgb_dict[confidence_level]
+        
+        entity_name = res[count]['entity_name']
+        entity_type = res[count]['entity_type']
+        evidence = ' '.join(res[count]['evidence'])
+
+        haploinsufficiency_score = res[count]['haploinsufficiency_score']
+        if not haploinsufficiency_score:
+            haploinsufficiency_score = ''
+
+        moi = res[count]['mode_of_inheritance']
+        if not moi:
+            moi = ''
+
+        disease_group = res[count]['panel']['disease_group']
+        if not disease_group:
+            disease_group = ''
+
+        disease_sub_group = res[count]['panel']['disease_sub_group']
+        if not disease_sub_group:
+            disease_sub_group = ''
+
+        # idd = Panel ID
+        idd = res[count]['panel']['id']
+        if not idd:
+            idd = ''
+
+        panel_name = res[count]['panel']['name']
+        if not panel_name:
+            panel_name = ''
+        
+        relevant_disorders = ' '.join(res[count]['panel']['relevant_disorders'])
+        if not relevant_disorders:
+            relevant_disorders = ''
+
+        status = res[count]['panel']['status']
+        if not status:
+            status = ''
+        
+        '''
+        types = res[count]['panel']['types']
+        types = str(types).replace("{","").replace("}", "").replace("'", "")
+        if not types:
+            types = ''
+        types = types[1:-1]
+        '''
+
+        types = res[count]['panel']['types'][0]['name']
+
+        version = res[count]['panel']['version']
+        if float(version) < 0.99:
+            continue
+        if not version:
+            version = ''
+
+        penetrance = res[count]['penetrance']
+        if not penetrance:
+            penetrance = ''
+
+        phenotypes = ' '.join(res[count]['phenotypes'])
+        if not phenotypes:
+            phenotypes = ''
+
+        publications = ' '.join(res[count]['publications'])
+        if not publications:
+            publications = ''
+        
+        #required_overlap_percentage = res[count]['required_overlap_percentage']
+        tags = res[count]['tags']
+        if not tags:
+            tags = ''
+    
+        triplosensitivity_score = res[count]['triplosensitivity_score']
+        if not triplosensitivity_score:
+            triplosensitivity_score = ''
+    
+        type_of_variants = res[count]['type_of_variants']
+        if not type_of_variants:
+            type_of_variants = ''
+
+        verbose_name = res[count]['verbose_name']            
+        if not verbose_name:
+            verbose_name = ''    
+
+        # Mouse Over Field
+        mouseOverField = ""
+        try:
+            mof = 'Gene: ' +  entity_name + ';' + ' Panel: ' + name + ';' + ' MOI: ' + moi + ';' + ' Phenotypes: ' + phenotypes + ';' + ' Confidence: ' + confidence_level + ';'    
+            mouseOverField = mof
+        except:
+            mouseOverField = ''        
+
+        # name
+        name = '{} ({})'.format(entity_name, panel_name)
+        
+        hg38_dict[string_dict_key] = [chromosome, start_coordinates, end_coordinates, name, score, strand, 
+                            thickStart, thickEnd, itemRgb, blockCount, blockSizes, blockStarts, confidence_level, 
+                            panel_name, idd, entity_name, entity_type, evidence, haploinsufficiency_score, moi, disease_group, 
+                            disease_sub_group, relevant_disorders, status, types, version, penetrance, phenotypes, 
+                            publications, triplosensitivity_score, type_of_variants, verbose_name, mouseOverField]
+        
+        #-------------------------------------------------------------------------------
+    
+        continuous_count = continuous_count + 1
+        count = count + 1
+
+    # Removes new lines
+    for key, item in hg38_dict.items():
+        strip_list = list()
+        for i in item:
+            try:
+                strip_list.append(i.replace('\t', ' ').strip().strip("\n").strip("\r"))
+            except:
+                strip_list.append(i)
+        hg38_dict[key] = strip_list
+
+    pd_38_table = pd.DataFrame.from_dict(hg38_dict)
+    pd_38_table = pd_38_table.T
+    pd_38_table.columns = ['chrom', 'chromStart', 'End', 'name', 'Score', 'strand', 'thickStart', 'thickEnd', 
+                            'itemRgb', 'blockCount', 'blockSizes', 'blockStarts', 'Confidence Level', 
+                            'Panel Name', 'Panel ID', 'Entity Name', 'Entity Type', 'Evidence', 'ClinGen Haploinsufficiency Score', 
+                            'Mode of Inheritance', 'Disease Group', 'Disease Sub Group', 'Relevant Disorders', 
+                            'Status', 'Types', 'Version Created', 'Penetrance', 'Phenotypes', 'Publications', 
+                            'CinGen Triplosensitivity Score', 'Type of Variants', 'Verbose Name', 'Mouse Over Field']
+    #pd_38_table = pd_38_table.sort_values(by=['chromosome', 'Start'], ascending = (True, True))
+    #pd_38_table.to_csv('hg38_region_noheader_sorted.tsv', sep='\t', index=False, header=None) 
+    #pd_38_table.to_csv('hg38_region_header_sorted.tsv', sep='\t', index=False) 
+    return pd_38_table
+
+
+#if __name__ == "__main__":
+    #main()