1b1fe488b3bcf3f500a719d757b2b0a98e911f64 lrnassar Tue Oct 8 14:42:36 2024 -0700 Releasing ClinGen CSpec track and adding it to otto, refs #33794 diff --git src/hg/utils/otto/clinGen/makeClinGenCspec.sh src/hg/utils/otto/clinGen/makeClinGenCspec.sh new file mode 100755 index 0000000..9b75e44 --- /dev/null +++ src/hg/utils/otto/clinGen/makeClinGenCspec.sh @@ -0,0 +1,212 @@ +#! /bin/bash + +cd /hive/data/outside/otto/clinGen/clinGenCspec + +wget -q -O svis.json https://cspec.genome.network/cspec/api/svis +wget -q http://purl.obolibrary.org/obo/mondo.json +wget -q -O geneToDisease.csv https://search.clinicalgenome.org/kb/gene-validity/download +bigBedToBed /gbdb/hg38/hgnc/hgnc.bb hgnc.bed + +oldCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount") +oldCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount") + +python3 - << END | sort -k1,1 -k2,2n > cspec.bed +import json +import re +import sys + +# create a dict that matches MONDO ID with disease +mondoDict = dict() +jsonData = json.load(open("mondo.json")) +mondo = jsonData["graphs"] +nodes = mondo[0]['nodes'] +for item in nodes: + if item['id'].startswith('http'): + mondoID = item['id'].split('/')[-1] + if mondoID.startswith('MONDO_'): + lbl = 'not specified' + if 'lbl' in item: + lbl = item['lbl'] + mondoDict[mondoID] = lbl + +jsonData = json.load(open("svis.json")) +data = jsonData["data"] + +# some genes occur more than once. In those cases only the @id and ruleset seems to differ +# e.g. for ACTA1: +# https://cspec.genome.network/cspec/api/SequenceVariantInterpretation/id/GN147 +# https://cspec.genome.network/cspec/api/SequenceVariantInterpretation/id/GN169 +# since the relevant stuff is the same we can just keep that. + +mane = dict() +with open('hgnc.bed', 'r') as bed: + for line in bed: + fields = line.split('\t') + fields[3] = fields[9] # replace name + mane[fields[9]] = ('\t').join(fields[:8]) # remove color + +colors_dict = { + "Classification Rules In Prep": "128,0,128", # Dark Purple + "Classification Rules Submitted": "0,0,139", # Dark blue + "Pilot Rules In Prep": "0,100,0", # Dark Green + "Pilot Rules Submitted": "139,0,0", # Dark Red + "Released": "0,0,0" # Black +} + +# disease can be looked up by MONDO id +# the csv is poorly formatted so don't use csv module +mondo = dict() +with open('geneToDisease.csv', 'r') as inf: + for line in inf: + fields = line.split(',') + mondoID = fields[3].strip('"') + disease = fields[2].strip('"') + mondo[mondoID] = disease + +seen = [] + +for panel in data: + for rset in panel["ruleSets"]: + if 'genes' in rset: + for gene in rset['genes']: + name = gene['label'] + if name in seen: + continue + seen.append(name) + if not name in mane: + print('WARNING, cannot find', name, file=sys.stderr) + disease = 'no MONDO ID specified' + if "diseases" in gene: + try: + mondoID = gene["diseases"][0]["label"].replace(':', '_') + disease = f'{mondoID}, {mondoDict[mondoID]}' + except: + mondoID = "No MONDO ID" + disease = f'{mondoID}' + url = panel['url'] + diseaseURL = f'<a target="_blank" href="{url}">{disease}</a>' + affiliationURL = panel["affiliation"]["url"] + affURL = f'<a target="_blank" href="{affiliationURL}">{panel["affiliation"]["label"]}</a>' + status = panel['status'] + color = '0' + print(f'{mane[name]}\t{color}\t{diseaseURL}\t{affURL}\t{status}') +END + +cat << '_EOF_' > clinGenCspec.as +table clinGenCspec +"Cspecs for Clingen genes" + ( + string chrom; "Reference sequence chromosome or scaffold" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Short Name of item" + uint score; "Score from 0-1000" + char[1] strand; "+ or -" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint reserved; "Used as itemRgb as of 2004-11-22" + lstring disease; "Disease" + lstring panel; "CSPEC panel" + lstring status; "Status" + ) +_EOF_ + +bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg38/chrom.sizes clinGenCspecHg38.bb +rm cspec.bed + +bigBedToBed /gbdb/hg19/hgnc/hgnc.bb hgnc.bed + +python3 - << END | sort -k1,1 -k2,2n > cspec.bed +import json +import re +import sys + +# create a dict that matches MONDO ID with disease +mondoDict = dict() +jsonData = json.load(open("mondo.json")) +mondo = jsonData["graphs"] +nodes = mondo[0]['nodes'] +for item in nodes: + if item['id'].startswith('http'): + mondoID = item['id'].split('/')[-1] + if mondoID.startswith('MONDO_'): + lbl = 'not specified' + if 'lbl' in item: + lbl = item['lbl'] + mondoDict[mondoID] = lbl + +jsonData = json.load(open("svis.json")) +data = jsonData["data"] + +# some genes occur more than once. In those cases only the @id and ruleset seems to differ +# e.g. for ACTA1: +# https://cspec.genome.network/cspec/api/SequenceVariantInterpretation/id/GN147 +# https://cspec.genome.network/cspec/api/SequenceVariantInterpretation/id/GN169 +# since the relevant stuff is the same we can just keep that. + +mane = dict() +with open('hgnc.bed', 'r') as bed: + for line in bed: + fields = line.split('\t') + fields[3] = fields[9] # replace name + mane[fields[9]] = ('\t').join(fields[:8]) # remove color + +colors_dict = { + "Classification Rules In Prep": "128,0,128", # Dark Purple + "Classification Rules Submitted": "0,0,139", # Dark blue + "Pilot Rules In Prep": "0,100,0", # Dark Green + "Pilot Rules Submitted": "139,0,0", # Dark Red + "Released": "0,0,0" # Black +} + +# disease can be looked up by MONDO id +# the csv is poorly formatted so don't use csv module +mondo = dict() +with open('geneToDisease.csv', 'r') as inf: + for line in inf: + fields = line.split(',') + mondoID = fields[3].strip('"') + disease = fields[2].strip('"') + mondo[mondoID] = disease + +seen = [] + +for panel in data: + for rset in panel["ruleSets"]: + if 'genes' in rset: + for gene in rset['genes']: + name = gene['label'] + if name in seen: + continue + seen.append(name) + if not name in mane: + print('WARNING, cannot find', name, file=sys.stderr) + disease = 'no MONDO ID specified' + if "diseases" in gene: + try: + mondoID = gene["diseases"][0]["label"].replace(':', '_') + disease = f'{mondoID}, {mondoDict[mondoID]}' + except: + mondoID = "No MONDO ID" + disease = f'{mondoID}' + url = panel['url'] + diseaseURL = f'<a target="_blank" href="{url}">{disease}</a>' + affiliationURL = panel["affiliation"]["url"] + affURL = f'<a target="_blank" href="{affiliationURL}">{panel["affiliation"]["label"]}</a>' + status = panel['status'] + color = '0' + print(f'{mane[name]}\t{color}\t{diseaseURL}\t{affURL}\t{status}') +END + +bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg19/chrom.sizes clinGenCspecHg19.bb + +rm hgnc.bed cspec.bed mondo.json geneToDisease.csv svis.json + +newCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount") +newCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount") + +echo +echo Item counts for hg38 old vs. new bigBed. Old: $oldCountHg38 New: $newCountHg38 +echo Item counts for hg19 old vs. new bigBed. Old: $oldCountHg19 New: $newCountHg19 +echo +echo ClinGen VCEP specifications track built successfully.