e0d7fb0ced1ee02fda0654b93eb1dd89ab50a9fb lrnassar Wed Oct 16 12:38:43 2024 -0700 Adding a newfile check so as not to blindly rebuild the file every time (also has no output to reduce emails) as well as adding a 20% file difference check required to replace the file. diff --git src/hg/utils/otto/clinGen/makeClinGenCspec.sh src/hg/utils/otto/clinGen/makeClinGenCspec.sh index 0a06037..59b0d69 100755 --- src/hg/utils/otto/clinGen/makeClinGenCspec.sh +++ src/hg/utils/otto/clinGen/makeClinGenCspec.sh @@ -1,30 +1,38 @@ #! /bin/bash export LANG=en_US.UTF-8 export LC_ALL=en_US.UTF-8 cd /hive/data/outside/otto/clinGen/clinGenCspec wget -q -O svis.json https://cspec.genome.network/cspec/api/svis + +# Check if the files are the same +if cmp -s svis.json svis.json.old; then + # Files are the same, exit silently + rm svis.json + exit 0 +else + # Files are different, continue with the script or add actions + echo "Updating ClinGen VCEP track..." +fi + wget -q http://purl.obolibrary.org/obo/mondo.json wget -q -O geneToDisease.csv https://search.clinicalgenome.org/kb/gene-validity/download bigBedToBed /gbdb/hg38/hgnc/hgnc.bb hgnc.bed -oldCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount") -oldCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount") - python3 - << END | sort -k1,1 -k2,2n > cspec.bed import json import re import sys # create a dict that matches MONDO ID with disease mondoDict = dict() jsonData = json.load(open("mondo.json")) mondo = jsonData["graphs"] nodes = mondo[0]['nodes'] for item in nodes: if item['id'].startswith('http'): mondoID = item['id'].split('/')[-1] if mondoID.startswith('MONDO_'): lbl = 'not specified' @@ -102,31 +110,31 @@ string chrom; "Reference sequence chromosome or scaffold" uint chromStart; "Start position in chromosome" uint chromEnd; "End position in chromosome" string name; "Short Name of item" uint score; "Score from 0-1000" char[1] strand; "+ or -" uint thickStart; "Start of where display should be thick (start 
codon)" uint thickEnd; "End of where display should be thick (stop codon)" uint reserved; "Used as itemRgb as of 2004-11-22" lstring disease; "Disease" lstring panel; "CSPEC panel" lstring status; "Status" ) _EOF_ -bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg38/chrom.sizes clinGenCspecHg38.bb +bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg38/chrom.sizes clinGenCspecHg38.new.bb rm cspec.bed bigBedToBed /gbdb/hg19/hgnc/hgnc.bb hgnc.bed python3 - << END | sort -k1,1 -k2,2n > cspec.bed import json import re import sys # create a dict that matches MONDO ID with disease mondoDict = dict() jsonData = json.load(open("mondo.json")) mondo = jsonData["graphs"] nodes = mondo[0]['nodes'] for item in nodes: @@ -189,27 +197,52 @@ try: mondoID = gene["diseases"][0]["label"].replace(':', '_') disease = f'{mondoID}, {mondoDict[mondoID]}' except: mondoID = "No MONDO ID" disease = f'{mondoID}' url = panel['url'] diseaseURL = f'<a target="_blank" href="{url}">{disease}</a>' affiliationURL = panel["affiliation"]["url"] affURL = f'<a target="_blank" href="{affiliationURL}">{panel["affiliation"]["label"]}</a>' status = panel['status'] color = '0' print(f'{mane[name]}\t{color}\t{diseaseURL}\t{affURL}\t{status}') END -bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg19/chrom.sizes clinGenCspecHg19.bb +bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg19/chrom.sizes clinGenCspecHg19.new.bb + +rm hgnc.bed cspec.bed mondo.json geneToDisease.csv + +oldCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount" | awk '{print $NF}') +oldCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount" | awk '{print $NF}') -rm hgnc.bed cspec.bed mondo.json geneToDisease.csv svis.json +newCountHg38=$(bigBedInfo clinGenCspecHg38.new.bb | grep -i "itemCount" | awk '{print $NF}') +newCountHg19=$(bigBedInfo clinGenCspecHg19.new.bb | grep -i "itemCount" | awk '{print 
$NF}') + +# Calculate the percentage difference +diffHg38=$(echo "scale=2; (($newCountHg38 - $oldCountHg38) / $oldCountHg38) * 100" | bc) +diffHg19=$(echo "scale=2; (($newCountHg19 - $oldCountHg19) / $oldCountHg19) * 100" | bc) + +# Get the absolute values of the differences +absDiffHg38=$(echo "$diffHg38" | sed 's/-//') +absDiffHg19=$(echo "$diffHg19" | sed 's/-//') + +# Check if the absolute difference is greater than 20% +if (( $(echo "$absDiffHg38 > 20" | bc -l) || $(echo "$absDiffHg19 > 20" | bc -l) )); then + echo + echo "Error: Difference in item count exceeds 20%." + echo "Difference in hg38: $diffHg38%" + echo "Difference in hg19: $diffHg19%" + exit 1 +fi -newCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount") -newCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount") +# If the difference is within the 20%, proceed +mv clinGenCspecHg38.new.bb clinGenCspecHg38.bb +mv clinGenCspecHg19.new.bb clinGenCspecHg19.bb +mv svis.json svis.json.old echo -echo Item counts for hg38 old vs. new bigBed. Old: $oldCountHg38 New: $newCountHg38 -echo Item counts for hg19 old vs. new bigBed. Old: $oldCountHg19 New: $newCountHg19 +echo "Item counts for hg38 old vs. new bigBed. Old: $oldCountHg38 New: $newCountHg38" +echo "Item counts for hg19 old vs. new bigBed. Old: $oldCountHg19 New: $newCountHg19" echo -echo ClinGen VCEP specifications track built successfully. +echo "ClinGen VCEP specifications track built successfully."