src/hg/utils/otto/clinGen/makeClinGenCspec.sh e0d7fb0ced1ee02fda0654b93eb1dd89ab50a9fb

e0d7fb0ced1ee02fda0654b93eb1dd89ab50a9fb
lrnassar
  Wed Oct 16 12:38:43 2024 -0700
Adding a new-file check so as not to blindly rebuild the file every time (it also produces no output, to reduce emails), as well as a check that the item count differs by no more than 20% before the file is replaced.

diff --git src/hg/utils/otto/clinGen/makeClinGenCspec.sh src/hg/utils/otto/clinGen/makeClinGenCspec.sh
index 0a06037..59b0d69 100755
--- src/hg/utils/otto/clinGen/makeClinGenCspec.sh
+++ src/hg/utils/otto/clinGen/makeClinGenCspec.sh
@@ -1,30 +1,38 @@
 #! /bin/bash
 
 export LANG=en_US.UTF-8
 export LC_ALL=en_US.UTF-8
 
 cd /hive/data/outside/otto/clinGen/clinGenCspec
 
 wget -q -O svis.json https://cspec.genome.network/cspec/api/svis
+
+# Check if the files are the same
+if cmp -s svis.json svis.json.old; then
+  # Files are the same, exit silently
+  rm svis.json
+  exit 0
+else
+  # Files are different, so go on and rebuild the track
+  echo "Updating ClinGen VCEP track..."
+fi
+
 wget -q http://purl.obolibrary.org/obo/mondo.json
 wget -q -O geneToDisease.csv https://search.clinicalgenome.org/kb/gene-validity/download
 bigBedToBed /gbdb/hg38/hgnc/hgnc.bb hgnc.bed
 
-oldCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount")
-oldCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount")
-
 python3 - << END | sort -k1,1 -k2,2n > cspec.bed
 import json
 import re
 import sys
 
 # create a dict that matches MONDO ID with disease
 mondoDict = dict()
 jsonData = json.load(open("mondo.json"))
 mondo = jsonData["graphs"]
 nodes = mondo[0]['nodes']
 for item in nodes:
     if item['id'].startswith('http'):
         mondoID = item['id'].split('/')[-1]
         if mondoID.startswith('MONDO_'):
             lbl = 'not specified'
@@ -102,31 +110,31 @@
     string chrom;      "Reference sequence chromosome or scaffold"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     string name;       "Short Name of item"
     uint   score;      "Score from 0-1000"
     char[1] strand;    "+ or -"
     uint thickStart;   "Start of where display should be thick (start codon)"
     uint thickEnd;     "End of where display should be thick (stop codon)"
     uint reserved;     "Used as itemRgb as of 2004-11-22"
     lstring disease;   "Disease"
     lstring panel;     "CSPEC panel"
     lstring status;    "Status"
     )
 _EOF_
 
-bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg38/chrom.sizes clinGenCspecHg38.bb
+bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg38/chrom.sizes clinGenCspecHg38.new.bb
 rm cspec.bed
 
 bigBedToBed /gbdb/hg19/hgnc/hgnc.bb hgnc.bed
 
 python3 - << END | sort -k1,1 -k2,2n > cspec.bed
 import json
 import re
 import sys
 
 # create a dict that matches MONDO ID with disease
 mondoDict = dict()
 jsonData = json.load(open("mondo.json"))
 mondo = jsonData["graphs"]
 nodes = mondo[0]['nodes']
 for item in nodes:
@@ -189,27 +197,52 @@
                     try:
                         mondoID = gene["diseases"][0]["label"].replace(':', '_')
                         disease = f'{mondoID}, {mondoDict[mondoID]}'
                     except:
                         mondoID = "No MONDO ID"
                         disease = f'{mondoID}'
                 url = panel['url']
                 diseaseURL = f'<a target="_blank" href="{url}">{disease}</a>'
                 affiliationURL = panel["affiliation"]["url"]
                 affURL = f'<a target="_blank" href="{affiliationURL}">{panel["affiliation"]["label"]}</a>'
                 status = panel['status']
                 color = '0'
                 print(f'{mane[name]}\t{color}\t{diseaseURL}\t{affURL}\t{status}')
 END
 
-bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg19/chrom.sizes clinGenCspecHg19.bb
+bedToBigBed -type=bed9+2 -tab -as=clinGenCspec.as cspec.bed /hive/data/genomes/hg19/chrom.sizes clinGenCspecHg19.new.bb
+
+rm hgnc.bed cspec.bed mondo.json geneToDisease.csv
+
+oldCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount" | awk '{print $NF}')
+oldCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount" | awk '{print $NF}')
 
-rm hgnc.bed cspec.bed mondo.json geneToDisease.csv svis.json 
+newCountHg38=$(bigBedInfo clinGenCspecHg38.new.bb | grep -i "itemCount" | awk '{print $NF}')
+newCountHg19=$(bigBedInfo clinGenCspecHg19.new.bb | grep -i "itemCount" | awk '{print $NF}')
+
+# Print the percent change from old item count $1 to new item count $2.
+# Commas are removed first because bigBedInfo may print counts with thousands
+# separators, which bc cannot parse. A new count that is not a plain integer is
+# treated as 0. A missing or zero old count prints 0 (there is nothing to compare
+# against, and bc must not divide by zero). Multiplying by 100 before dividing
+# keeps two real decimals under scale=2.
+pctDiff() {
+    local old=${1//,/} new=${2//,/}
+    [[ $new =~ ^[0-9]+$ ]] || new=0
+    [[ $old =~ ^[1-9][0-9]*$ ]] || { echo 0; return; }
+    echo "scale=2; ($new - $old) * 100 / $old" | bc
+}
+diffHg38=$(pctDiff "$oldCountHg38" "$newCountHg38")
+diffHg19=$(pctDiff "$oldCountHg19" "$newCountHg19")
+
+# Abort before the new files are installed if either count moved by more than 20%
+# (${var#-} drops a leading minus sign, giving the absolute value)
+if (( $(echo "${diffHg38#-} > 20" | bc -l) || $(echo "${diffHg19#-} > 20" | bc -l) )); then
+    echo
+    echo "Error: Difference in item count exceeds 20%."
+    echo "Difference in hg38: $diffHg38%"
+    echo "Difference in hg19: $diffHg19%"
+    exit 1
+fi
 
-newCountHg38=$(bigBedInfo clinGenCspecHg38.bb | grep -i "itemCount")
-newCountHg19=$(bigBedInfo clinGenCspecHg19.bb | grep -i "itemCount")
+# If the difference is within the 20%, proceed
+mv clinGenCspecHg38.new.bb clinGenCspecHg38.bb
+mv clinGenCspecHg19.new.bb clinGenCspecHg19.bb
+mv svis.json svis.json.old
 
 echo
-echo Item counts for hg38 old vs. new bigBed. Old: $oldCountHg38 New: $newCountHg38
-echo Item counts for hg19 old vs. new bigBed. Old: $oldCountHg19 New: $newCountHg19
+echo "Item counts for hg38 old vs. new bigBed. Old: $oldCountHg38 New: $newCountHg38"
+echo "Item counts for hg19 old vs. new bigBed. Old: $oldCountHg19 New: $newCountHg19"
 echo
-echo ClinGen VCEP specifications track built successfully.
+echo "ClinGen VCEP specifications track built successfully."