d3375c5d2c3d482c5739743c5548553b298ebf62
lrnassar
  Wed Feb 25 05:40:03 2026 -0800
Adding 10% diff check to GenCC which didn't have it, and a bad file from GenCC broke the track. No RM.

diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py
index 75ca7859b73..645eca1de1f 100644
--- src/hg/utils/otto/genCC/doGenCC.py
+++ src/hg/utils/otto/genCC/doGenCC.py
@@ -1,21 +1,22 @@
 #!/usr/bin/env python3
 #Made otto by Lou 9/14/2023
 
 import subprocess
 import csv
 import re
+import sys
 from datetime import datetime
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def getLatestGencodeTable(db, pattern):
     """Find the latest GENCODE table matching a pattern like 'wgEncodeGencodeComp%' or
     'wgEncodeGencodeComp%lift37'. Returns the table name with the highest version number."""
@@ -277,47 +278,60 @@
     print("hg19 genCC bed file completed. Total number of failed entries: "+str(n))
     if skippedCoords > 0:
         print("Warning: "+str(skippedCoords)+" hg19 entries skipped due to inverted coordinates")
 
 def checkIfUpdateIsNeeded():
     bash("wget -q 'https://thegencc.org/download/action/submissions-export-tsv?format=new' -O /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum = newMd5sum.split("  ")[0]
     oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv")
     oldMd5sum = oldMd5sum.split("  ")[0]
     if oldMd5sum != newMd5sum:
         return(True)
     else:
         return(False)
 
+def checkItemCounts(oldBb, newBb):
+    """Compare item counts between old and new bigBed files. Exit if difference > 10%."""
+    oldItemCount = bash('bigBedInfo '+oldBb+' | grep "itemCount"')
+    oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
+    newItemCount = bash('bigBedInfo '+newBb+' | grep "itemCount"')
+    newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
+    if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount):
+        sys.exit("Item count difference >10% for "+newBb+": old="+str(oldItemCount)+" new="+str(newItemCount))
+    print(oldBb+" old: "+str(oldItemCount)+" new: "+str(newItemCount))
+
 if checkIfUpdateIsNeeded():
     date = str(datetime.now()).split(" ")[0]
     workDir = "/hive/data/outside/otto/genCC/"+date
     bash("mkdir -p "+workDir)
     hg19outPutFile = workDir+"/hg19genCC.bed"
     hg38outPutFile = workDir+"/hg38genCC.bed"
     bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir)
     genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv"
 
     # Auto-detect latest GENCODE Comprehensive tables
     gencodeHg38 = getLatestGencodeTable('hg38', 'wgEncodeGencodeComp%')
     gencodeHg19 = getLatestGencodeTable('hg19', 'wgEncodeGencodeComp%lift37')
 
     buildFileHg38(genCCtsvFile, hg38outPutFile, gencodeHg38)
     buildFileHg19(hg38outPutFile, hg19outPutFile, gencodeHg19)
     bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv")
 
     bash("bedSort "+hg38outPutFile+" "+hg38outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg38outPutFile.split(".")[0]+".bb")
 
     bash("bedSort "+hg19outPutFile+" "+hg19outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg19outPutFile.split(".")[0]+".bb")
 
+    checkItemCounts("/gbdb/hg38/bbi/genCC.bb", hg38outPutFile.split(".")[0]+".bb")
+    checkItemCounts("/gbdb/hg19/bbi/genCC.bb", hg19outPutFile.split(".")[0]+".bb")
+
     bash("rm -f /gbdb/hg38/bbi/genCC.bb")
     bash("rm -f /gbdb/hg19/bbi/genCC.bb")
 
     bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb")
     bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb")
 
     print("GenCC updated.")