d3375c5d2c3d482c5739743c5548553b298ebf62 lrnassar Wed Feb 25 05:40:03 2026 -0800 Adding 10% diff check to GenCC which didn't have it, and a bad file from GenCC broke the track. No RM. diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py index 75ca7859b73..645eca1de1f 100644 --- src/hg/utils/otto/genCC/doGenCC.py +++ src/hg/utils/otto/genCC/doGenCC.py @@ -1,21 +1,22 @@ #!/usr/bin/env python3 #Made otto by Lou 9/14/2023 import subprocess import csv import re +import sys from datetime import datetime def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) def getLatestGencodeTable(db, pattern): """Find the latest GENCODE table matching a pattern like 'wgEncodeGencodeComp%' or 'wgEncodeGencodeComp%lift37'. Returns the table name with the highest version number.""" @@ -277,47 +278,60 @@ print("hg19 genCC bed file completed. Total number of failed entries: "+str(n)) if skippedCoords > 0: print("Warning: "+str(skippedCoords)+" hg19 entries skipped due to inverted coordinates") def checkIfUpdateIsNeeded(): bash("wget -q 'https://thegencc.org/download/action/submissions-export-tsv?format=new' -O /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = newMd5sum.split(" ")[0] oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv") oldMd5sum = oldMd5sum.split(" ")[0] if oldMd5sum != newMd5sum: return(True) else: return(False) +def checkItemCounts(oldBb, newBb): + """Compare item counts between old and new bigBed files. Exit if difference > 10%.""" + oldItemCount = bash('bigBedInfo '+oldBb+' | grep "itemCount"') + oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",","")) + newItemCount = bash('bigBedInfo '+newBb+' | grep "itemCount"') + newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",","")) + if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount): + sys.exit("Item count difference >10% for "+newBb+": old="+str(oldItemCount)+" new="+str(newItemCount)) + print(oldBb+" old: "+str(oldItemCount)+" new: "+str(newItemCount)) + if checkIfUpdateIsNeeded(): date = str(datetime.now()).split(" ")[0] workDir = "/hive/data/outside/otto/genCC/"+date bash("mkdir -p "+workDir) hg19outPutFile = workDir+"/hg19genCC.bed" hg38outPutFile = workDir+"/hg38genCC.bed" bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir) genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv" # Auto-detect latest GENCODE Comprehensive tables gencodeHg38 = getLatestGencodeTable('hg38', 'wgEncodeGencodeComp%') gencodeHg19 = getLatestGencodeTable('hg19', 'wgEncodeGencodeComp%lift37') buildFileHg38(genCCtsvFile, hg38outPutFile, gencodeHg38) buildFileHg19(hg38outPutFile, hg19outPutFile, gencodeHg19) bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv") bash("bedSort "+hg38outPutFile+" "+hg38outPutFile) bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb") print("Final file created: "+hg38outPutFile.split(".")[0]+".bb") bash("bedSort "+hg19outPutFile+" "+hg19outPutFile) bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb") print("Final file created: "+hg19outPutFile.split(".")[0]+".bb") + checkItemCounts("/gbdb/hg38/bbi/genCC.bb", hg38outPutFile.split(".")[0]+".bb") + checkItemCounts("/gbdb/hg19/bbi/genCC.bb", hg19outPutFile.split(".")[0]+".bb") + bash("rm -f /gbdb/hg38/bbi/genCC.bb") bash("rm -f /gbdb/hg19/bbi/genCC.bb") bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb") bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb") print("GenCC updated.")