src/hg/utils/otto/genCC/doGenCC.py 7bf5a5c3241a66ba15f9763638a04d072f4b9b76

7bf5a5c3241a66ba15f9763638a04d072f4b9b76
jnavarr5
  Fri Apr 24 12:22:11 2026 -0700
Moving the 'if args.force' condition to the begining of the function since the md5sum checks are not needed if we are going to run the update manually. Addressing feedback from code review, refs #37417

diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py
index 501a6c28cec..bcedda22fd2 100644
--- src/hg/utils/otto/genCC/doGenCC.py
+++ src/hg/utils/otto/genCC/doGenCC.py
@@ -1,353 +1,353 @@
 #!/usr/bin/env python3
 #Made otto by Lou 9/14/2023
 
 import argparse
 import subprocess
 import csv
 import re
 import sys
 from datetime import datetime
 
 parser = argparse.ArgumentParser(description='Build and update GenCC track data.')
 parser.add_argument('--force', action='store_true',
                     help='Continue even if item count difference is more than 10%%')
 args = parser.parse_args()
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def getLatestGencodeTable(db, pattern):
     """Find the latest GENCODE table matching a pattern like 'wgEncodeGencodeComp%' or
     'wgEncodeGencodeComp%lift37'. Returns the table name with the highest version number."""
     output = bash('hgsql -Ne "show tables like \''+pattern+'\'" '+db)
     tables = output.rstrip().split("\n")
     tables = [t for t in tables if t]  # filter empty strings
     if not tables:
         raise RuntimeError("No GENCODE tables found matching '"+pattern+"' in "+db)
     # Extract version number from each table name and find the max
     best = None
     bestVersion = -1
     for t in tables:
         match = re.search(r'V(\d+)', t)
         if match:
             version = int(match.group(1))
             if version > bestVersion:
                 bestVersion = version
                 best = t
     if best is None:
         raise RuntimeError("Could not parse version from GENCODE tables in "+db)
     print("Using GENCODE table: "+best+" (version "+str(bestVersion)+") for "+db)
     return(best)
 
 def fetchGeneInfo(geneSymbol, table, dbs):
     """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene.
     Groups results by chromosome and picks the chromosome with the most transcripts to avoid
     mixing coordinates from multi-copy gene families."""
     GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name2='"""+geneSymbol+"""'" """+dbs)
     GENCODEoutput = GENCODEoutput.rstrip().split("\n")
     try:
         # Group transcripts by chromosome, filtering out fix/alt/hap
         chromGroups = {}
         for geneEntry in GENCODEoutput:
             if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry):
                 fields = geneEntry.split("\t")
                 chrom = fields[1]
                 if chrom not in chromGroups:
                     chromGroups[chrom] = []
                 chromGroups[chrom].append(fields)
         if not chromGroups:
             print("Following gene symbol was not found: "+geneSymbol)
             return(None)
         # Pick the chromosome with the most transcripts
         bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c]))
         transcripts = chromGroups[bestChrom]
         finalGeneOutput = {}
         if len(transcripts) == 1:
             finalGeneOutput['ensTranscript'] = transcripts[0][0]
         else:
             finalGeneOutput['ensTranscript'] = ""
         finalGeneOutput['chrom'] = bestChrom
         finalGeneOutput['strand'] = transcripts[0][2]
         finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts))
         finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts))
         return(finalGeneOutput)
     except Exception as e:
         print("Following gene symbol was not found: "+geneSymbol+" ("+str(e)+")")
         return(None)
 
 def fetchGeneInfoHg19(nmAccession, table, dbs):
     """Performs mysql search on ncbiRefSeq table and retrieves largest txStart/txEnd for entire gene.
     Groups results by chromosome and picks the chromosome with the most transcripts."""
     GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name='"""+nmAccession+"""'" """+dbs)
     GENCODEoutput = GENCODEoutput.rstrip().split("\n")
     try:
         # Group transcripts by chromosome, filtering out fix/alt/hap
         chromGroups = {}
         for geneEntry in GENCODEoutput:
             if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry):
                 fields = geneEntry.split("\t")
                 chrom = fields[1]
                 if chrom not in chromGroups:
                     chromGroups[chrom] = []
                 chromGroups[chrom].append(fields)
         if not chromGroups:
             print("Following refSeq accession was not found: "+nmAccession)
             return(None)
         # Pick the chromosome with the most transcripts
         bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c]))
         transcripts = chromGroups[bestChrom]
         finalGeneOutput = {}
         if len(transcripts) == 1:
             finalGeneOutput['ensTranscript'] = transcripts[0][0]
         else:
             finalGeneOutput['ensTranscript'] = ""
         finalGeneOutput['chrom'] = bestChrom
         finalGeneOutput['strand'] = transcripts[0][2]
         finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts))
         finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts))
         return(finalGeneOutput)
     except Exception as e:
         print("Following refSeq accession was not found: "+nmAccession+" ("+str(e)+")")
         return(None)
 
 def fetchGeneInfoKnownGene(geneSymbol, dbs):
     """Fallback lookup using knownGene + kgXref join. Catches genes missing from GENCODE
     Comprehensive (pseudogenes, recently added genes, etc.)."""
     output = bash("""hgsql -Ne "select kg.name,kg.chrom,kg.strand,kg.txStart,kg.txEnd from knownGene kg, kgXref kx where kg.name=kx.kgID and kx.geneSymbol='"""+geneSymbol+"""'" """+dbs)
     output = output.rstrip().split("\n")
     try:
         chromGroups = {}
         for entry in output:
             if "fix" not in str(entry) and "alt" not in str(entry) and "hap" not in str(entry):
                 fields = entry.split("\t")
                 chrom = fields[1]
                 if chrom not in chromGroups:
                     chromGroups[chrom] = []
                 chromGroups[chrom].append(fields)
         if not chromGroups:
             return(None)
         bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c]))
         transcripts = chromGroups[bestChrom]
         finalGeneOutput = {}
         if len(transcripts) == 1:
             finalGeneOutput['ensTranscript'] = transcripts[0][0]
         else:
             finalGeneOutput['ensTranscript'] = ""
         finalGeneOutput['chrom'] = bestChrom
         finalGeneOutput['strand'] = transcripts[0][2]
         finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts))
         finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts))
         return(finalGeneOutput)
     except Exception:
         return(None)
 
 def makeManeDic():
     """Iterates through latest mane file and builds dictionary out of gene symbols and values"""
     bash("bigBedToBed /gbdb/hg38/mane/mane.bb /hive/data/outside/otto/genCC/mane.bed")
     maneGeneSymbolCoords = {}
     with open("/hive/data/outside/otto/genCC/mane.bed",'r') as openFile:
         for line in openFile:
             line = line.rstrip()
             line = line.split("\t")
             geneSymbol = line[18]
             if geneSymbol not in maneGeneSymbolCoords:
                 maneGeneSymbolCoords[geneSymbol] = {}
                 maneGeneSymbolCoords[geneSymbol]['chrom'] = line[0]
                 maneGeneSymbolCoords[geneSymbol]['chromStart'] = line[1]
                 maneGeneSymbolCoords[geneSymbol]['chromEnd'] = line[2]
                 maneGeneSymbolCoords[geneSymbol]['strand'] = line[5]
                 maneGeneSymbolCoords[geneSymbol]['ensGene'] = line[17]
                 maneGeneSymbolCoords[geneSymbol]['ensTranscript'] = line[3]
                 maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] = line[21]
     bash("rm -f /hive/data/outside/otto/genCC/mane.bed")
     return(maneGeneSymbolCoords)
 
 def assignRGBcolorByEvidenceClassification(classification):
     if classification == "Definitive":
         classificationRgb = "56,161,105"
     elif classification == "Disputed Evidence":
         classificationRgb = "229,62,62"
     elif classification == "Limited":
         classificationRgb = "252,129,129"
     elif classification == "Moderate":
         classificationRgb = "104,211,145"
     elif classification == "No Known Disease Relationship":
         classificationRgb = "113,128,150"
     elif classification == "Refuted Evidence":
         classificationRgb = "155,44,44"
     elif classification == "Strong":
         classificationRgb = "39,103,73"
     elif classification == "Supportive":
         classificationRgb = "99,179,237"
     else:
         print("There was an unknown classification: "+classification)
         classificationRgb = "0,0,0"
     return(classificationRgb)
 
 def writeHg38Line(outputFile, chrom, chromStart, chromEnd, genCCname, strand,
                   classificationRgb, ensTranscript, ensGene, refSeqAccession, line):
     """Write a single hg38 bed line, validating coordinates first."""
     if int(chromStart) > int(chromEnd):
         print("Skipping record with inverted coordinates: "+genCCname+" "+chrom+":"+chromStart+"-"+chromEnd)
         return(False)
     # Sanitize fields: replace embedded newlines and tabs with spaces for bed format compatibility
     sanitized = [field.replace("\n", " ").replace("\r", " ").replace("\t", " ") for field in line]
     outputFile.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname,
         strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(sanitized)))
     return(True)
 
 def buildFileHg38(genCCfile, outPutFile, gencodeTable):
     n=0
     maneGeneSymbolCoords = makeManeDic()
     with open(genCCfile, newline='', encoding="utf-8") as inputGenCCFile, \
          open(outPutFile, 'w', encoding='utf-8') as outputHg38File:
         reader = csv.reader(inputGenCCFile, delimiter='\t', quotechar='"')
         header = next(reader)
         for line in reader:
             if len(line) != 31:
                 print("Skipping malformed row with "+str(len(line))+" fields: "+str(line[:3]))
                 continue
             geneSymbol = line[3]
             genCCname = line[3]+" "+line[6]
             classificationRgb = assignRGBcolorByEvidenceClassification(line[9])
             try:
                 chrom = maneGeneSymbolCoords[geneSymbol]['chrom']
                 chromStart = maneGeneSymbolCoords[geneSymbol]['chromStart']
                 chromEnd = maneGeneSymbolCoords[geneSymbol]['chromEnd']
                 strand = maneGeneSymbolCoords[geneSymbol]['strand']
                 ensGene = maneGeneSymbolCoords[geneSymbol]['ensGene']
                 ensTranscript = maneGeneSymbolCoords[geneSymbol]['ensTranscript']
                 refSeqAccession = maneGeneSymbolCoords[geneSymbol]['refSeqAccession']
                 writeHg38Line(outputHg38File, chrom, chromStart, chromEnd, genCCname,
                     strand, classificationRgb, ensTranscript, ensGene, refSeqAccession, line)
             except Exception:
                 # Fallback 1: GENCODE Comprehensive
                 geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg38')
                 if geneDic is not None:
                     wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'],
                         geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb,
                         geneDic['ensTranscript'], "", "", line)
                     if wrote:
                         continue
                 # Fallback 2: knownGene + kgXref
                 geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg38')
                 if geneDic is not None:
                     wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'],
                         geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb,
                         geneDic['ensTranscript'], "", "", line)
                     if wrote:
                         continue
                 n+=1
                 print("No hg38 match for: "+geneSymbol)
 
     print("\nhg38 genCC bed file completed. Total number of failed entries: "+str(n))
 
 def buildFileHg19(genCCfile, outPutFile, gencodeTable):
     n=0
     skippedCoords=0
     with open(genCCfile, 'r', encoding="utf-8") as hg38GenCCbedFile, \
          open(outPutFile, 'w', encoding='utf-8') as outputHg19File:
         for line in hg38GenCCbedFile:
             line = line.rstrip()
             line = line.split("\t")
             geneSymbol = line[15]
             nmAccession = line[11]
             geneDic = None
             if nmAccession != "":
                 geneDic = fetchGeneInfoHg19(nmAccession, 'ncbiRefSeq', 'hg19')
             if geneDic is None:
                 geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg19')
             if geneDic is None:
                 geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg19')
             if geneDic is None:
                 n+=1
                 print("No hg19 match for: "+geneSymbol)
                 continue
             chrom = geneDic['chrom']
             chromStart = geneDic['txStart']
             chromEnd = geneDic['txEnd']
             # Validate coordinates before writing
             if int(chromStart) > int(chromEnd):
                 skippedCoords+=1
                 print("Skipping hg19 record with inverted coordinates: "+geneSymbol+" "+chrom+":"+chromStart+"-"+chromEnd)
                 continue
             outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,
                                     "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:])))
 
     print("hg19 genCC bed file completed. Total number of failed entries: "+str(n))
     if skippedCoords > 0:
         print("Warning: "+str(skippedCoords)+" hg19 entries skipped due to inverted coordinates")
 
 def checkIfUpdateIsNeeded():
     bash("wget -q 'https://thegencc.org/download/action/submissions-export-tsv?format=new' -O /hive/data/outside/otto/genCC/newSubmission.tsv")
+    if args.force:
+        # Previous file matches current file, but update didn't go through
+        # because of itemCounts check.
+        return(True)
     newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum = newMd5sum.split("  ")[0]
     oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv")
     oldMd5sum = oldMd5sum.split("  ")[0]
     if oldMd5sum != newMd5sum:
         return(True)
-    elif args.force:
-        # Previous file matches current file, but update didn't go through
-        # because of itemCounts check.
-        return(True)
     else:
         return(False)
 
 def checkItemCounts(oldBb, newBb):
     """Compare item counts between old and new bigBed files. Exit if difference > 10%."""
     oldItemCount = bash('bigBedInfo '+oldBb+' | grep "itemCount"')
     oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
     newItemCount = bash('bigBedInfo '+newBb+' | grep "itemCount"')
     newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",",""))
     if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount):
         msg = "WARNING:\nItem count difference >10% for "+newBb+":\nold="+str(oldItemCount)+" new="+str(newItemCount)
         print(msg)
         if args.force:
             print("(continuing due to QA approval)")
         else:
             sys.exit("Run ./doGenCC.py --force if you approve the item count change.")
     print(oldBb+" old: "+str(oldItemCount)+" new: "+str(newItemCount))
 
 if checkIfUpdateIsNeeded():
     date = str(datetime.now()).split(" ")[0]
     workDir = "/hive/data/outside/otto/genCC/"+date
     bash("mkdir -p "+workDir)
     hg19outPutFile = workDir+"/hg19genCC.bed"
     hg38outPutFile = workDir+"/hg38genCC.bed"
     bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir)
     genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv"
 
     # Auto-detect latest GENCODE Comprehensive tables
     gencodeHg38 = getLatestGencodeTable('hg38', 'wgEncodeGencodeComp%')
     gencodeHg19 = getLatestGencodeTable('hg19', 'wgEncodeGencodeComp%lift37')
 
     buildFileHg38(genCCtsvFile, hg38outPutFile, gencodeHg38)
     buildFileHg19(hg38outPutFile, hg19outPutFile, gencodeHg19)
 
     bash("bedSort "+hg38outPutFile+" "+hg38outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg38outPutFile.split(".")[0]+".bb")
 
     bash("bedSort "+hg19outPutFile+" "+hg19outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg19outPutFile.split(".")[0]+".bb")
 
     checkItemCounts("/gbdb/hg38/bbi/genCC.bb", hg38outPutFile.split(".")[0]+".bb")
     checkItemCounts("/gbdb/hg19/bbi/genCC.bb", hg19outPutFile.split(".")[0]+".bb")
 
     bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv")
 
     bash("rm -f /gbdb/hg38/bbi/genCC.bb")
     bash("rm -f /gbdb/hg19/bbi/genCC.bb")
 
     bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb")
     bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb")
 
     print("GenCC updated.")