d3375c5d2c3d482c5739743c5548553b298ebf62 lrnassar Wed Feb 25 05:40:03 2026 -0800 Adding 10% diff check to GenCC which didn't have it, and a bad file from GenCC broke the track. No RM. diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py index 75ca7859b73..645eca1de1f 100644 --- src/hg/utils/otto/genCC/doGenCC.py +++ src/hg/utils/otto/genCC/doGenCC.py @@ -1,323 +1,337 @@ #!/usr/bin/env python3 #Made otto by Lou 9/14/2023 import subprocess import csv import re +import sys from datetime import datetime def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) def getLatestGencodeTable(db, pattern): """Find the latest GENCODE table matching a pattern like 'wgEncodeGencodeComp%' or 'wgEncodeGencodeComp%lift37'. Returns the table name with the highest version number.""" output = bash('hgsql -Ne "show tables like \''+pattern+'\'" '+db) tables = output.rstrip().split("\n") tables = [t for t in tables if t] # filter empty strings if not tables: raise RuntimeError("No GENCODE tables found matching '"+pattern+"' in "+db) # Extract version number from each table name and find the max best = None bestVersion = -1 for t in tables: match = re.search(r'V(\d+)', t) if match: version = int(match.group(1)) if version > bestVersion: bestVersion = version best = t if best is None: raise RuntimeError("Could not parse version from GENCODE tables in "+db) print("Using GENCODE table: "+best+" (version "+str(bestVersion)+") for "+db) return(best) def fetchGeneInfo(geneSymbol, table, dbs): """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene. Groups results by chromosome and picks the chromosome with the most transcripts to avoid mixing coordinates from multi-copy gene families.""" GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name2='"""+geneSymbol+"""'" """+dbs) GENCODEoutput = GENCODEoutput.rstrip().split("\n") try: # Group transcripts by chromosome, filtering out fix/alt/hap chromGroups = {} for geneEntry in GENCODEoutput: if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry): fields = geneEntry.split("\t") chrom = fields[1] if chrom not in chromGroups: chromGroups[chrom] = [] chromGroups[chrom].append(fields) if not chromGroups: print("Following gene symbol was not found: "+geneSymbol) return(None) # Pick the chromosome with the most transcripts bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) transcripts = chromGroups[bestChrom] finalGeneOutput = {} if len(transcripts) == 1: finalGeneOutput['ensTranscript'] = transcripts[0][0] else: finalGeneOutput['ensTranscript'] = "" finalGeneOutput['chrom'] = bestChrom finalGeneOutput['strand'] = transcripts[0][2] finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) return(finalGeneOutput) except Exception as e: print("Following gene symbol was not found: "+geneSymbol+" ("+str(e)+")") return(None) def fetchGeneInfoHg19(nmAccession, table, dbs): """Performs mysql search on ncbiRefSeq table and retrieves largest txStart/txEnd for entire gene. Groups results by chromosome and picks the chromosome with the most transcripts.""" GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name='"""+nmAccession+"""'" """+dbs) GENCODEoutput = GENCODEoutput.rstrip().split("\n") try: # Group transcripts by chromosome, filtering out fix/alt/hap chromGroups = {} for geneEntry in GENCODEoutput: if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry): fields = geneEntry.split("\t") chrom = fields[1] if chrom not in chromGroups: chromGroups[chrom] = [] chromGroups[chrom].append(fields) if not chromGroups: print("Following refSeq accession was not found: "+nmAccession) return(None) # Pick the chromosome with the most transcripts bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) transcripts = chromGroups[bestChrom] finalGeneOutput = {} if len(transcripts) == 1: finalGeneOutput['ensTranscript'] = transcripts[0][0] else: finalGeneOutput['ensTranscript'] = "" finalGeneOutput['chrom'] = bestChrom finalGeneOutput['strand'] = transcripts[0][2] finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) return(finalGeneOutput) except Exception as e: print("Following refSeq accession was not found: "+nmAccession+" ("+str(e)+")") return(None) def fetchGeneInfoKnownGene(geneSymbol, dbs): """Fallback lookup using knownGene + kgXref join. Catches genes missing from GENCODE Comprehensive (pseudogenes, recently added genes, etc.).""" output = bash("""hgsql -Ne "select kg.name,kg.chrom,kg.strand,kg.txStart,kg.txEnd from knownGene kg, kgXref kx where kg.name=kx.kgID and kx.geneSymbol='"""+geneSymbol+"""'" """+dbs) output = output.rstrip().split("\n") try: chromGroups = {} for entry in output: if "fix" not in str(entry) and "alt" not in str(entry) and "hap" not in str(entry): fields = entry.split("\t") chrom = fields[1] if chrom not in chromGroups: chromGroups[chrom] = [] chromGroups[chrom].append(fields) if not chromGroups: return(None) bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) transcripts = chromGroups[bestChrom] finalGeneOutput = {} if len(transcripts) == 1: finalGeneOutput['ensTranscript'] = transcripts[0][0] else: finalGeneOutput['ensTranscript'] = "" finalGeneOutput['chrom'] = bestChrom finalGeneOutput['strand'] = transcripts[0][2] finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) return(finalGeneOutput) except Exception: return(None) def makeManeDic(): """Iterates through latest mane file and builds dictionary out of gene symbols and values""" bash("bigBedToBed /gbdb/hg38/mane/mane.bb /hive/data/outside/otto/genCC/mane.bed") maneGeneSymbolCoords = {} with open("/hive/data/outside/otto/genCC/mane.bed",'r') as openFile: for line in openFile: line = line.rstrip() line = line.split("\t") geneSymbol = line[18] if geneSymbol not in maneGeneSymbolCoords: maneGeneSymbolCoords[geneSymbol] = {} maneGeneSymbolCoords[geneSymbol]['chrom'] = line[0] maneGeneSymbolCoords[geneSymbol]['chromStart'] = line[1] maneGeneSymbolCoords[geneSymbol]['chromEnd'] = line[2] maneGeneSymbolCoords[geneSymbol]['strand'] = line[5] maneGeneSymbolCoords[geneSymbol]['ensGene'] = line[17] maneGeneSymbolCoords[geneSymbol]['ensTranscript'] = line[3] maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] = line[21] bash("rm -f /hive/data/outside/otto/genCC/mane.bed") return(maneGeneSymbolCoords) def assignRGBcolorByEvidenceClassification(classification): if classification == "Definitive": classificationRgb = "56,161,105" elif classification == "Disputed Evidence": classificationRgb = "229,62,62" elif classification == "Limited": classificationRgb = "252,129,129" elif classification == "Moderate": classificationRgb = "104,211,145" elif classification == "No Known Disease Relationship": classificationRgb = "113,128,150" elif classification == "Refuted Evidence": classificationRgb = "155,44,44" elif classification == "Strong": classificationRgb = "39,103,73" elif classification == "Supportive": classificationRgb = "99,179,237" else: print("There was an unknown classification: "+classification) classificationRgb = "0,0,0" return(classificationRgb) def writeHg38Line(outputFile, chrom, chromStart, chromEnd, genCCname, strand, classificationRgb, ensTranscript, ensGene, refSeqAccession, line): """Write a single hg38 bed line, validating coordinates first.""" if int(chromStart) > int(chromEnd): print("Skipping record with inverted coordinates: "+genCCname+" "+chrom+":"+chromStart+"-"+chromEnd) return(False) # Sanitize fields: replace embedded newlines and tabs with spaces for bed format compatibility sanitized = [field.replace("\n", " ").replace("\r", " ").replace("\t", " ") for field in line] outputFile.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname, strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(sanitized))) return(True) def buildFileHg38(genCCfile, outPutFile, gencodeTable): n=0 maneGeneSymbolCoords = makeManeDic() with open(genCCfile, newline='', encoding="utf-8") as inputGenCCFile, \ open(outPutFile, 'w', encoding='utf-8') as outputHg38File: reader = csv.reader(inputGenCCFile, delimiter='\t', quotechar='"') header = next(reader) for line in reader: if len(line) != 31: print("Skipping malformed row with "+str(len(line))+" fields: "+str(line[:3])) continue geneSymbol = line[3] genCCname = line[3]+" "+line[6] classificationRgb = assignRGBcolorByEvidenceClassification(line[9]) try: chrom = maneGeneSymbolCoords[geneSymbol]['chrom'] chromStart = maneGeneSymbolCoords[geneSymbol]['chromStart'] chromEnd = maneGeneSymbolCoords[geneSymbol]['chromEnd'] strand = maneGeneSymbolCoords[geneSymbol]['strand'] ensGene = maneGeneSymbolCoords[geneSymbol]['ensGene'] ensTranscript = maneGeneSymbolCoords[geneSymbol]['ensTranscript'] refSeqAccession = maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] writeHg38Line(outputHg38File, chrom, chromStart, chromEnd, genCCname, strand, classificationRgb, ensTranscript, ensGene, refSeqAccession, line) except Exception: # Fallback 1: GENCODE Comprehensive geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg38') if geneDic is not None: wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'], geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb, geneDic['ensTranscript'], "", "", line) if wrote: continue # Fallback 2: knownGene + kgXref geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg38') if geneDic is not None: wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'], geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb, geneDic['ensTranscript'], "", "", line) if wrote: continue n+=1 print("No hg38 match for: "+geneSymbol) print("\nhg38 genCC bed file completed. Total number of failed entries: "+str(n)) def buildFileHg19(genCCfile, outPutFile, gencodeTable): n=0 skippedCoords=0 with open(genCCfile, 'r', encoding="utf-8") as hg38GenCCbedFile, \ open(outPutFile, 'w', encoding='utf-8') as outputHg19File: for line in hg38GenCCbedFile: line = line.rstrip() line = line.split("\t") geneSymbol = line[15] nmAccession = line[11] geneDic = None if nmAccession != "": geneDic = fetchGeneInfoHg19(nmAccession, 'ncbiRefSeq', 'hg19') if geneDic is None: geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg19') if geneDic is None: geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg19') if geneDic is None: n+=1 print("No hg19 match for: "+geneSymbol) continue chrom = geneDic['chrom'] chromStart = geneDic['txStart'] chromEnd = geneDic['txEnd'] # Validate coordinates before writing if int(chromStart) > int(chromEnd): skippedCoords+=1 print("Skipping hg19 record with inverted coordinates: "+geneSymbol+" "+chrom+":"+chromStart+"-"+chromEnd) continue outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd, "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:]))) print("hg19 genCC bed file completed. Total number of failed entries: "+str(n)) if skippedCoords > 0: print("Warning: "+str(skippedCoords)+" hg19 entries skipped due to inverted coordinates") def checkIfUpdateIsNeeded(): bash("wget -q 'https://thegencc.org/download/action/submissions-export-tsv?format=new' -O /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = newMd5sum.split(" ")[0] oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv") oldMd5sum = oldMd5sum.split(" ")[0] if oldMd5sum != newMd5sum: return(True) else: return(False) +def checkItemCounts(oldBb, newBb): + """Compare item counts between old and new bigBed files. Exit if difference > 10%.""" + oldItemCount = bash('bigBedInfo '+oldBb+' | grep "itemCount"') + oldItemCount = int(oldItemCount.rstrip().split("itemCount: ")[1].replace(",","")) + newItemCount = bash('bigBedInfo '+newBb+' | grep "itemCount"') + newItemCount = int(newItemCount.rstrip().split("itemCount: ")[1].replace(",","")) + if abs(newItemCount - oldItemCount) > 0.1 * max(newItemCount, oldItemCount): + sys.exit("Item count difference >10% for "+newBb+": old="+str(oldItemCount)+" new="+str(newItemCount)) + print(oldBb+" old: "+str(oldItemCount)+" new: "+str(newItemCount)) + if checkIfUpdateIsNeeded(): date = str(datetime.now()).split(" ")[0] workDir = "/hive/data/outside/otto/genCC/"+date bash("mkdir -p "+workDir) hg19outPutFile = workDir+"/hg19genCC.bed" hg38outPutFile = workDir+"/hg38genCC.bed" bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir) genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv" # Auto-detect latest GENCODE Comprehensive tables gencodeHg38 = getLatestGencodeTable('hg38', 'wgEncodeGencodeComp%') gencodeHg19 = getLatestGencodeTable('hg19', 'wgEncodeGencodeComp%lift37') buildFileHg38(genCCtsvFile, hg38outPutFile, gencodeHg38) buildFileHg19(hg38outPutFile, hg19outPutFile, gencodeHg19) bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv") bash("bedSort "+hg38outPutFile+" "+hg38outPutFile) bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb") print("Final file created: "+hg38outPutFile.split(".")[0]+".bb") bash("bedSort "+hg19outPutFile+" "+hg19outPutFile) bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb") print("Final file created: "+hg19outPutFile.split(".")[0]+".bb") + checkItemCounts("/gbdb/hg38/bbi/genCC.bb", hg38outPutFile.split(".")[0]+".bb") + checkItemCounts("/gbdb/hg19/bbi/genCC.bb", hg19outPutFile.split(".")[0]+".bb") + bash("rm -f /gbdb/hg38/bbi/genCC.bb") bash("rm -f /gbdb/hg19/bbi/genCC.bb") bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb") bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb") print("GenCC updated.")