1331fff6434827b6c138f9e18f3b5491917c63b0 lrnassar Fri Feb 20 17:31:34 2026 -0800 Many updates to the GenCC pipeline. 1 It was using GencodeV40 hardcoded; now it uses the latest GENCODE table. 2. Better handling of the comment field of the tsv, which used to include even HTML, so I wasn't parsing it correctly, leading to some items being excluded from the track. 3. Fixed a bug where I was treating numbers as strings, leading to a few corner cases (only came up in this latest GenCC update) where the coordinates were wrong. 4. GenCC emailed us that they are updating their format, much to our chagrin, since we usually just extend fields at the end to not break pipelines. I just adopted their new format, which changed the UUID to a versioned ID and added a version number field. So, overall, the field count went up by 1. 5. Handle multi-chrom genes (only a few) by picking the chrom with the most transcripts. 6. Add a fallback to look up genes missing from GENCODE by checking kgXref. Refs #37135 diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py index a898b31c08f..75ca7859b73 100644 --- src/hg/utils/otto/genCC/doGenCC.py +++ src/hg/utils/otto/genCC/doGenCC.py @@ -1,256 +1,323 @@ #!/usr/bin/env python3 #Made otto by Lou 9/14/2023 import subprocess +import csv +import re from datetime import datetime def bash(cmd): """Run the cmd in bash subprocess""" try: rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) bashStdoutt = rawBashOutput.stdout except subprocess.CalledProcessError as e: raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) return(bashStdoutt) +def getLatestGencodeTable(db, pattern): + """Find the latest GENCODE table matching a pattern like 'wgEncodeGencodeComp%' or + 'wgEncodeGencodeComp%lift37'. Returns the table name with the highest version number.""" + output = bash('hgsql -Ne "show tables like \''+pattern+'\'" '+db) + tables = output.rstrip().split("\n") + tables = [t for t in tables if t] # filter empty strings + if not tables: + raise RuntimeError("No GENCODE tables found matching '"+pattern+"' in "+db) + # Extract version number from each table name and find the max + best = None + bestVersion = -1 + for t in tables: + match = re.search(r'V(\d+)', t) + if match: + version = int(match.group(1)) + if version > bestVersion: + bestVersion = version + best = t + if best is None: + raise RuntimeError("Could not parse version from GENCODE tables in "+db) + print("Using GENCODE table: "+best+" (version "+str(bestVersion)+") for "+db) + return(best) + def fetchGeneInfo(geneSymbol, table, dbs): - """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene""" - buildGene = {} - finalGeneOutput = {} - buildGene['txStart'] = [] - buildGene['txEnd'] = [] + """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene. + Groups results by chromosome and picks the chromosome with the most transcripts to avoid + mixing coordinates from multi-copy gene families.""" GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name2='"""+geneSymbol+"""'" """+dbs) GENCODEoutput = GENCODEoutput.rstrip().split("\n") try: + # Group transcripts by chromosome, filtering out fix/alt/hap + chromGroups = {} for geneEntry in GENCODEoutput: if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry): - geneEntry = geneEntry.split("\t") - if len(GENCODEoutput) == 1: - finalGeneOutput['ensTranscript'] = geneEntry[0] + fields = geneEntry.split("\t") + chrom = fields[1] + if chrom not in chromGroups: + chromGroups[chrom] = [] + chromGroups[chrom].append(fields) + if not chromGroups: + print("Following gene symbol was not found: "+geneSymbol) + return(None) + # Pick the chromosome with the most transcripts + bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) + transcripts = chromGroups[bestChrom] + finalGeneOutput = {} + if len(transcripts) == 1: + finalGeneOutput['ensTranscript'] = transcripts[0][0] else: finalGeneOutput['ensTranscript'] = "" - buildGene['chrom'] = geneEntry[1] - buildGene['strand'] = geneEntry[2] - buildGene['txStart'].append(geneEntry[3]) - buildGene['txEnd'].append(geneEntry[4]) - finalGeneOutput['chrom'] = buildGene['chrom'] - finalGeneOutput['strand'] = buildGene['strand'] - finalGeneOutput['txStart'] = min(buildGene['txStart']) - finalGeneOutput['txEnd'] = max(buildGene['txEnd']) + finalGeneOutput['chrom'] = bestChrom + finalGeneOutput['strand'] = transcripts[0][2] + finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) + finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) return(finalGeneOutput) - except: - print("Following gene symbol was not found:"+ geneSymbol) + except Exception as e: + print("Following gene symbol was not found: "+geneSymbol+" ("+str(e)+")") + return(None) def fetchGeneInfoHg19(nmAccession, table, dbs): - """Performs mysql search on ncbiRefSeq table and retrieves largest txStart/txEnd for entire gene""" - buildGene = {} - finalGeneOutput = {} - buildGene['txStart'] = [] - buildGene['txEnd'] = [] + """Performs mysql search on ncbiRefSeq table and retrieves largest txStart/txEnd for entire gene. + Groups results by chromosome and picks the chromosome with the most transcripts.""" GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name='"""+nmAccession+"""'" """+dbs) GENCODEoutput = GENCODEoutput.rstrip().split("\n") try: + # Group transcripts by chromosome, filtering out fix/alt/hap + chromGroups = {} for geneEntry in GENCODEoutput: if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry): - geneEntry = geneEntry.split("\t") - if len(GENCODEoutput) == 1: - finalGeneOutput['ensTranscript'] = geneEntry[0] + fields = geneEntry.split("\t") + chrom = fields[1] + if chrom not in chromGroups: + chromGroups[chrom] = [] + chromGroups[chrom].append(fields) + if not chromGroups: + print("Following refSeq accession was not found: "+nmAccession) + return(None) + # Pick the chromosome with the most transcripts + bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) + transcripts = chromGroups[bestChrom] + finalGeneOutput = {} + if len(transcripts) == 1: + finalGeneOutput['ensTranscript'] = transcripts[0][0] else: finalGeneOutput['ensTranscript'] = "" - buildGene['chrom'] = geneEntry[1] - buildGene['strand'] = geneEntry[2] - buildGene['txStart'].append(geneEntry[3]) - buildGene['txEnd'].append(geneEntry[4]) - finalGeneOutput['chrom'] = buildGene['chrom'] - finalGeneOutput['strand'] = buildGene['strand'] - finalGeneOutput['txStart'] = min(buildGene['txStart']) - finalGeneOutput['txEnd'] = max(buildGene['txEnd']) + finalGeneOutput['chrom'] = bestChrom + finalGeneOutput['strand'] = transcripts[0][2] + finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) + finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) return(finalGeneOutput) - except: - print("Following refSeq accession was not found:"+ nmAccession) + except Exception as e: + print("Following refSeq accession was not found: "+nmAccession+" ("+str(e)+")") + return(None) + +def fetchGeneInfoKnownGene(geneSymbol, dbs): + """Fallback lookup using knownGene + kgXref join. Catches genes missing from GENCODE + Comprehensive (pseudogenes, recently added genes, etc.).""" + output = bash("""hgsql -Ne "select kg.name,kg.chrom,kg.strand,kg.txStart,kg.txEnd from knownGene kg, kgXref kx where kg.name=kx.kgID and kx.geneSymbol='"""+geneSymbol+"""'" """+dbs) + output = output.rstrip().split("\n") + try: + chromGroups = {} + for entry in output: + if "fix" not in str(entry) and "alt" not in str(entry) and "hap" not in str(entry): + fields = entry.split("\t") + chrom = fields[1] + if chrom not in chromGroups: + chromGroups[chrom] = [] + chromGroups[chrom].append(fields) + if not chromGroups: + return(None) + bestChrom = max(chromGroups, key=lambda c: len(chromGroups[c])) + transcripts = chromGroups[bestChrom] + finalGeneOutput = {} + if len(transcripts) == 1: + finalGeneOutput['ensTranscript'] = transcripts[0][0] + else: + finalGeneOutput['ensTranscript'] = "" + finalGeneOutput['chrom'] = bestChrom + finalGeneOutput['strand'] = transcripts[0][2] + finalGeneOutput['txStart'] = str(min(int(t[3]) for t in transcripts)) + finalGeneOutput['txEnd'] = str(max(int(t[4]) for t in transcripts)) + return(finalGeneOutput) + except Exception: + return(None) def makeManeDic(): """Iterates through latest mane file and builds dictionary out of gene symbols and values""" bash("bigBedToBed /gbdb/hg38/mane/mane.bb /hive/data/outside/otto/genCC/mane.bed") - openFile = open("/hive/data/outside/otto/genCC/mane.bed",'r') maneGeneSymbolCoords = {} + with open("/hive/data/outside/otto/genCC/mane.bed",'r') as openFile: for line in openFile: line = line.rstrip() line = line.split("\t") geneSymbol = line[18] - if geneSymbol not in maneGeneSymbolCoords.keys(): + if geneSymbol not in maneGeneSymbolCoords: maneGeneSymbolCoords[geneSymbol] = {} maneGeneSymbolCoords[geneSymbol]['chrom'] = line[0] maneGeneSymbolCoords[geneSymbol]['chromStart'] = line[1] maneGeneSymbolCoords[geneSymbol]['chromEnd'] = line[2] maneGeneSymbolCoords[geneSymbol]['strand'] = line[5] maneGeneSymbolCoords[geneSymbol]['ensGene'] = line[17] maneGeneSymbolCoords[geneSymbol]['ensTranscript'] = line[3] maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] = line[21] - openFile.close() bash("rm -f /hive/data/outside/otto/genCC/mane.bed") return(maneGeneSymbolCoords) def assignRGBcolorByEvidenceClassification(classification): if classification == "Definitive": classificationRgb = "56,161,105" elif classification == "Disputed Evidence": classificationRgb = "229,62,62" elif classification == "Limited": classificationRgb = "252,129,129" elif classification == "Moderate": classificationRgb = "104,211,145" elif classification == "No Known Disease Relationship": classificationRgb = "113,128,150" elif classification == "Refuted Evidence": classificationRgb = "155,44,44" elif classification == "Strong": classificationRgb = "39,103,73" elif classification == "Supportive": classificationRgb = "99,179,237" else: print("There was an unknown classification: "+classification) + classificationRgb = "0,0,0" return(classificationRgb) -def buildFileHg38(genCCfile,outPutFile): - inputGenCCFile = open(genCCfile,'r',encoding="utf-8") - outputHg38File = open(outPutFile,'w',encoding='utf-8') +def writeHg38Line(outputFile, chrom, chromStart, chromEnd, genCCname, strand, + classificationRgb, ensTranscript, ensGene, refSeqAccession, line): + """Write a single hg38 bed line, validating coordinates first.""" + if int(chromStart) > int(chromEnd): + print("Skipping record with inverted coordinates: "+genCCname+" "+chrom+":"+chromStart+"-"+chromEnd) + return(False) + # Sanitize fields: replace embedded newlines and tabs with spaces for bed format compatibility + sanitized = [field.replace("\n", " ").replace("\r", " ").replace("\t", " ") for field in line] + outputFile.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname, + strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(sanitized))) + return(True) + +def buildFileHg38(genCCfile, outPutFile, gencodeTable): n=0 maneGeneSymbolCoords = makeManeDic() - badItems = [] - for line in inputGenCCFile: - if line.startswith('"uuid'): + with open(genCCfile, newline='', encoding="utf-8") as inputGenCCFile, \ + open(outPutFile, 'w', encoding='utf-8') as outputHg38File: + reader = csv.reader(inputGenCCFile, delimiter='\t', quotechar='"') + header = next(reader) + for line in reader: + if len(line) != 31: + print("Skipping malformed row with "+str(len(line))+" fields: "+str(line[:3])) continue - else: - line = line.rstrip() - line = line.split("\t") - if len(line) == 30: - for each in range(len(line)): - line[each] = line[each][1:len(line[each])-1] - geneSymbol = line[2] -# genCCname = line[0] #Disease = 4 GENCC = 7 evidence = 8 - genCCname = line[2]+" "+line[5] - classificationRgb = assignRGBcolorByEvidenceClassification(line[8]) + geneSymbol = line[3] + genCCname = line[3]+" "+line[6] + classificationRgb = assignRGBcolorByEvidenceClassification(line[9]) try: chrom = maneGeneSymbolCoords[geneSymbol]['chrom'] chromStart = maneGeneSymbolCoords[geneSymbol]['chromStart'] chromEnd = maneGeneSymbolCoords[geneSymbol]['chromEnd'] strand = maneGeneSymbolCoords[geneSymbol]['strand'] ensGene = maneGeneSymbolCoords[geneSymbol]['ensGene'] ensTranscript = maneGeneSymbolCoords[geneSymbol]['ensTranscript'] refSeqAccession = maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] - outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname, - strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(line))) - except: - try: - geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40','hg38') - chrom = geneDic['chrom'] - chromStart = geneDic['txStart'] - chromEnd = geneDic['txEnd'] - strand = geneDic['strand'] - ensGene = "" - ensTranscript = geneDic['ensTranscript'] - refSeqAccession = "" - outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname, - strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(line))) - except: + writeHg38Line(outputHg38File, chrom, chromStart, chromEnd, genCCname, + strand, classificationRgb, ensTranscript, ensGene, refSeqAccession, line) + except Exception: + # Fallback 1: GENCODE Comprehensive + geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg38') + if geneDic is not None: + wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'], + geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb, + geneDic['ensTranscript'], "", "", line) + if wrote: + continue + # Fallback 2: knownGene + kgXref + geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg38') + if geneDic is not None: + wrote = writeHg38Line(outputHg38File, geneDic['chrom'], geneDic['txStart'], + geneDic['txEnd'], genCCname, geneDic['strand'], classificationRgb, + geneDic['ensTranscript'], "", "", line) + if wrote: + continue n+=1 - else: - badItems.append(line) + print("No hg38 match for: "+geneSymbol) - inputGenCCFile.close() - outputHg38File.close() - print(str(len(badItems))+" lines were skipped in the file because they have incorrect formatting. This is usually because they had newline and tab characters in the info column. Usually fewer than 60 items are skipped this way. If the number is much greater, verify the script and update the estimate. The lines are printed below.\n\n") - print("Note: This part of the code was commented out to reduce spam. Uncomment to see the lines.") - #for item in badItems: - # print("\t".join(item).encode('utf-8')) - print("\n\nhg38 genCC bed file completed. Total number of failed entries: "+str(n)) + print("\nhg38 genCC bed file completed. Total number of failed entries: "+str(n)) -def buildFileHg19(genCCfile,outPutFile): - hg38GenCCbedFile = open(genCCfile,'r',encoding="utf-8") - outputHg19File = open(outPutFile,'w',encoding='utf-8') +def buildFileHg19(genCCfile, outPutFile, gencodeTable): n=0 + skippedCoords=0 + with open(genCCfile, 'r', encoding="utf-8") as hg38GenCCbedFile, \ + open(outPutFile, 'w', encoding='utf-8') as outputHg19File: for line in hg38GenCCbedFile: line = line.rstrip() line = line.split("\t") - geneSymbol = line[14] + geneSymbol = line[15] nmAccession = line[11] + geneDic = None if nmAccession != "": - try: geneDic = fetchGeneInfoHg19(nmAccession, 'ncbiRefSeq', 'hg19') - chrom = geneDic['chrom'] - chromStart = geneDic['txStart'] - chromEnd = geneDic['txEnd'] - strand = geneDic['strand'] - outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd, - "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:]))) - - except: - try: - geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40lift37','hg19') - chrom = geneDic['chrom'] - chromStart = geneDic['txStart'] - chromEnd = geneDic['txEnd'] - strand = geneDic['strand'] - outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd, - "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:]))) - - except: + if geneDic is None: + geneDic = fetchGeneInfo(geneSymbol, gencodeTable, 'hg19') + if geneDic is None: + geneDic = fetchGeneInfoKnownGene(geneSymbol, 'hg19') + if geneDic is None: n+=1 - print("No match for: "+geneSymbol) - else: - try: - geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40lift37','hg19') + print("No hg19 match for: "+geneSymbol) + continue chrom = geneDic['chrom'] chromStart = geneDic['txStart'] chromEnd = geneDic['txEnd'] - strand = geneDic['strand'] + # Validate coordinates before writing + if int(chromStart) > int(chromEnd): + skippedCoords+=1 + print("Skipping hg19 record with inverted coordinates: "+geneSymbol+" "+chrom+":"+chromStart+"-"+chromEnd) + continue outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd, "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:]))) - except: - n+=1 - print("No match for: "+geneSymbol) - hg38GenCCbedFile.close() - outputHg19File.close() + print("hg19 genCC bed file completed. Total number of failed entries: "+str(n)) + if skippedCoords > 0: + print("Warning: "+str(skippedCoords)+" hg19 entries skipped due to inverted coordinates") def checkIfUpdateIsNeeded(): - bash("wget -q https://search.thegencc.org/download/action/submissions-export-tsv -O /hive/data/outside/otto/genCC/newSubmission.tsv") + bash("wget -q 'https://thegencc.org/download/action/submissions-export-tsv?format=new' -O /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv") newMd5sum = newMd5sum.split(" ")[0] oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv") oldMd5sum = oldMd5sum.split(" ")[0] if oldMd5sum != newMd5sum: return(True) else: return(False) if checkIfUpdateIsNeeded(): date = str(datetime.now()).split(" ")[0] workDir = "/hive/data/outside/otto/genCC/"+date bash("mkdir -p "+workDir) hg19outPutFile = workDir+"/hg19genCC.bed" hg38outPutFile = workDir+"/hg38genCC.bed" bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir) genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv" - buildFileHg38(genCCtsvFile,hg38outPutFile) - buildFileHg19(hg38outPutFile,hg19outPutFile) + # Auto-detect latest GENCODE Comprehensive tables + gencodeHg38 = getLatestGencodeTable('hg38', 'wgEncodeGencodeComp%') + gencodeHg19 = getLatestGencodeTable('hg19', 'wgEncodeGencodeComp%lift37') + + buildFileHg38(genCCtsvFile, hg38outPutFile, gencodeHg38) + buildFileHg19(hg38outPutFile, hg19outPutFile, gencodeHg19) bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv") bash("bedSort "+hg38outPutFile+" "+hg38outPutFile) - bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+33 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb") + bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb") print("Final file created: "+hg38outPutFile.split(".")[0]+".bb") bash("bedSort "+hg19outPutFile+" "+hg19outPutFile) - bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+33 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb") + bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -sort -extraIndex=gene_symbol -type=bed9+34 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb") print("Final file created: "+hg19outPutFile.split(".")[0]+".bb") bash("rm -f /gbdb/hg38/bbi/genCC.bb") bash("rm -f /gbdb/hg19/bbi/genCC.bb") bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb") bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb") print("GenCC updated.")