src/hg/utils/otto/genCC/doGenCC.py 76d933ccf24bd1a2a360f9b6a437c2e8445c7583

76d933ccf24bd1a2a360f9b6a437c2e8445c7583
lrnassar
  Wed Oct 18 11:01:51 2023 -0700
Fixing up a small encoding bug in the GenCC otto job and clarification of the printed message. Refs #31795

diff --git src/hg/utils/otto/genCC/doGenCC.py src/hg/utils/otto/genCC/doGenCC.py
index eafee9e..a099d72 100644
--- src/hg/utils/otto/genCC/doGenCC.py
+++ src/hg/utils/otto/genCC/doGenCC.py
@@ -1,254 +1,254 @@
 #Made otto by Lou 9/14/2023
 
 import subprocess
 from datetime import datetime
 
 def bash(cmd):
     """Run the cmd in bash subprocess"""
     try:
         rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
                                        stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
         bashStdoutt = rawBashOutput.stdout
     except subprocess.CalledProcessError as e:
         raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
     return(bashStdoutt)
 
 def fetchGeneInfo(geneSymbol,table,dbs):
     """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene"""
     buildGene = {}
     finalGeneOutput = {}
     buildGene['txStart'] = []
     buildGene['txEnd'] = []
     GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name2='"""+geneSymbol+"""'" """+dbs)
     GENCODEoutput = GENCODEoutput.rstrip().split("\n")
     try:
         for geneEntry in GENCODEoutput:
             if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry):
                 geneEntry = geneEntry.split("\t")
                 if len(GENCODEoutput) == 1:
                     finalGeneOutput['ensTranscript'] = geneEntry[0]
                 else:
                     finalGeneOutput['ensTranscript'] = ""
                 buildGene['chrom'] = geneEntry[1]
                 buildGene['strand'] = geneEntry[2]
                 buildGene['txStart'].append(geneEntry[3])
                 buildGene['txEnd'].append(geneEntry[4])
         finalGeneOutput['chrom'] = buildGene['chrom']
         finalGeneOutput['strand'] = buildGene['strand']
         finalGeneOutput['txStart'] = min(buildGene['txStart'])
         finalGeneOutput['txEnd'] = max(buildGene['txEnd'])
         return(finalGeneOutput)
     except:
         print("Following gene symbol was not found:"+ geneSymbol)
 
 def fetchGeneInfoHg19(nmAccession,table,dbs):
     """Performs mysql search on ncbiRefSeq table and retrieves largest txStart/txEnd for entire gene"""
     buildGene = {}
     finalGeneOutput = {}
     buildGene['txStart'] = []
     buildGene['txEnd'] = []
     GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from """+table+""" where name='"""+nmAccession+"""'" """+dbs)
     GENCODEoutput = GENCODEoutput.rstrip().split("\n")
     try:
         for geneEntry in GENCODEoutput:
             if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry):
                 geneEntry = geneEntry.split("\t")
                 if len(GENCODEoutput) == 1:
                     finalGeneOutput['ensTranscript'] = geneEntry[0]
                 else:
                     finalGeneOutput['ensTranscript'] = ""
                 buildGene['chrom'] = geneEntry[1]
                 buildGene['strand'] = geneEntry[2]
                 buildGene['txStart'].append(geneEntry[3])
                 buildGene['txEnd'].append(geneEntry[4])
         finalGeneOutput['chrom'] = buildGene['chrom']
         finalGeneOutput['strand'] = buildGene['strand']
         finalGeneOutput['txStart'] = min(buildGene['txStart'])
         finalGeneOutput['txEnd'] = max(buildGene['txEnd'])
         return(finalGeneOutput)
     except:
         print("Following refSeq accession was not found:"+ nmAccession)
 
 def makeManeDic():
     """Iterates through latest mane  file and builds dictionary out of gene symbols and values"""
     bash("bigBedToBed /gbdb/hg38/mane/mane.bb /hive/data/outside/otto/genCC/mane.bed")
     openFile = open("/hive/data/outside/otto/genCC/mane.bed",'r')
     maneGeneSymbolCoords = {}
     for line in openFile:
         line = line.rstrip()
         line = line.split("\t")
         geneSymbol = line[18]
         if geneSymbol not in maneGeneSymbolCoords.keys():
             maneGeneSymbolCoords[geneSymbol] = {}
             maneGeneSymbolCoords[geneSymbol]['chrom'] = line[0]
             maneGeneSymbolCoords[geneSymbol]['chromStart'] = line[1]
             maneGeneSymbolCoords[geneSymbol]['chromEnd'] = line[2]
             maneGeneSymbolCoords[geneSymbol]['strand'] = line[5]
             maneGeneSymbolCoords[geneSymbol]['ensGene'] = line[17]
             maneGeneSymbolCoords[geneSymbol]['ensTranscript'] = line[3]
             maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] = line[21]
     openFile.close()
     bash("rm -f /hive/data/outside/otto/genCC/mane.bed")
     return(maneGeneSymbolCoords)
 
 def assignRGBcolorByEvidenceClassification(classification):
     if classification == "Definitive":
         classificationRgb = "39,103,73"
     elif classification == "Disputed Evidence":
         classificationRgb = "229,62,62"
     elif classification == "Limited":
         classificationRgb = "252,129,129"
     elif classification == "Moderate":
         classificationRgb = "104,211,145"
     elif classification == "No Known Disease Relationship":
         classificationRgb = "113,128,150"
     elif classification == "Refuted Evidence":
         classificationRgb = "155,44,44"
     elif classification == "Strong":
         classificationRgb = "56,161,105"
     elif classification == "Supportive":
         classificationRgb = "99,179,237"
     else:
         print("There was an unknown classification: "+classification)
     return(classificationRgb)
 
 def buildFileHg38(genCCfile,outPutFile):
     inputGenCCFile = open(genCCfile,'r',encoding="utf-8")
     outputHg38File = open(outPutFile,'w',encoding='utf-8')
     n=0
     maneGeneSymbolCoords = makeManeDic()
     badItems = []
     for line in inputGenCCFile:
         if line.startswith('"uuid'):
             continue
         else:
             line = line.rstrip()
             line = line.split("\t")
             if len(line) == 30:            
                 for each in range(len(line)):
                     line[each] = line[each][1:len(line[each])-1]
                 geneSymbol = line[2]
 #             genCCname = line[0] #Disease = 4 GENCC = 7 evidence = 8
                 genCCname = line[2]+" "+line[5]
                 classificationRgb = assignRGBcolorByEvidenceClassification(line[8])
                 try:
                     chrom = maneGeneSymbolCoords[geneSymbol]['chrom']
                     chromStart = maneGeneSymbolCoords[geneSymbol]['chromStart']
                     chromEnd = maneGeneSymbolCoords[geneSymbol]['chromEnd']
                     strand = maneGeneSymbolCoords[geneSymbol]['strand']
                     ensGene = maneGeneSymbolCoords[geneSymbol]['ensGene']
                     ensTranscript = maneGeneSymbolCoords[geneSymbol]['ensTranscript']
                     refSeqAccession = maneGeneSymbolCoords[geneSymbol]['refSeqAccession']
                     outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname,
                         strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(line)))
                 except:
                     try:
                         geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40','hg38')
                         chrom = geneDic['chrom']
                         chromStart = geneDic['txStart']
                         chromEnd = geneDic['txEnd']
                         strand = geneDic['strand']
                         ensGene = ""
                         ensTranscript = geneDic['ensTranscript']
                         refSeqAccession = ""
                         outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,genCCname,
                             strand,chromStart,chromEnd,classificationRgb,ensTranscript,ensGene,refSeqAccession,"\t".join(line)))
                     except:
                         n+=1
             else:
                 badItems.append(line)
 
     inputGenCCFile.close()
     outputHg38File.close()
-    print(str(len(badItems))+" lines were skipped in the file because they did not they have incorrect formatting. This is usually because they had newline and tab characters in the info column. Usually fewer than 60 items are skipped this way.\n\n")
+    print(str(len(badItems))+" lines were skipped in the file because they have incorrect formatting. This is usually because they had newline and tab characters in the info column. Usually fewer than 60 items are skipped this way. If the number is much greater, verify the script and update the estimate. The lines are printed below.\n\n")
     for item in badItems:
-        print(item)
+        print("\t".join(item).encode('utf-8'))
     print("\n\nhg38 genCC bed file completed. Total number of failed entries: "+str(n))
 
 def buildFileHg19(genCCfile,outPutFile):
     hg38GenCCbedFile = open(genCCfile,'r',encoding="utf-8")
     outputHg19File = open(outPutFile,'w',encoding='utf-8')
     n=0
     for line in hg38GenCCbedFile:
         line = line.rstrip()
         line = line.split("\t")
         geneSymbol = line[14]
         nmAccession = line[11]
         if nmAccession != "":
             try:
                 geneDic = fetchGeneInfoHg19(nmAccession,'ncbiRefSeq','hg19')
                 chrom = geneDic['chrom']
                 chromStart = geneDic['txStart']
                 chromEnd = geneDic['txEnd']
                 strand = geneDic['strand']
                 outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,
                                         "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:])))
 
             except:
                 try:
                     geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40lift37','hg19')
                     chrom = geneDic['chrom']
                     chromStart = geneDic['txStart']
                     chromEnd = geneDic['txEnd']
                     strand = geneDic['strand']
                     outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,
                                         "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:])))
 
                 except:
                     n+=1
                     print("No match for: "+geneSymbol)
         else:
             try:
                 geneDic = fetchGeneInfo(geneSymbol,'wgEncodeGencodeCompV40lift37','hg19')
                 chrom = geneDic['chrom']
                 chromStart = geneDic['txStart']
                 chromEnd = geneDic['txEnd']
                 strand = geneDic['strand']
                 outputHg19File.write("%s\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,
                                         "\t".join(line[3:6]),chromStart,chromEnd,"\t".join(line[8:])))
             except:
                 n+=1
                 print("No match for: "+geneSymbol)
     hg38GenCCbedFile.close()
     outputHg19File.close()
     print("hg19 genCC bed file completed. Total number of failed entries: "+str(n))
 
 def checkIfUpdateIsNeeded():
     bash("wget -q https://search.thegencc.org/download/action/submissions-export-tsv -O /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum = bash("md5sum /hive/data/outside/otto/genCC/newSubmission.tsv")
     newMd5sum.split("  ")[0]
     oldMd5sum = bash("md5sum /hive/data/outside/otto/genCC/prevSubmission.tsv")
     oldMd5sum.split("  ")[0]
     if oldMd5sum != newMd5sum:
         return(True)
     else:
         return(False)
     
 if checkIfUpdateIsNeeded():
     date = str(datetime.now()).split(" ")[0]
     workDir = "/hive/data/outside/otto/genCC/"+date
     bash("mkdir -p "+workDir)
     hg19outPutFile = workDir+"/hg19genCC.bed"
     hg38outPutFile = workDir+"/hg38genCC.bed"
     bash("cp /hive/data/outside/otto/genCC/newSubmission.tsv "+workDir)
     genCCtsvFile = "/hive/data/outside/otto/genCC/newSubmission.tsv"
          
     buildFileHg38(genCCtsvFile,hg38outPutFile)
     buildFileHg19(hg38outPutFile,hg19outPutFile)
     bash("mv /hive/data/outside/otto/genCC/newSubmission.tsv /hive/data/outside/otto/genCC/prevSubmission.tsv")
 
     bash("bedSort "+hg38outPutFile+" "+hg38outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -extraIndex=gene_symbol -type=bed9+33 -tab "+hg38outPutFile+" /cluster/data/hg38/chrom.sizes "+hg38outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg38outPutFile.split(".")[0]+".bb")
 
     bash("bedSort "+hg19outPutFile+" "+hg19outPutFile)
     bash("bedToBigBed -as=/hive/data/genomes/hg38/bed/genCC/genCC.as -extraIndex=gene_symbol -type=bed9+33 -tab "+hg19outPutFile+" /cluster/data/hg19/chrom.sizes "+hg19outPutFile.split(".")[0]+".bb")
     print("Final file created: "+hg19outPutFile.split(".")[0]+".bb")
          
     bash("rm -f /gbdb/hg38/bbi/genCC.bb")
     bash("rm -f /gbdb/hg19/bbi/genCC.bb")
          
     bash("ln -s "+hg38outPutFile.split(".")[0]+".bb /gbdb/hg38/bbi/genCC.bb")
     bash("ln -s "+hg19outPutFile.split(".")[0]+".bb /gbdb/hg19/bbi/genCC.bb")
          
     print("GenCC updated.")