src/hg/makeDb/scripts/buildDosageSensitivityCollins.py efbc896ec61ccbd94910a56f5baae6d35f93cd19

efbc896ec61ccbd94910a56f5baae6d35f93cd19
lrnassar
  Wed Dec 20 10:40:28 2023 -0800
Staging QA Ready the dosage sensitivity track from Collins et al 2022 for hg38 and hg19. Refs #31991

diff --git src/hg/makeDb/scripts/buildDosageSensitivityCollins.py src/hg/makeDb/scripts/buildDosageSensitivityCollins.py
new file mode 100644
index 0000000..1716ea2
--- /dev/null
+++ src/hg/makeDb/scripts/buildDosageSensitivityCollins.py
@@ -0,0 +1,270 @@
+import subprocess
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+def splitFiles(pathToFile,dbs):
+    originalFile = open(pathToFile,'r')
+    outputFileHaplo = open("/hive/data/outside/collinsHaploTriploScores/pHaploDosageSensitivity"+dbs+".bed",'w')
+    outputFileTriplo = open("/hive/data/outside/collinsHaploTriploScores/pTriploDosageSensitivity"+dbs+".bed",'w')
+    
+    for line in originalFile:
+        line = line.rstrip()
+        line = line.split("\t")
+        pHaplo = round(float(line[10]),2)
+        pTriplo = round(float(line[11]),2)
+        #Color according to paper: https://europepmc.org/article/MED/35917817
+        if pHaplo >= 0.86:
+            pHaploItemColor = "181,2,14"
+        else:
+            pHaploItemColor = "250,42,27"
+        if pTriplo >= 0.94:
+            pTriploItemColor = "0,9,138"
+        else:
+            pTriploItemColor = "87,92,252"
+        outputFileHaplo.write("\t".join(line[0:8])+"\t"+pHaploItemColor+"\t"+line[9]+"\t"+str(pHaplo)+"\n")
+        outputFileTriplo.write("\t".join(line[0:8])+"\t"+pTriploItemColor+"\t"+line[9]+"\t"+str(pTriplo)+"\n")
+    originalFile.close()
+    outputFileHaplo.close()
+    outputFileTriplo.close()
+    
+def fetchGeneInfo(geneSymbol,ensVersion,dbs):
+    """Performs mysql search on GENCODE table and retrieves largest txStart/txEnd for entire gene"""
+    buildGene = {}
+    finalGeneOutput = {}
+    buildGene['txStart'] = []
+    buildGene['txEnd'] = [] 
+    GENCODEoutput = bash("""hgsql -Ne "select name,chrom,strand,txStart,txEnd from wgEncodeGencodeComp"""+ensVersion+""" where name2='"""+geneSymbol+"""'" """+dbs)
+    GENCODEoutput = GENCODEoutput.rstrip().split("\n")
+    try:
+        for geneEntry in GENCODEoutput:
+            if "fix" not in str(geneEntry) and "alt" not in str(geneEntry) and "hap" not in str(geneEntry) and "_gl" not in str(geneEntry):
+                geneEntry = geneEntry.split("\t")
+                finalGeneOutput['ensTranscript'] = geneEntry[0]
+                
+                #Look for ENSTs across difference chromosomes, like HIST1H4J
+                try:
+                    if buildGene['chrom'] != geneEntry[1]:
+                        print("Multi-chromosome gene: "+geneSymbol)
+                except:
+                    pass
+                
+                #Make special exception for HIST1H4J on hg38 that has transcripts on chr1 and chr6
+                if geneSymbol == "HIST1H4J" and dbs == "hg38":
+                    buildGene['chrom'] = "chr1"
+                    buildGene['strand'] = "+"
+                    buildGene['txStart'].append(int(149832658))
+                    buildGene['txEnd'].append(int(149839767))
+                else:
+                    buildGene['chrom'] = geneEntry[1]
+                    buildGene['strand'] = geneEntry[2]
+                    buildGene['txStart'].append(int(geneEntry[3]))
+                    buildGene['txEnd'].append(int(geneEntry[4]))
+                    
+        finalGeneOutput['chrom'] = buildGene['chrom']
+        finalGeneOutput['strand'] = buildGene['strand']
+        finalGeneOutput['txStart'] = str(min(buildGene['txStart']))
+        finalGeneOutput['txEnd'] = str(max(buildGene['txEnd']))
+        #try to get the ENSGid
+        try:
+            ensgFetch = bash("""hgsql -Ne "select geneId from wgEncodeGencodeAttrs"""+ensVersion+""" where transcriptId='"""+finalGeneOutput['ensTranscript']+"""'" """+dbs)
+            finalGeneOutput['ensgID'] = ensgFetch.rstrip()
+        except:
+            print("Error with table. Look at wgEncodeGencodeAttrs"+ensVersion)
+            
+        return(finalGeneOutput)
+    except:
+        pass
+
+def makeManeDic():
+    """Iterates through latest mane file and builds dictionary out of gene symbols and values"""
+    bash("bigBedToBed /gbdb/hg38/mane/mane.bb /hive/data/outside/collinsHaploTriploScores/mane.bed")
+    openFile = open("/hive/data/outside/collinsHaploTriploScores/mane.bed",'r')
+    maneGeneSymbolCoords = {}
+    for line in openFile:
+        line = line.rstrip()
+        line = line.split("\t")
+        geneSymbol = line[18]
+        if geneSymbol not in maneGeneSymbolCoords.keys():
+            maneGeneSymbolCoords[geneSymbol] = {}
+            maneGeneSymbolCoords[geneSymbol]['chrom'] = line[0]
+            maneGeneSymbolCoords[geneSymbol]['chromStart'] = line[1]
+            maneGeneSymbolCoords[geneSymbol]['chromEnd'] = line[2]
+            maneGeneSymbolCoords[geneSymbol]['strand'] = line[5]
+            maneGeneSymbolCoords[geneSymbol]['ensGene'] = line[17]
+            maneGeneSymbolCoords[geneSymbol]['ensTranscript'] = line[3]
+            maneGeneSymbolCoords[geneSymbol]['refSeqAccession'] = line[21]
+    openFile.close()
+    bash("rm -f /hive/data/outside/collinsHaploTriploScores/mane.bed")
+    return(maneGeneSymbolCoords)
+
+def buildFileHg38(dosageScoresFile,outPutFile,missingGenesFromHg38):
+    inputDosageScoresFile = open(dosageScoresFile,'r')
+    outputHg38File = open(outPutFile,'w')
+    missingGenesFromHg38 = open(missingGenesFromHg38, 'w')
+    n=0
+    problem = 0
+    maneGeneSymbolCoords = makeManeDic()
+    badItems = []
+    missingGenes = []
+    for line in inputDosageScoresFile:
+        if n % 1000 == 0:
+            print("Proccessing line "+str(n)+" of file.")
+        n+=1
+        if line.startswith('#'):
+            continue
+        else:
+            line = line.rstrip()
+            line = line.split("\t")
+            if len(line) == 3:            
+                geneSymbol = line[0]
+                classificationRgb = "252,129,129"
+                try:
+                    chrom = maneGeneSymbolCoords[geneSymbol]['chrom']
+                    chromStart = maneGeneSymbolCoords[geneSymbol]['chromStart']
+                    chromEnd = maneGeneSymbolCoords[geneSymbol]['chromEnd']
+                    strand = maneGeneSymbolCoords[geneSymbol]['strand']
+                    ensGene = maneGeneSymbolCoords[geneSymbol]['ensGene']
+                    outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,geneSymbol,
+                        strand,chromStart,chromEnd,classificationRgb,ensGene,"\t".join(line[1:])))
+                except:
+                    try:
+                        geneDic = fetchGeneInfo(geneSymbol,'V45','hg38')
+                        chrom = geneDic['chrom']
+                        chromStart = geneDic['txStart']
+                        chromEnd = geneDic['txEnd']
+                        strand = geneDic['strand']
+                        ensGene = geneDic['ensgID']
+                        outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,geneSymbol,
+                            strand,chromStart,chromEnd,classificationRgb,ensGene,"\t".join(line[1:])))
+                    except:
+                        try:
+                            geneDic = fetchGeneInfo(geneSymbol,'V20','hg38')
+                            chrom = geneDic['chrom']
+                            chromStart = geneDic['txStart']
+                            chromEnd = geneDic['txEnd']
+                            strand = geneDic['strand']
+                            ensGene = geneDic['ensgID']
+                            outputHg38File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,geneSymbol,
+                                strand,chromStart,chromEnd,classificationRgb,ensGene,"\t".join(line[1:])))
+                
+                        except:      
+                            missingGenes.append(geneSymbol)
+                            problem+=1
+            else:
+                badItems.append(line)
+
+    inputDosageScoresFile.close()
+    outputHg38File.close()
+    if badItems != []:
+        print(str(len(badItems))+" bad lines were found in the file, see below:\n\n")
+        for item in badItems:
+            print(item)
+    for item in missingGenes:
+        missingGenesFromHg38.write(item+"\n")
+    missingGenesFromHg38.close()
+    print("\n\nhg38 bed file completed. See file missingHg38GenesList.txt. Total number of failed entries: "+str(problem))
+    print("\n\nhg38 bed file completed. Total number of failed entries: "+str(problem))    
+
+def buildFileHg19(dosageScoresFile,outPutFile):
+    inputDosageScoresFile = open(dosageScoresFile,'r')
+    outputHg19File = open(outPutFile,'w')
+    n=0
+    problem=0
+    badItems = []
+    for line in inputDosageScoresFile:
+        if n % 1000 == 0:
+            print("Proccessing line "+str(n)+" of file.")
+        n+=1
+        if line.startswith('#'):
+            continue
+        else:
+            line = line.rstrip()
+            line = line.split("\t")
+            if len(line) == 3:            
+                geneSymbol = line[0]
+                pHaplo = float(line[1])
+                pTriplo = float(line[2])
+                classificationRgb = "252,129,129"
+                try:
+                    geneDic = fetchGeneInfo(geneSymbol,'V19','hg19')
+                    chrom = geneDic['chrom']
+                    chromStart = geneDic['txStart']
+                    chromEnd = geneDic['txEnd']
+                    strand = geneDic['strand']
+                    ensTranscript = geneDic['ensTranscript']
+                    ensgID = geneDic['ensgID']
+                    outputHg19File.write("%s\t%s\t%s\t%s\t0\t%s\t%s\t%s\t%s\t%s\t%s\n" % (chrom,chromStart,chromEnd,geneSymbol,
+                            strand,chromStart,chromEnd,classificationRgb,ensgID,"\t".join(line[1:])))                
+                except:
+                    print("Problem entry.")
+                    problem+=1
+            else:
+                badItems.append(line)
+
+    inputDosageScoresFile.close()
+    outputHg19File.close()
+    if badItems != []:
+        print(str(len(badItems))+" bad lines were found in the file, see below:\n\n")
+        for item in badItems:
+            print(item)
+    if problem != 0:
+        print("\n\nhg19 genCC bed file completed. Total number of mismapped genes:: "+str(problem))
+
+#Start building track
+workDir = "/hive/data/outside/collinsHaploTriploScores"
+bash("mkdir -p "+workDir)
+hg19outPutFile = workDir+"/hg19dosageScores.bed"
+hg38outPutFile = workDir+"/hg38dosageScores.bed"
+missingGenesFromHg38 = workDir+"/missingHg38GenesList.txt"
+dosageScoresTsvFile = "/hive/data/outside/collinsHaploTriploScores/rCNV.gene_scores.tsv"
+
+buildFileHg19(dosageScoresTsvFile,hg19outPutFile)
+buildFileHg38(dosageScoresTsvFile,hg38outPutFile,missingGenesFromHg38)
+
+#build file of missing genes and liftOver the missing hg38 genes from hg19
+bash("cat "+hg19outPutFile+" | grep -w -f /hive/data/outside/collinsHaploTriploScores/missingHg38GenesList.txt > /hive/data/outside/collinsHaploTriploScores/finalMissingHg38Genes.bed")
+bash("liftOver -bedPlus=9 -tab /hive/data/outside/collinsHaploTriploScores/finalMissingHg38Genes.bed /hive/data/outside/collinsHaploTriploScores/hg19ToHg38.over.chain /hive/data/outside/collinsHaploTriploScores/hg38missingGenes.bed /hive/data/outside/collinsHaploTriploScores/unmapped.bed")
+finalGeneCountNotMappedToHg38 = bash("wc -l /hive/data/outside/collinsHaploTriploScores/unmapped.bed")
+print("The total number of files that were not able to be mapped by any means to hg38 were "+str(int(finalGeneCountNotMappedToHg38.split(" ")[0])/2))
+print("Most of these regions were split in hg38, see unmapped file for details: /hive/data/outside/collinsHaploTriploScores/unmapped.bed")
+
+#Add the lifted entries to the hg38 file
+hg38outPutFile = workDir+"/hg38dosageScoresFinal.bed"
+bash("cat /hive/data/outside/collinsHaploTriploScores/hg38dosageScores.bed > "+hg38outPutFile)
+bash("cat /hive/data/outside/collinsHaploTriploScores/hg38missingGenes.bed >> "+hg38outPutFile)
+
+#Sort the files
+bash("bedSort "+hg38outPutFile+" "+hg38outPutFile)
+bash("bedSort "+hg19outPutFile+" "+hg19outPutFile)
+
+#Split the files into two separate tracks for each score and add color for items
+splitFiles(hg38outPutFile,"Hg38")
+splitFiles(hg19outPutFile,"Hg19")
+
+#Create hg38 bb file and symlink
+hg38pHaploFinalFile = "/hive/data/outside/collinsHaploTriploScores/pHaploDosageSensitivityHg38.bed"
+hg38pTriploFinalFile = "/hive/data/outside/collinsHaploTriploScores/pTriploDosageSensitivityHg38.bed"
+bash("bedToBigBed -as=/hive/data/outside/collinsHaploTriploScores/dosageSensitivityHaplo.as -extraIndex=name -type=bed9+2 -tab "+hg38pHaploFinalFile+" /cluster/data/hg38/chrom.sizes "+hg38pHaploFinalFile.split(".")[0]+".bb")
+bash("bedToBigBed -as=/hive/data/outside/collinsHaploTriploScores/dosageSensitivityTriplo.as -extraIndex=name -type=bed9+2 -tab "+hg38pTriploFinalFile+" /cluster/data/hg38/chrom.sizes "+hg38pTriploFinalFile.split(".")[0]+".bb")
+
+bash("ln -s "+hg38pHaploFinalFile.split(".")[0]+".bb /gbdb/hg38/bbi/dosageSensitivityCollins2022/pHaploDosageSensitivity.bb")
+bash("ln -s "+hg38pTriploFinalFile.split(".")[0]+".bb /gbdb/hg38/bbi/dosageSensitivityCollins2022/pTriploDosageSensitivity.bb")
+
+#Create hg19 bb file and symlink
+hg19pHaploFinalFile = "/hive/data/outside/collinsHaploTriploScores/pHaploDosageSensitivityHg19.bed"
+hg19pTriploFinalFile = "/hive/data/outside/collinsHaploTriploScores/pTriploDosageSensitivityHg19.bed"
+bash("bedToBigBed -as=/hive/data/outside/collinsHaploTriploScores/dosageSensitivityHaplo.as -extraIndex=name -type=bed9+2 -tab "+hg19pHaploFinalFile+" /cluster/data/hg19/chrom.sizes "+hg19pHaploFinalFile.split(".")[0]+".bb")
+bash("bedToBigBed -as=/hive/data/outside/collinsHaploTriploScores/dosageSensitivityTriplo.as -extraIndex=name -type=bed9+2 -tab "+hg19pTriploFinalFile+" /cluster/data/hg19/chrom.sizes "+hg19pTriploFinalFile.split(".")[0]+".bb")
+
+bash("ln -s "+hg19pHaploFinalFile.split(".")[0]+".bb /gbdb/hg19/bbi/dosageSensitivityCollins2022/pHaploDosageSensitivity.bb")
+bash("ln -s "+hg19pTriploFinalFile.split(".")[0]+".bb /gbdb/hg19/bbi/dosageSensitivityCollins2022/pTriploDosageSensitivity.bb")
+
+print("DosageSensitivity track complete.")