e7bfc71ae21d7da0b0b5001c651d5ba1586e99c6
lrnassar
  Wed Jun 29 18:44:29 2022 -0700
Creating metaDome track, refs #23883

diff --git src/hg/makeDb/scripts/metaDome/buildMetaDomeTracks.py src/hg/makeDb/scripts/metaDome/buildMetaDomeTracks.py
new file mode 100644
index 0000000..b9b7c7f
--- /dev/null
+++ src/hg/makeDb/scripts/metaDome/buildMetaDomeTracks.py
@@ -0,0 +1,107 @@
+import subprocess
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+#Iterate through file and create dictionary combination of chrom+chromEnd with a list of data values
+#Reason for this is that a few (~.42%) of items have overlap. We want to take the smallest possible score
+#So as to have the highest sensitivity
+def parseFileBuildDic(metaDomeFile):
+    n=0
+    chrom=""
+    metaDome = {}
+    for line in metaDomeFile:
+    #chr1    69091   69093   +       sw_dn_ds:0.16666666666666669,sw_coverage:0.5238095238095238
+        if line.startswith("chrom"):
+            continue
+        else:
+            line = line.rstrip()
+            line = line.split("\t")
+        
+            if line[0] != chrom:
+                chrom = line[0]
+                chromStart = str(int(line[1])-1)
+                chromEnd = line[2]
+                dataValue = round(float(line[4].split(':')[1].split(",")[0]),2)
+                name = chrom+"\t"+chromEnd
+                if name not in metaDome.keys():
+                    metaDome[name]={}
+                    metaDome[name]['name']=chrom+"\t"+chromStart+"\t"+chromEnd
+                    metaDome[name]['dataValue']=[]
+                metaDome[name]['dataValue'].append(dataValue)
+            elif int(line[1]) <= int(chromEnd):
+                metaDome[name]['dataValue'].append(dataValue)
+                n+=1
+            else:
+                chrom = line[0]
+                chromStart = str(int(line[1])-1)
+                chromEnd = line[2]
+                dataValue = round(float(line[4].split(':')[1].split(",")[0]),2)
+                name = chrom+"\t"+chromEnd
+                if name not in metaDome.keys():
+                    metaDome[name]={}
+                    metaDome[name]['name']=chrom+"\t"+chromStart+"\t"+chromEnd
+                    metaDome[name]['dataValue']=[]
+                metaDome[name]['dataValue'].append(dataValue)
+    print("Total number of items in track: "+str(n))
+    return(metaDome)
+        
+
+# Iterate dictionary and write out results file
+def writeOutFile(metaDome,resultsFile):
+    for key, value in metaDome.items():
+        resultsFile.write(metaDome[key]['name']+"\t"+str(min(metaDome[key]['dataValue']))+"\n")
+
+#Create files
+def createBigWig(outputFile,bigWigFile):
+    bash("sort -k1,1 -k2,2n "+outputFile+" > "+outputFile+".sorted")
+    bash("bedGraphToBigWig "+outputFile+".sorted /hive/data/genomes/hg19/bed/metaDome/hg19.chrom.sizes "+bigWigFile)
+    
+def makeMetaDomeBigBed(metaDomeFile,bigBedOutputFile):
+    for line in metaDomeFile:
+    #chr1    69091   69093   +       sw_dn_ds:0.16666666666666669,sw_coverage:0.5238095238095238
+        if line.startswith("chrom"):
+            continue
+        else:
+            line = line.rstrip()
+            line = line.split("\t")
+            chrom = line[0]
+            chromStart = line[1]
+            chromEnd = line[2]
+            name = chrom+"-"+chromStart
+            strand = line[3]
+            toleranceScore = str(round(float(line[4].split(",")[0].split(":")[1]),3))
+            coverage = str(round(float(line[4].split(",")[1].split(",")[0].split(":")[1]),3))
+            #bed 4+3
+            bigBedOutputFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+name+"\t"+toleranceScore+"\t"+strand+"\t"+coverage+"\n")
+
+    bash("sort -k1,1 -k2,2n /hive/data/genomes/hg19/bed/metaDome/metaDome > /hive/data/genomes/hg19/bed/metaDome/metaDome.sorted.bed")
+    bash("bedToBigBed -type=bed4+3 -as=/hive/data/genomes/hg19/bed/metaDome/metaDome.as /hive/data/genomes/hg19/bed/metaDome/metaDome.sorted.bed /hive/data/genomes/hg19/bed/metaDome/hg19.chrom.sizes /hive/data/genomes/hg19/bed/metaDome/metaDome.bb")
+      
+
+inputFile = "/hive/data/genomes/hg19/bed/metaDome/MetaDome_v1.0.1_gnomAD_r2.0.2_dnds_sw_size_10_052022.sorted.bed"
+outputFile = "/hive/data/genomes/hg19/bed/metaDome/metaDome.bedGraph"
+bigWigFile = "/hive/data/genomes/hg19/bed/metaDome/metaDome.bw"
+bigBedOutputFile = open("/hive/data/genomes/hg19/bed/metaDome/metaDome","w")
+
+
+metaDomeFile = open(inputFile,'r')
+resultsFile = open(outputFile,'w')
+
+metaDome = parseFileBuildDic(metaDomeFile)
+writeOutFile(metaDome,resultsFile)
+makeMetaDomeBigBed(metaDomeFile,bigBedOutputFile)
+metaDomeFile.close()
+resultsFile.close()
+bigBedOutputFile.close()
+createBigWig(outputFile,bigWigFile)
+
+bash("ln -s /hive/data/genomes/hg19/bed/metaDome/metaDome.bw /gbdb/hg19/metaDome/")
+bash("ln -s /hive/data/genomes/hg19/bed/metaDome/metaDome.bb /gbdb/hg19/metaDome/")