57105e3e6e8c590537fedcc559cb0fac2812d56d
lrnassar
  Thu Feb 22 14:33:24 2024 -0800
Staging the ENIGMA tracks, refs #32919

diff --git src/hg/makeDb/scripts/enigma/BRCAsplicing.py src/hg/makeDb/scripts/enigma/BRCAsplicing.py
new file mode 100644
index 0000000..e0fcf02
--- /dev/null
+++ src/hg/makeDb/scripts/enigma/BRCAsplicing.py
@@ -0,0 +1,164 @@
+import re
+import subprocess
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+rawFilePath = "/hive/data/inside/enigmaTracksData/Anna_CSpec_BRCA12ACMG-Rules-Specifications_V1.1Table-4_2023-11-22.txt"
+
+def assignRGBcolor(lineToCheck):
+    if "DEL" in lineToCheck[4]:
+        itemRgb = '180,53,53' #Red
+    elif "DUP" in lineToCheck[4]:
+        itemRgb = '117,136,214' #Blue
+    elif "RNA" in lineToCheck[5]:
+        itemRgb = '146,64,190' #Black
+    else: 
+        itemRgb = '0,0,0'
+    return(itemRgb)
+
+rawFile = open(rawFilePath,'r')
+outputBedFile = open("/hive/data/inside/enigmaTracksData/outputBedFile.bed",'w', encoding='latin-1')
+for line in rawFile:
+    line = line.rstrip("\n").split("\t")
+    if line[0].startswith("Gene") or line[0]== "":
+        continue
+    elif line == "":
+        continue
+    else:
+#         print(line)
+        itemRgb = assignRGBcolor(line)
+        NMacc = line[1]
+        
+        if line[0] == "BRCA1":
+            strand = "-"
+            chrom = "chr17"
+        else:
+            strand = "+"
+            chrom = "chr13"
+        
+        if len(line[3].split("c")) > 2:
+            firstPos=line[3].split(".")[1].split("-c")[0]
+            secondPos=line[3].split("-c.")[1]
+            queryPosition = bash("curl https://hgwdev.gi.ucsc.edu/cgi-bin/hgSearch?search="+NMacc+"%3Ac"+firstPos)
+            for resultsLine in queryPosition.split("\n"):
+                if resultsLine.startswith("<script"):
+                    if strand == "-":
+                        chromEnd = str(int(resultsLine.split("position=")[1].split("-")[0].split(":")[1])+5)
+                    elif strand == "+":
+                        chromStart = str(int(resultsLine.split("position=")[1].split("-")[0].split(":")[1])+4)
+            
+            queryPosition = bash("curl https://hgwdev.gi.ucsc.edu/cgi-bin/hgSearch?search="+NMacc+"%3Ac"+secondPos)
+            for resultsLine in queryPosition.split("\n"):
+                if resultsLine.startswith("<script"):
+                    if strand == "-":
+                        chromStart = str(int(resultsLine.split("position=")[1].split("-")[0].split(":")[1])+4)
+                    elif strand == "+":
+                        chromEnd = str(int(resultsLine.split("position=")[1].split("-")[0].split(":")[1])+5)
+
+        else:
+            p = re.compile('[0-9]')
+
+            if "-" in line[3]: #c.-20-1G>
+                if len(line[3].split("-")) > 2:
+                    pos = "-"+line[3].split("-")[1]
+                    adjustment = p.findall(line[3].split("-")[2])
+                    adjustment = "-"+adjustment[0]
+                elif line[3].startswith("c.-"): #c.-20+2T>
+                    pos = "-"+line[3].split("-")[1].split("+")[0]
+                    adjustment = p.findall(line[3].split("+")[1])
+                    adjustment = adjustment[0]
+                else: #c.20-2T>
+                    pos = line[3].split("c.")[1].split("-")[0]
+                    adjustment = p.findall(line[3].split("-")[1])
+                    adjustment = "-"+adjustment[0]
+            
+            elif "+" in line[3]: #c.475+1G>
+                pos = line[3].split("c.")[1].split("+")[0]
+                adjustment = p.findall(line[3].split("+")[1])[0]
+            else: #c.1
+                pos = line[3].split('c.')[1]
+                adjustment = 0
+                
+            queryPosition = bash("curl https://hgwdev.gi.ucsc.edu/cgi-bin/hgSearch?search="+NMacc+"%3Ac"+pos)
+            for resultsLine in queryPosition.split("\n"):
+                if resultsLine.startswith("<script"):
+                    position = int(resultsLine.split("position=")[1].split("-")[0].split(":")[1])+5
+            if strand == "-":
+                position = int(position)-int(adjustment)
+            elif strand == "+":
+                position = int(position)+int(adjustment)
+            chromStart = str(position-1)
+            chromEnd = str(position)
+        
+        varType = line[4].replace('"','')
+        ACMGcode = line[5].replace('"','')
+        observation = line[6].replace("?","delta").replace('"','')
+        _mouseOver = "<b>Transcript:</b> "+line[1]+"<br><b>Exon:</b> "+line[2]+\
+        "<br><b>Position:</b> "+line[3]+"<br><b>Var Type:</b> "+varType+\
+        "<br><b>ACMG Code: </b>"+ACMGcode
+        
+        outputBedFile.write(chrom+"\t"+chromStart+"\t"+chromEnd+"\t"+line[3]+"\t0\t"+\
+                           strand+"\t"+chromStart+"\t"+chromEnd+"\t"+itemRgb+"\t"+\
+                           "\t".join(line[:4])+"\t"+varType+"\t"+\
+                            ACMGcode+"\t"+observation+\
+                            "\t"+line[7]+"\t"+_mouseOver+"\n")
+            
+rawFile.close()
+outputBedFile.close()
+
+bash("bedSort /hive/data/inside/enigmaTracksData/outputBedFile.bed \
+/hive/data/inside/enigmaTracksData/outputBedFile.bed")
+
+startOfAsFile="""table BRCAsplicing
+"BRCA1 and BRCA2 variant codes according to PVS1 decision trees (ENIGMA specifications version 1.1.0)"
+   (
+   string chrom;       "Reference sequence chromosome or scaffold"
+   uint   chromStart;  "Start position in chromosome"
+   uint   chromEnd;    "End position in chromosome"
+   string name;        "Var location"
+   uint score;         "Not used, all 0"
+   char[1] strand;     "- or +"
+   uint thickStart;    "Same as chromStart"
+   uint thickEnd;      "Same as chromEnd"
+   uint reserved;       "RGB value (use R,G,B string in input file)"
+"""
+
+line1 = bash("head -1 "+rawFilePath)
+line1 = line1.rstrip("\n").split("\t")
+
+name = []
+for i in range(8): 
+    asFileAddition = "   lstring extraField"+str(i)+';\t"'+line1[i]+'"\n'
+    startOfAsFile = startOfAsFile+asFileAddition
+startOfAsFile = startOfAsFile+"   string _mouseOver;"+'\t"'+'Field only used as mouseOver'+'"\n'
+    
+asFileOutput = open("/hive/data/inside/enigmaTracksData/BRCAsplicing.as","w")
+for line in startOfAsFile.split("\n"):
+    if "_mouseOver" in line:
+        asFileOutput.write(line+"\n   )")
+    else:
+        asFileOutput.write(line+"\n")
+asFileOutput.close()
+
+bash("bedToBigBed -as=/hive/data/inside/enigmaTracksData/BRCAsplicing.as -type=bed9+9 -tab \
+/hive/data/inside/enigmaTracksData/outputBedFile.bed /cluster/data/hg38/chrom.sizes \
+/hive/data/inside/enigmaTracksData/BRCAsplicingHg38.bb")
+
+bash("liftOver -bedPlus=9 -tab /hive/data/inside/enigmaTracksData/outputBedFile.bed \
+/hive/data/genomes/hg38/bed/liftOver/hg38ToHg19.over.chain.gz \
+/hive/data/inside/enigmaTracksData/outputBedFileHg19.bed /hive/data/inside/enigmaTracksData/unmapped.bed")
+
+bash("bedToBigBed -as=/hive/data/inside/enigmaTracksData/BRCAsplicing.as -type=bed9+9 -tab \
+/hive/data/inside/enigmaTracksData/outputBedFileHg19.bed /cluster/data/hg19/chrom.sizes \
+/hive/data/inside/enigmaTracksData/BRCAsplicingHg19.bb")
+
+bash("ln -sf /hive/data/inside/enigmaTracksData/BRCAsplicingHg38.bb /gbdb/hg38/bbi/enigma/BRCAsplicing.bb")
+bash("ln -sf /hive/data/inside/enigmaTracksData/BRCAsplicingHg19.bb /gbdb/hg19/bbi/enigma/BRCAsplicing.bb")