72924c5113cc9889caa9ec788cb1fff7ddcee17f
lrnassar
  Mon Jun 17 16:37:50 2024 -0700
Staging updated/new SpliceAI data which is now QA Ready. Refs #27141

diff --git src/hg/makeDb/scripts/spliceAI/spliceAI.py src/hg/makeDb/scripts/spliceAI/spliceAI.py
new file mode 100644
index 0000000..efe714d
--- /dev/null
+++ src/hg/makeDb/scripts/spliceAI/spliceAI.py
@@ -0,0 +1,74 @@
+import sys, csv, gzip
+import subprocess
+
+allFiles = {'hg19MaskedIndel': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.masked.indel.hg19.vcf.gz',\
+           'hg38MaskedIndel': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.masked.indel.hg38.vcf.gz',\
+           'hg19MaskedSnv': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.masked.snv.hg19.vcf.gz',\
+           'hg38MaskedSnv': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.masked.snv.hg38.vcf.gz',\
+           'hg19RawIndel': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.raw.indel.hg19.vcf.gz',\
+           'hg38RawIndel': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.raw.indel.hg38.vcf.gz',\
+           'hg19RawSnv': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.raw.snv.hg19.vcf.gz',\
+           'hg38RawSnv': '/hive/data/outside/spliceAi/scoresFromIllumina/spliceai_scores.raw.snv.hg38.vcf.gz'}
+
+def processAndMakeBedFile(dbsAndMasking,filePath):
+    with gzip.open(filePath, 'rt') as f:
+        with open('/hive/data/outside/spliceAi/'+dbsAndMasking+'.bed', 'w', newline='', encoding='utf-8') as outfile1:
+            AIwriter = csv.writer(outfile1, delimiter='\t')
+            atypes = {'acceptor_gain' : '255,0,0', 
+              'acceptor_loss' : '255,128,0', 
+              'donor_gain' : '0,0,255', 
+              'donor_loss' : '212,0,255'}
+            for line in f:
+                if line.startswith('#'):
+                    continue
+                [chrom, pos, id, ref, alt, qual, filter, info] = line.strip().split('\t')
+                startpos = int(pos) -1
+                # match scores with positions
+                name = info.split('|')[1]
+                scores = [float(s) for s in info.split('|')[2:6]]
+                positions = [int(s) for s in info.split('|')[6:10]]
+                # Iterate over the zipped data
+                for atype, score, position in zip(atypes.keys(), scores, positions):
+                    # Check if the score is greater than or equal to 0.02
+                    if score >= 0.02:
+                        # make clear if position is upstream or downstream
+                        if position > 0:
+                            position = '+' + str(position)
+                  #      print(f"Type: {atype}, Score: {score}, Position: {position}")
+                        AIwriter.writerow(['chr'+chrom, startpos, startpos+1, ref+'>'+alt, 0, '+', startpos, startpos, atypes[atype], score, atype, position, name])
+
+def bash(cmd):
+    """Run the cmd in bash subprocess"""
+    try:
+        rawBashOutput = subprocess.run(cmd, check=True, shell=True,\
+                                       stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT)
+        bashStdoutt = rawBashOutput.stdout
+    except subprocess.CalledProcessError as e:
+        raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output))
+    return(bashStdoutt)
+
+for file in allFiles:
+    processAndMakeBedFile(file,allFiles[file])
+    
+for track in allFiles:
+    trackName = track
+    originalFileUrl = allFiles[track]
+    bedFile = "/hive/data/outside/spliceAi/"+trackName+".bed"
+    if "hg19" in trackName:
+        dbs = "hg19"
+    else:
+        dbs = "hg38"
+    if "Masked" in trackName:
+        if "Snv" in trackName:
+            BBname = "spliceAIsnvsMasked"
+        else:
+            BBname = "spliceAIindelsMasked"
+    else:
+        if "Snv" in trackName:
+            BBname = "spliceAIsnvs"
+        else:
+            BBname = "spliceAIindels"
+    bash("bedToBigBed -type=bed9+4 -tab -as=/hive/data/outside/spliceAi/spliceAI.as "+bedFile+" \
+    /hive/data/genomes/"+dbs+"/chrom.sizes /hive/data/outside/spliceAi/"+dbs+"/"+BBname+".bb")
+
+    bash("ln -sf /hive/data/outside/spliceAi/"+dbs+"/"+BBname+".bb /gbdb/"+dbs+"/bbi/"+BBname+".bb")