src/hg/makeDb/doc/hg38/spliceAI.txt cec653ec536e9a536256b3ebddc39be5ef9701c8

cec653ec536e9a536256b3ebddc39be5ef9701c8
max
  Sun Sep 21 23:37:18 2025 -0700
adding spliceAi wt track, refs #35100

diff --git src/hg/makeDb/doc/hg38/spliceAI.txt src/hg/makeDb/doc/hg38/spliceAI.txt
index 5d6c182c773..aacb0664506 100644
--- src/hg/makeDb/doc/hg38/spliceAI.txt
+++ src/hg/makeDb/doc/hg38/spliceAI.txt
@@ -1,70 +1,70 @@
 #! /bin/bash
 
 # Fetch the SpliceAI scores VCF from the Ensembl FTP site into /hive/data/outside/spliceAi/
 wget https://ftp.ensembl.org/pub/data_files/homo_sapiens/GRCh38/variation_plugins/spliceai_scores.masked.snv.ensembl_mane.grch38.110.vcf.gz \
     -O /hive/data/outside/spliceAi/spliceai_scores.masked.snv.ensembl_mane.grch38.110.vcf.gz
 
 # The VCF chromosome names have no 'chr' prefix; also remember that VCF positions are 1-based.
 # The fields are:
 #CHROM  POS     ID      REF     ALT     QUAL    FILTER  INFO
 # but ID, QUAL, and FILTER are empty
 #INFO looks like this:
 #SpliceAI=G|OR4F5|0.01|0.00|0.00|0.00|-32|49|-40|-31
 ##INFO=<ID=SpliceAI,Number=.,Type=String,Description="SpliceAIv1.3 variant annotation. These include delta scores (DS) and delta positions (DP) for acceptor gain (AG), acceptor loss (AL), donor gain (DG), and donor loss (DL). Format: ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL">
 
 # autoSql schema for the bed9+4 file written by the conversion step in this doc:
 # the nine standard BED fields plus score, splice effect type, relative
 # position and gene symbol.
 cat << '_EOF_' > spliceAI.as
 table spliceAI
 "Bed 9+4 file with SpliceAI delta score, splice effect type, relative position and gene symbol per variant."
     (
     string chrom;      "Chromosome (or contig, scaffold, etc.)"
     uint   chromStart; "Start position in chromosome"
     uint   chromEnd;   "End position in chromosome"
     string name;       "Name of item"
     uint   score;      "Score from 0-1000"
     char[1] strand;    "+ or -"
     uint thickStart;   "Start of where display should be thick (start codon)"
     uint thickEnd;     "End of where display should be thick (stop codon)"
     uint reserved;     "Used as itemRgb as of 2004-11-22"
     float AIscore;       "spliceAI score"
     string spliceType; "donor_gain, donor_loss, acceptor_gain, or acceptor_loss"
     string relativePos;   "Relative location of donor or acceptor affected by this variant"
     lstring gene;      "Gene symbol"
     )
 _EOF_
 
 # Convert the SpliceAI VCF into spliceAI.bed (bed9+4): one row per variant and
 # splice effect type whose delta score is >= 0.02.
 infile='/hive/data/outside/spliceAi/spliceai_scores.masked.snv.ensembl_mane.grch38.110.vcf.gz'
 #  note: older versions of python complain about f'' strings
 #  (only the commented-out debug print below uses one)
 zcat $infile | python3.11 <(
     cat << "END"
 import sys, csv
 
 # Splice effect types mapped to the itemRgb color of their rows. The order must
 # match the order of the values in the SpliceAI INFO field
 # (DS_AG|DS_AL|DS_DG|DS_DL and DP_AG|DP_AL|DP_DG|DP_DL), because the entries
 # are paired with the scores/positions by position below.
 effect_colors = {'acceptor_gain' : '255,0,0',
                  'acceptor_loss' : '255,128,0',
                  'donor_gain' : '0,0,255',
                  'donor_loss' : '212,0,255'}
 
 # Delta scores below this threshold are not written to the BED file.
 min_score = 0.02
 
 with open('spliceAI.bed', 'w', newline='', encoding='utf-8') as bed_file:
     # csv.writer terminates rows with '\r\n' by default; request plain '\n'
     # so the BED file gets Unix line endings.
     bed_writer = csv.writer(bed_file, delimiter='\t', lineterminator='\n')
 
     for line in sys.stdin:
         # skip VCF header lines
         if line.startswith('#'):
             continue
         # every data line must have exactly the 8 VCF columns
         chrom, pos, _id, ref, alt, _qual, _filter, info = line.strip().split('\t')
         # VCF positions are 1-based, BED starts are 0-based
         startpos = int(pos) - 1
         # INFO looks like SpliceAI=ALLELE|SYMBOL|DS_AG|DS_AL|DS_DG|DS_DL|DP_AG|DP_AL|DP_DG|DP_DL
         # NOTE(review): the header declares Number=., so a record could in
         # principle carry several comma-separated annotations; this parser
         # assumes exactly one per line -- confirm for this file.
         info_parts = info.split('|')
         gene = info_parts[1]
         scores = [float(s) for s in info_parts[2:6]]
         positions = [int(s) for s in info_parts[6:10]]
         # match each effect type with its score and relative position
         for (effect, color), score, position in zip(effect_colors.items(), scores, positions):
             if score >= min_score:
                 # prefix positive relative positions with '+'
                 if position > 0:
                     position = '+' + str(position)
                 # print(f"Type: {effect}, Score: {score}, Position: {position}")
                 bed_writer.writerow(['chr'+chrom, startpos, startpos+1, ref+'>'+alt, 0, '+', startpos, startpos, color, score, effect, position, gene])
 
 END
 )
 # Build the bigBed from spliceAI.bed using the spliceAI.as schema and the hg38 chromosome sizes.
 bedToBigBed \
     -type=bed9+4 \
     -tab \
     -as=spliceAI.as \
     spliceAI.bed \
     /hive/data/genomes/hg38/chrom.sizes \
     ~/public_html/trackHubs/spliceAIhub/hg38/spliceAI.bb
 
-
+# Got splice AI wildtype files as-is from Michael Hiller