5eea974e049ff8f2c80085c31a50ddae484529ae gperez2 Sun Apr 21 14:19:46 2024 -0700 Making hg19 abSplice into a native track, refs #33251 diff --git src/hg/makeDb/doc/hg19.txt src/hg/makeDb/doc/hg19.txt index 35efec8..9ac989c 100644 --- src/hg/makeDb/doc/hg19.txt +++ src/hg/makeDb/doc/hg19.txt @@ -35341,15 +35341,218 @@ cd bayesDel #Makes symlinks for file in $(ls /hive/data/outside/bayesDelTrack/hg19/*.bw) ; do ln -s $file; done cd /hive/data/outside/bayesDelTrack/hg19 cp template.html bayesDel.html cp trackDb.txt bayesDel.ra vi bayesDel.ra #Indented subtracks and changed bigDataUrl (:%s;hg19;;g) #Copies trackDb file and html file cp bayesDel* ~/kent/src/hg/makeDb/trackDb/human/hg19 cd ~/kent/src/hg/makeDb/trackDb/human/ # Made predictionScoresSuper.ra file to make a superTrack and include bayesDel.ra cp hg19/bayesDel.html . mv bayesDel.html predictionScoresSuper.html vi predictionScoresSuper.html #Edited the text for predictionScoresSuper.html ############################################################################## +# Mar. 18, 2024 - Jeltje van Baren refs #33251 + +#! /bin/bash + +mkdir /hive/data/genomes/hg19/bed/absplice +cd /hive/data/genomes/hg19/bed/absplice +wget 'https://zenodo.org/records/10776853/files/AbSplice_DNA_hg19_snvs_high_scores.zip' +unzip AbSplice_DNA_hg19_snvs_high_scores.zip +# Unpacks in per-gene files in three directories, with score_cutoffs 0.01, 0.05 and 0.2 + +# it took some doing to figure out the genome version used for the annotation +# ended up getting version gene IDs from +# /hive/data/genomes/hg19/bed/gencodeV${i}lift37/(hgcImport/)tables/wgEncodeGencodeGeneSourceV${i}lift37.tab +# and comparing them to the ones in +# /hive/data/genomes/hg19/bed/absplice/AbSplice_DNA_hg19_snvs_high_scores/score_cutoff\=0.01/ +# only version34 has no missing gene IDs + +# using the lowest cutoff (most files) for this track +# the files have coordinates and gene IDs but not strand info, and I want to add this to the track +wget https://ftp.ebi.ac.uk/pub/databases/gencode/Gencode_human/release_34/GRCh37_mapping/gencode.v34lift37.annotation.gff3.gz +zcat gencode.v34lift37.annotation.gff3.gz | grep -P '\tgene\t' | cut -f7,9 | sed 's/ID=.*gene_id=//' | \ + sed 's/\..*gene_name=/\t/' | sed 's/;.*//' > hg19.gene.strands + +# we have to loop over the files because they have different numbers of columns. +# the info we want is in the (per-tissue) columns starting with ABSplice_DNA_ +# we want to display the tissue(s) with the highest ABSplice_DNA score on mouseover, +# and a full table on clicking + +/cluster/home/jeltje/miniconda3/bin/python3.11 - << END + +import os +import csv +import gzip +from multiprocessing import Pool + +import sys +sys.exit() + +indir='/hive/data/genomes/hg19/bed/absplice/AbSplice_DNA_hg19_snvs_high_scores/score_cutoff=0.01/' +outbase='AbSplice.hg19' +strandfile='hg19.gene.strands' + +def itemRGB(score): + '''Return color based on score''' + # https://github.com/gagneurlab/absplice?tab=readme-ov-file#output + # score cutoffs should be 0.2, 0.05 and 0.01 + rgb = '255,255,255' # black + # red (high) , orange, blue (low) + for cutoff, color in [[0.2, '255,0,0'], [0.05, '255,128,0'], [0.01, '0,0,255']]: + if score >= cutoff: + return color + return rgb + +def process_gzipped_file(file_path, outAB, strands): + '''Extract AbSplice from one gzipped file''' + + with gzip.open(file_path, 'rt') as infile: + reader = csv.DictReader(infile, delimiter='\t') + header = reader.fieldnames + + # Find the indices of columns starting with 'AbSplice_DNA' + indices_to_keep = [i for i, col in enumerate(header) if col.startswith('AbSplice_DNA')] + header = [column.replace('ABSplice_DNA_', '') for column in header] + + # Open the output files with the csv.writer + with open(outAB, 'a') as outfile1: + # Create a csv.writer object with tab as the delimiter + ABwriter = csv.writer(outfile1, delimiter='\t') + + # Iterate over rows in the input file and write selected columns to the output file + for row in reader: + if row['chrom'] == 'chrom': + continue + # Get the index and value for each column in indices_to_keep + all_values = [(header[i], row[header[i]]) for i in indices_to_keep] + # In this (but not the previous) version of the data, the final value is AbSplice_DNA_max + max_value = float(all_values.pop()[1]) # remove from list + + # turn this information into a html table + html_table = '<table>' + for tvals in all_values: + html_table += f' <tr><td>{tvals[0]}</td><td>{tvals[1]}</td></tr>' + html_table += '</table>' + + # Find the top 10% maximum values (first make sure all row entries are floats) + all_values = [(x, float(y) if y else 0) for x, y in all_values] + if max_value == 0: + topValString = 'No tissues with scores > 0' + else: + threshold = 0.1 * max_value + # Filter max_values to include only entries with the top 10% of values + max_entries = {column: value for column, value in all_values if value > max_value - threshold} + + # mouseover information + topValString = 'Max scores in <br>' + topValString += '<br>'.join([f'{column}: {value}' for column, value in max_entries.items()]) + + # this will be the item label + name = f"{row['ref']}>{row['alt']}" + + # AB coordinates appear to be 1-based + startpos = int(row['pos']) -1 + [strand, hugo] = strands[row['gene_id']] + ABwriter.writerow([row['chrom'], startpos, startpos+1, name, 0, strand, startpos, startpos, itemRGB(max_value), row['gene_id'], hugo, max_value, topValString, html_table]) + +# Main + +# read the strands +strands = dict() +with open(strandfile, 'r') as infile: + for line in infile: + strand, gene, hugo = line.strip().split('\t') + strands[gene] = [strand, hugo] + +# Do not append to existing files +ABoutfile = outbase + '.ab.bed' +if os.path.exists(ABoutfile): + os.remove(ABoutfile) + +# Get a list of all files in the directory +gzfiles = [os.path.join(indir, filename) for filename in os.listdir(indir) if filename.endswith(".gz")] + +# Parallel process on 8 threads +with Pool(8) as pool: + # Map the process_gzipped_file function to the list of files + pool.starmap(process_gzipped_file, [(infile, ABoutfile, strands) for infile in gzfiles]) +END + + + +# this created AbSplice.hg19.ab.bed +# duplicates happen when genes overlap, e.g. chr9:136,741,919-136,741,924 +# when this happens we want to display only the higher score +# sort, then filter for duplicates +sort -k1,1 -k2,2n AbSplice.hg19.ab.bed | /cluster/home/jeltje/miniconda3/bin/python3.11 <( + cat << "END" +import sys + +printline = False +prevcoord = '0' +prevallele = False +hiscore = 0 +for line in sys.stdin: + fields = line.split('\t') + score = float(fields[11]) + # check if startpos and name (alleles) are identical + if fields[1] == prevcoord and fields[3] == prevallele: + if score > hiscore: + hiscore = score + printline = line + # if not identical, print the previous line and start over + else: + if printline: + print(printline, end='') + printline = line + prevcoord = fields[1] + prevallele = fields[3] + hiscore = score +print(printline, end='') +END +) > filtered.hg19.bed + +# Create custom as file for this bigBed: +cat << '_EOF_' > AbSplice.as +table abSplice +"Bed 9+5 file with Ensembl Gene IDs and ABsplice values per tissue." + ( + string chrom; "Chromosome (or contig, scaffold, etc.)" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Name of item" + uint score; "Score from 0-1000" + char[1] strand; "+ or -" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint reserved; "Used as itemRgb as of 2004-11-22" + string ENSGid; "Ensembl Gene ID" + string hugoId; "hugo Gene ID" + float spliceABscore; "AbSplice highest score for this position" + lstring maxScore; "All tissues containing the highest score" + lstring tissues; "All 49 GTEX tissues with ABSplice value (empty if none were provided)" + ) +_EOF_ + +bedToBigBed -type=bed9+5 -tab -as=AbSplice.as filtered.hg19.bed /hive/data/genomes/hg19/chrom.sizes ~/public_html/trackHubs/AbSplice_hub/hg19/AbSplice.bb + +-------------------------------------- +# Update ABSplice hg19 #33251 (2024-03-18 Gerardo) +cd /hive/data/genomes/hg19/bed/ +mkdir abSplice; cd abSplice +cp /cluster/home/jeltje/public_html/trackHubs/AbSplice_hub/hg19/AbSplice.bb . +cp /cluster/home/jeltje/public_html/trackHubs/AbSplice_hub/hg19/description.html . +cp /cluster/home/jeltje/public_html/trackHubs/AbSplice_hub/hg19/trackDb.txt . +mv description.html abSplice.html +mv trackDb.txt abSplice.ra +cp abSplice.html ~/kent/src/hg/makeDb/trackDb/human/hg19 +cp abSplice.ra ~/kent/src/hg/makeDb/trackDb/human/hg19 +cd /gbdb/hg19/ +mkdir abSplice; cd abSplice +ln -s /hive/data/genomes/hg19/bed/abSplice/AbSplice.bb +cd ~/kent/src/hg/makeDb/trackDb/human/hg19 +vi abSplice.ra #Indented subtracks, changed bigDataUrl, adding group phenDis and html line +##############################################################################