2f75ea14748d867f3ab9f56ccaa69bc74e9486a3 gperez2 Tue Jan 14 22:35:06 2025 -0800 Renaming/updating DDG2P track to DECIPHER Population CNVs, refs #35053 diff --git src/hg/makeDb/doc/hg38/decipher.txt src/hg/makeDb/doc/hg38/decipher.txt index 0190c43..219bd88 100644 --- src/hg/makeDb/doc/hg38/decipher.txt +++ src/hg/makeDb/doc/hg38/decipher.txt @@ -1,110 +1,147 @@ ######################################################################################## # DECIPHER Developmental Disorders panel in the Gene2Phenotype database (DDG2P), hg38/hg19 # November 5, 2024 - Yesenia Puga, Jairo Navarro, Gerardo Perez # Download required files wget https://www.deciphergenomics.org/files/downloads/population_cnv_grch38.txt.gz # Downloads the CNV data file for hg38 wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes # Downloads chromosome size data for hg38 wget https://genome.ucsc.edu/goldenPath/help/examples/bedExample2.as # Downloads the .as file defining custom track fields wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed # Downloads the bedToBigBed utility for converting BED to bigBed format chmod 700 bedToBigBed # Changes permissions to make bedToBigBed executable # Prepare the BED file: # Decompresses and trims the CNV data to the first 15 fields zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed # reorders columns to fit BED format awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38.bed > population_cnv_grch38_reordered.bed # removes the header line tail -n +2 population_cnv_grch38_reordered.bed > population_cnv_grch38_final.bed # Adjust the BED file for bigBed conversion: # prepends 'chr' to chromosome numbers and adjusts fields for bigBed awk 'BEGIN{OFS="\t"} {print "chr"$1, $2, $3, $4, 0, ".", $2, $3, 0, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38_final.bed > population_cnv_grch38_final_chr.bed # Sort the BED 
file: # sorts lexicographically by chromosome and numerically by start position LC_ALL=C sort -k1,1 -k2,2n population_cnv_grch38_final_chr.bed > population_cnv_grch38_final_sorted.bed # Add RGB colors to the BED file using a Python script: python3 ../assign_rgb_to_bed.py population_cnv_grch38_final_sorted.bed output_population_cnv_grch38.bed # Convert the BED file to bigBed format, indexing by gene name for faster lookups: ./bedToBigBed -type=bed9+ -as=bedExample2.as -tab -extraIndex=name output_population_cnv_grch38.bed hg38.chrom.sizes population_cnv_grch38.bb # Add files to hive cp hg19/population_cnv_grch37.bb /hive/data/genomes/hg19/bed/ddg2p/ddg2pSyndromes.bb cp hg38/population_cnv_grch38.bb /hive/data/genomes/hg38/bed/ddg2p/ddg2pSyndromes.bb # Create symlinks from hive to /gbdb ln -s /hive/data/genomes/hg19/bed/ddg2p/ddg2pSyndromes.bb /gbdb/hg19/decipher/ddg2pSyndromes.bb ln -s /hive/data/genomes/hg38/bed/ddg2p/ddg2pSyndromes.bb /gbdb/hg38/decipher/ddg2pSyndromes.bb ####################################################### Author: Yesenia Puga Program: assign_rgb_to_bed.py ####################################################### # The Python script 'assign_rgb_to_bed.py' is crucial in the data processing pipeline. # It assigns RGB color values based on CNV type ('loss', 'gain', 'del/dup') # to each entry, enhancing visual differentiation on the Genome Browser. It also updates the # 'type' column from numeric identifiers to descriptive text, # which is used in the browser for informative mouseover tooltips, aiding in quick and clear # variant identification. 
import csv import sys # Define the color mappings for the RGB column based on the CNV type def get_rgb_color(cnv_type): if cnv_type == "loss": return '255,0,0' # Red elif cnv_type == "gain": return '0,0,255' # Blue elif cnv_type == "del/dup": return '128,128,128' # Grey else: raise ValueError(f"Unexpected CNV type: {cnv_type}") # Convert numeric CNV values to descriptive text cnv_descriptions = { '-1': 'loss', '0': 'del/dup', '1': 'gain' } def process_file(input_bed_file, output_bed_file): with open(input_bed_file, 'r') as infile, open(output_bed_file, 'w', newline='') as outfile: reader = csv.reader(infile, delimiter='\t') writer = csv.writer(outfile, delimiter='\t') for row in reader: if len(row) < 19: raise ValueError("Row does not contain enough columns to include the CNV type.") cnv_type_value = row[18] if cnv_type_value not in cnv_descriptions: raise ValueError(f"Invalid CNV type value found: {cnv_type_value}") cnv_type_description = cnv_descriptions[cnv_type_value] row[18] = cnv_type_description # Update the type description in the 19th column rgb_value = get_rgb_color(cnv_type_description) if len(row) < 9: raise ValueError("Row does not contain enough columns to include the RGB value.") row[8] = rgb_value # Update RGB value in the 9th column writer.writerow(row) print(f"Updated file successfully. 
Output saved to {output_bed_file}.") if __name__ == "__main__": if len(sys.argv) != 3: print("Usage: python update_rgb.py ") else: input_bed_file = sys.argv[1] output_bed_file = sys.argv[2] process_file(input_bed_file, output_bed_file) ######################################################################################## +# Renaming/updating DDG2P track to DECIPHER Population CNVs #35053 +# January 14, 2025 - Gerardo Perez + +# Downloaded files +wget https://www.deciphergenomics.org/files/downloads/population_cnv_grch38.txt.gz +zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed +wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes +cp /cluster/home/yepuga/public_html/trackHubs/ddg2p_syndromes/hg38/bedExample2.as /hive/users/gperez2/tracks/decipher/hg38 +cp /cluster/home/yepuga/public_html/trackHubs/ddg2p_syndromes/hg38/assign_rgb_to_bed.py /hive/users/gperez2/tracks/decipher/hg38 + +# Working directory +cd /hive/users/gperez2/tracks/decipher/hg38 + +# Commands +zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed + +# reorders columns to fit BED format +awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38.bed > population_cnv_grch38_reordered.bed + +# removes the header line +tail -n +2 population_cnv_grch38_reordered.bed > population_cnv_grch38_final.bed + +# Adjust the BED file for bigBed conversion: +# prepends 'chr' to chromosome numbers, subtracts 1 from the start position, and adjusts fields for bigBed +awk 'BEGIN{OFS="\t"} {print "chr"$1, $2-1, $3, $4, 0, ".", $2, $3, 0, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38_final.bed > population_cnv_grch38_final_chr.bed + +bedSort population_cnv_grch38_final_chr.bed population_cnv_grch38_sorted.bed + +# Add RGB colors to the BED file using a Python script: +python3 assign_rgb_to_bed.py population_cnv_grch38_sorted.bed output_population_cnv_grch38.bed + 
+bedToBigBed -type=bed9+ -as=bedExample2.as -tab -extraIndex=name output_population_cnv_grch38.bed hg38.chrom.sizes population_cnv_grch38.bb + +# Moving files +cp hg38/population_cnv_grch38.bb /hive/data/genomes/hg38/bed/decipher/population_cnv.bb + +ln -s /hive/data/genomes/hg38/bed/decipher/population_cnv.bb /gbdb/hg38/decipher/population_cnv.bb