ea3708197af0c9d0a081a656bf9a88b8aaa68f15 jnavarr5 Tue Nov 5 10:55:10 2024 -0800 Adding the makedoc for the DDG2P track, refs #34097 diff --git src/hg/makeDb/doc/hg38/decipher.txt src/hg/makeDb/doc/hg38/decipher.txt new file mode 100644 index 0000000..02a635b --- /dev/null +++ src/hg/makeDb/doc/hg38/decipher.txt @@ -0,0 +1,102 @@ +######################################################################################## +# DECIPHER Developmental Disorders panel in the Gene2Phenotype database (DDG2P) +# November 5, 2024 - Yesenia Puga, Jairo Navarro, Gerardo Perez + +# Download required files +wget https://www.deciphergenomics.org/files/downloads/population_cnv_grch38.txt.gz # Downloads the CNV data file for hg38 +wget http://hgdownload.soe.ucsc.edu/goldenPath/hg38/bigZips/hg38.chrom.sizes # Downloads chromosome size data for hg38 +wget https://genome.ucsc.edu/goldenPath/help/examples/bedExample2.as # Downloads the .as file defining custom track fields +wget http://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/bedToBigBed # Downloads the bedToBigBed utility for converting BED to bigBed format +chmod 700 bedToBigBed # Changes permissions to make bedToBigBed executable + +# Prepare the BED file: +# Decompresses and trims the CNV data to the first 15 fields +zcat population_cnv_grch38.txt.gz | cut -f1-15 > population_cnv_grch38.bed + +# reorders columns to fit BED format +awk 'BEGIN {OFS="\t"} {print $2, $3, $4, $1, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38.bed > population_cnv_grch38_reordered.bed + +# removes the header line +tail -n +2 population_cnv_grch38_reordered.bed > population_cnv_grch38_final.bed + +# Adjust the BED file for bigBed conversion: +# prepends 'chr' to chromosome numbers and adjusts fields for bigBed +awk 'BEGIN{OFS="\t"} {print "chr"$1, $2, $3, $4, 0, ".", $2, $3, 0, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15}' population_cnv_grch38_final.bed > population_cnv_grch38_final_chr.bed + +# Sort the BED file: +# 
sorts lexicographically by chromosome and numerically by start position +LC_ALL=C sort -k1,1 -k2,2n population_cnv_grch38_final_chr.bed > population_cnv_grch38_final_sorted.bed + + +# Add RGB colors to the BED file using a Python script: +python3 ../assign_rgb_to_bed.py population_cnv_grch38_final_sorted.bed output_population_cnv_grch38.bed + +# Convert the BED file to bigBed format, indexing the name field (-extraIndex=name) for faster lookups: +./bedToBigBed -type=bed9+ -as=bedExample2.as -tab -extraIndex=name output_population_cnv_grch38.bed hg38.chrom.sizes population_cnv_grch38.bb + +####################################################### +# Author: Yesenia Puga +# Program: assign_rgb_to_bed.py +####################################################### + # The Python script 'assign_rgb_to_bed.py' is crucial in the data processing pipeline. + # It assigns RGB color values based on CNV type ('loss', 'gain', 'del/dup') + # to each entry, enhancing visual differentiation on the Genome Browser. It also updates the + # 'type' column from numeric identifiers to descriptive text, + # which is used in the browser for informative mouseover tooltips, aiding in quick and clear + # variant identification. 
"""Assign an RGB color and a descriptive CNV type to each row of a BED file.

Usage: python3 assign_rgb_to_bed.py <input_bed_file> <output_bed_file>
"""
import csv
import sys

# 0-based positions of the two columns this script rewrites.
_RGB_COLUMN = 8         # 9th column: receives the RGB string
_CNV_TYPE_COLUMN = 18   # 19th column: numeric CNV type code on input

# Define the color mappings for the RGB column based on the CNV type
_RGB_BY_CNV_TYPE = {
    'loss': '255,0,0',          # Red
    'gain': '0,0,255',          # Blue
    'del/dup': '128,128,128',   # Grey
}

# Convert numeric CNV values to descriptive text
cnv_descriptions = {
    '-1': 'loss',
    '0': 'del/dup',
    '1': 'gain'
}


def get_rgb_color(cnv_type):
    """Return the 'R,G,B' string for a descriptive CNV type.

    Args:
        cnv_type: One of 'loss', 'gain' or 'del/dup'.

    Returns:
        The RGB string: red for 'loss', blue for 'gain', grey for 'del/dup'.

    Raises:
        ValueError: If ``cnv_type`` is not one of the known types.
    """
    try:
        return _RGB_BY_CNV_TYPE[cnv_type]
    except (KeyError, TypeError):
        raise ValueError(f"Unexpected CNV type: {cnv_type}") from None


def process_file(input_bed_file, output_bed_file):
    """Rewrite the CNV type and RGB columns of a tab-separated BED file.

    For every input row, the numeric code in the 19th column ('-1', '0' or
    '1') is replaced by its text from ``cnv_descriptions`` and the matching
    color from ``get_rgb_color`` is stored in the 9th column. All other
    columns are written back unchanged. A confirmation message is printed
    once the whole file has been processed.

    Args:
        input_bed_file: Path of the tab-separated file to read.
        output_bed_file: Path of the file to write (overwritten if present).

    Raises:
        ValueError: If a row has fewer than 19 columns, or its 19th column
            is not a key of ``cnv_descriptions``.
    """
    # newline='' on both files lets the csv module handle line endings itself.
    with open(input_bed_file, 'r', newline='') as infile, \
            open(output_bed_file, 'w', newline='') as outfile:
        reader = csv.reader(infile, delimiter='\t')
        # csv.writer terminates rows with '\r\n' unless told otherwise; BED
        # files consumed by Unix tools should end lines with a plain '\n'.
        writer = csv.writer(outfile, delimiter='\t', lineterminator='\n')

        for row in reader:
            # A row long enough to hold the CNV type column is necessarily
            # long enough to hold the RGB column, so one length check suffices.
            if len(row) <= _CNV_TYPE_COLUMN:
                raise ValueError("Row does not contain enough columns to include the CNV type.")

            cnv_type_value = row[_CNV_TYPE_COLUMN]
            if cnv_type_value not in cnv_descriptions:
                raise ValueError(f"Invalid CNV type value found: {cnv_type_value}")

            cnv_type_description = cnv_descriptions[cnv_type_value]
            row[_CNV_TYPE_COLUMN] = cnv_type_description  # Update the type description in the 19th column
            row[_RGB_COLUMN] = get_rgb_color(cnv_type_description)  # Update RGB value in the 9th column

            writer.writerow(row)

    print(f"Updated file successfully. Output saved to {output_bed_file}.")


def main(argv=None):
    """Run the script from a command-line argument list.

    Args:
        argv: Argument list including the program name; defaults to
            ``sys.argv``.

    Returns:
        0 after processing the file, 1 if the argument count is wrong.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) != 3:
        # Report misuse on stderr and signal failure to the calling shell.
        print("Usage: python3 assign_rgb_to_bed.py <input_bed_file> <output_bed_file>",
              file=sys.stderr)
        return 1
    process_file(argv[1], argv[2])
    return 0


if __name__ == "__main__":
    sys.exit(main())

########################################################################################