#!/usr/bin/python3
"""Join the DDG2P (Gene2Phenotype DD panel) CSV with HGNC gene coordinates.

Usage:
    ddg2p_merge.py <g2p.csv> <gene_coords.tsv> <output.bed>

The output is a headerless, tab-separated BED 9+20 file: the nine standard
BED columns (coordinates come from the coordinate TSV, the name is the G2P
gene symbol and itemRgb encodes the G2P confidence) followed by the twenty
G2P columns listed in ``G2P_FIELDS``.  The file still has to be sorted
before it is handed to bedToBigBed.
"""
import csv
import re
import sys
from pathlib import Path
from typing import Dict, List, Optional, Sequence, Union

PathLike = Union[str, Path]
Row = Dict[str, str]

# itemRgb value for each G2P confidence level.
CONFIDENCE_COLORS = {
    "definitive": "0,128,0",    # green
    "strong": "0,0,255",        # blue
    "moderate": "255,165,0",    # orange
    "limited": "255,0,0",       # red
    "refuted": "128,128,128",   # gray
}
DEFAULT_COLOR = "0,0,0"         # black, for any other confidence value

# G2P CSV columns copied verbatim after the nine BED columns.  The order
# must match the custom fields of the accompanying autoSql definition.
G2P_FIELDS = (
    "g2p id",
    "gene mim",
    "hgnc id",
    "previous gene symbols",
    "disease name",
    "disease mim",
    "disease MONDO",
    "allelic requirement",
    "cross cutting modifier",
    "confidence",
    "variant consequence",
    "variant types",
    "molecular mechanism",
    "molecular mechanism categorisation",
    "molecular mechanism evidence",
    "phenotypes",
    "publications",
    "panel",
    "comments",
    "date of last review",
)

# Placeholder coordinates used only when unmapped records are kept on request.
UNMAPPED_COORD = {
    "#chrom": "",
    "chromStart": "0",
    "chromEnd": "0",
    "score": "0",
    "strand": "+",
    "thickStart": "0",
    "thickEnd": "0",
}

_HGNC_PREFIX = "HGNC:"
_FIELD_BREAKS = re.compile(r"[\t\r\n]+")


def _clean(value: Optional[str]) -> str:
    """Return *value* as a single-line, tab-free string ("" for None).

    A BED record is one physical line with tab-separated columns, so tabs
    and line breaks inside free-text fields (comments, disease names, ...)
    are replaced by a single space.
    """
    if value is None:
        return ""
    return _FIELD_BREAKS.sub(" ", str(value)).strip()


def _hgnc_key(raw: Optional[str]) -> str:
    """Return the bare HGNC number for *raw*, dropping an optional "HGNC:" prefix."""
    key = (raw or "").strip()
    if key.upper().startswith(_HGNC_PREFIX):
        key = key[len(_HGNC_PREFIX):]
    return key


def confidence_to_color(confidence: Optional[str]) -> str:
    """Map a G2P confidence string to an "R,G,B" string for BED itemRgb.

    Matching ignores case and surrounding whitespace; unknown or missing
    values map to black.
    """
    key = (confidence or "").strip().lower()
    return CONFIDENCE_COLORS.get(key, DEFAULT_COLOR)


def load_g2p(file_path: PathLike) -> Dict[str, List[Row]]:
    """Load the G2P CSV into a dict mapping HGNC ID -> list of CSV rows.

    One gene can have several gene-disease records, hence the list.  Keys
    are the bare HGNC number (whitespace and an optional "HGNC:" prefix
    removed) so they line up with the keys of ``load_coordinates``.

    Raises:
        KeyError: if the CSV has no "hgnc id" column.
    """
    g2p_map: Dict[str, List[Row]] = {}
    # utf-8-sig reads plain UTF-8 too, and drops a BOM that would otherwise
    # become part of the first column name.
    with open(file_path, newline="", encoding="utf-8-sig") as csvfile:
        for row in csv.DictReader(csvfile):
            g2p_map.setdefault(_hgnc_key(row["hgnc id"]), []).append(row)
    return g2p_map


def load_coordinates(file_path: PathLike) -> Dict[str, Row]:
    """Load the coordinate TSV into a dict mapping HGNC number -> TSV row.

    Only rows whose "name" column starts with "HGNC:" are used; the key is
    the part after the prefix.  If an ID occurs more than once the last
    row wins.

    Raises:
        KeyError: if the TSV has no "name" column.
    """
    coord_map: Dict[str, Row] = {}
    with open(file_path, newline="", encoding="utf-8-sig") as tsvfile:
        for row in csv.DictReader(tsvfile, delimiter="\t"):
            name = (row["name"] or "").strip()
            if name.startswith(_HGNC_PREFIX):
                coord_map[name.split(":")[1]] = row
    return coord_map


def join_and_write(
    g2p_data: Dict[str, List[Row]],
    coords: Dict[str, Row],
    output_file: PathLike,
    *,
    keep_unmapped: bool = False
) -> List[str]:
    """Join G2P records with gene coordinates and write them as BED 9+20.

    Args:
        g2p_data: mapping HGNC ID -> G2P rows, as returned by ``load_g2p``.
        coords: mapping HGNC ID -> coordinate row, as returned by
            ``load_coordinates``.
        output_file: path of the tab-separated BED file to (over)write.
        keep_unmapped: records whose gene has no coordinates are skipped by
            default, because a BED line with an empty chrom cannot be
            sorted or loaded.  Pass True to write them anyway with
            placeholder coordinates (empty chrom, 0-0, "+" strand).

    Returns:
        The HGNC IDs present in *g2p_data* that have no entry in *coords*,
        in first-seen order.

    Raises:
        KeyError: if a row lacks one of the expected columns.
    """
    unmapped: List[str] = []
    # newline="" plus an explicit "\n" gives Unix line endings on every
    # platform; a trailing "\r" would end up inside the last BED column.
    with open(output_file, "w", newline="", encoding="utf-8") as out:
        for hgnc_id, rows in g2p_data.items():
            coord = coords.get(hgnc_id)
            if not coord:
                unmapped.append(hgnc_id)
                if not keep_unmapped:
                    continue
                coord = UNMAPPED_COORD

            for row in rows:
                # Standard BED 9 columns.
                fields = [
                    coord["#chrom"],
                    coord["chromStart"],
                    coord["chromEnd"],
                    row["gene symbol"],
                    coord["score"],
                    coord["strand"],
                    coord["thickStart"],
                    coord["thickEnd"],
                    confidence_to_color(row["confidence"]),
                ]
                # 20 G2P columns, in autoSql order.
                fields.extend(row[column] for column in G2P_FIELDS)
                # Written by hand rather than with csv.writer: CSV quoting
                # of fields containing '"' is not understood by BED tools.
                out.write("\t".join(_clean(field) for field in fields) + "\n")
    return unmapped


def main(argv: Optional[Sequence[str]] = None) -> int:
    """Command-line entry point; returns the process exit status."""
    argv = sys.argv if argv is None else argv
    if len(argv) != 4:
        prog = argv[0] if argv else "ddg2p_merge.py"
        print(
            f"Usage: {prog} <g2p.csv> <gene_coords.tsv> <output.bed>",
            file=sys.stderr,
        )
        return 1

    g2p_file = Path(argv[1])
    coord_file = Path(argv[2])
    output_file = Path(argv[3])

    g2p_data = load_g2p(g2p_file)
    coords_data = load_coordinates(coord_file)

    unmapped = join_and_write(g2p_data, coords_data, output_file)
    if unmapped:
        print(
            f"Skipped {len(unmapped)} HGNC ID(s) without coordinates: "
            + ", ".join(unmapped),
            file=sys.stderr,
        )
    print(f"Output written to {output_file}")
    return 0


if __name__ == "__main__":
    sys.exit(main())
"Number of alleles affected to cause the relevant disease" + string cross_cutting_modifier; "Optional cross-cutting modifiers giving extra info" + string confidence; "Likelihood that the gene-disease association is true" + string variant_consequence; "SO terms for the variant consequence" + string variant_types; "SO terms for variant types" + string molecular_mechanism; "Molecular mechanism" + string molecular_mechanism_categorisation; "Categorisation of the molecular mechanism" + lstring molecular_mechanism_evidence; "Evidence to determine the disease mechanism" + lstring phenotypes; "Human phenotype ontology IDs" + lstring publications; "Pubmed IDs" + string panel; "Disease grouping or defined clinical category" + lstring comments; "Comments added by online curators" + string date_of_last_review; "Date of last review" +) + +############################################################ +# Sort and build BigBed +############################################################ +sort -k1,1 -k2,2n hg19bed > ddg2p_hg19.sorted +sort -k1,1 -k2,2n hg38bed > ddg2p_hg38.sorted + +fetchChromSizes hg19 > hg19.chrom.sizes +fetchChromSizes hg38 > hg38.chrom.sizes + +bedToBigBed -type=bed9+20 -as=../ddg2p.as -tab hg19/ddg2p_hg19.sorted hg19.chrom.sizes DDG2P_hg19.bb +bedToBigBed -type=bed9+20 -as=../ddg2p.as -tab hg38/ddg2p_hg38.sorted hg38.chrom.sizes DDG2P_hg38.bb + +