0c05142eca44cb0c609e93edb2bf2b570b2951ad jnavarr5 Wed Nov 19 13:32:23 2025 -0800 Adding the makedoc for the G2P track, refs #36469 diff --git src/hg/makeDb/doc/hg38/g2p.txt src/hg/makeDb/doc/hg38/g2p.txt new file mode 100644 index 00000000000..d883b6e7cae --- /dev/null +++ src/hg/makeDb/doc/hg38/g2p.txt @@ -0,0 +1,165 @@ +######################################################################### +# G2P (09-19-2025) Jaidan Jenkins-Kiefer, Jairo Navarro +# Script to generate the track was created by Jaidan and updated by Jairo + +########################################################### +#!/usr/bin/python3 + +############################################################ +# Merge G2P with gene coords from HGNC +############################################################ +import pandas as pd +import csv +import sys +from pathlib import Path + +import subprocess +from io import BytesIO +#from zipfile import ZipFile +from urllib.request import urlopen +from ucscGb.qa.tables import trackUtils +from ucscGb.qa import qaUtils + +def bash(cmd): + """Run the cmd in bash subprocess""" + try: + rawBashOutput = subprocess.run(cmd, check=True, shell=True,\ + stdout=subprocess.PIPE, universal_newlines=True, stderr=subprocess.STDOUT) + bashStdoutt = rawBashOutput.stdout + except subprocess.CalledProcessError as e: + raise RuntimeError("command '{}' return with error (code {}): {}".format(e.cmd, e.returncode, e.output)) + return (bashStdoutt) + +def download(URL): + ''' + Generic command to wget a URL and prints the command to STDOUT. + ''' + cmd = "curl -L -o AllG2P.csv %s" % URL + bash(cmd) + +def confidence_to_color(confidence): + """ Map a confidence string to an RGB color string for UCSC BED itemRgb.""" + color_map = { + "definitive": "39,103,73", # Dark-green + "strong": "56,161,105", # green + "moderate": "104,211,145", # light-green + "limited": "252,129,129", # pink + "disputed": "229,62,62", # red + "refuted": "155,44,44" # Dark-red + } + return color_map.get(confidence.lower(), "0,0,0") # default black + + +def load_g2p(file_path): + """Load G2P CSV into dict keyed by HGNC ID, plus list of missing IDs.""" + g2p_map = {} + numOfRows = 0 + with open(file_path, newline='', encoding='utf-8') as csvfile: + reader = csv.DictReader(csvfile) + for row in reader: + numOfRows += 1 + hgnc_id = row["hgnc id"].strip() + if hgnc_id not in g2p_map: + g2p_map[hgnc_id] = [] + g2p_map[hgnc_id].append(row) + print ("Number of rows in file: %s" % numOfRows) + return g2p_map + + +def load_coordinates(db,hgnc_ids): + """Creates a dictionary for the coordinates given a list of HGNC IDs.""" + coord_map = {} + hgncBB = "/gbdb/%s/hgnc/hgnc.bb" % db + for id in hgnc_ids: + if id not in coord_map: + hgnc_data = bash("bigBedNamedItems %s HGNC:%s stdout" % (hgncBB,id)).split('\n') + split_data = [line.split('\t') for line in hgnc_data if line.strip()] + final_data = [] + for row in split_data: + final_data.append(row[:8]) + coord_map[id] = final_data + return coord_map + + +def join_and_write(g2p_data, coords, output_file): + """Join G2P and coordinates into BED 8+19 format.""" + with open(output_file, "w", newline='', encoding="utf-8") as out: + writer = csv.writer(out, delimiter="\t") + for hgnc_id, rows in g2p_data.items(): # Rows is a list of dictionaries + for row in rows: + for coord in coords.get(hgnc_id): + # BED 9 fields + chrom = coord[0] + chromStart = coord[1] + chromEnd = coord[2] + name = row["gene symbol"] + score = coord[4] + strand = coord[5] + thickStart = coord[6] + thickEnd = coord[7] + rgb = confidence_to_color(row["confidence"]) + + # G2P 20 fields + g2p_id = row["g2p id"] + gene_mim = row["gene mim"] + hgnc_id_val = row["hgnc id"] + prev_symbols= row["previous gene symbols"].replace(';',',') + disease_name= row["disease name"] + disease_mim = row["disease mim"] + disease_MONDO = row["disease MONDO"] + allelic_req = row["allelic requirement"] + cross_mod = row["cross cutting modifier"] + confidence = row["confidence"] + var_conseq = row["variant consequence"] + var_types = row["variant types"] + mol_mech = row["molecular mechanism"] + mol_mech_cat= row["molecular mechanism categorisation"] + mol_mech_ev = row["molecular mechanism evidence"] + phenotypes = row["phenotypes"].replace(';',',') + publications= row["publications"].replace(';',',') + panel = row["panel"] + comments = row["comments"] + date_review = row["date of last review"] + + # Write BED 9 + 20 + writer.writerow([ + chrom, chromStart, chromEnd, name, score, strand, thickStart, thickEnd, + rgb, g2p_id, gene_mim, hgnc_id_val, prev_symbols, disease_name, disease_mim, + disease_MONDO, allelic_req, cross_mod, confidence, var_conseq, var_types, + mol_mech, mol_mech_cat, mol_mech_ev, phenotypes, publications, panel, + comments, date_review + ]) + + +if __name__ == "__main__": + # Databases in G2P + database = ["hg19", "hg38"] + # Download the CSV file from G2P + download("https://www.ebi.ac.uk/gene2phenotype/api/panel/all/download") + g2p_file = Path("AllG2P.csv") # Name of the file is in the curl command above + g2p_data = load_g2p(g2p_file) # Create a dictionary for each HGNC ID with the columns + hgnc_ids = g2p_data.keys() # HGNC IDs are stored as '36036' + print("Number of HGNC IDs found: %s" % len(hgnc_ids)) + # Get coordinates for hg38 and hg19 from the HGNC track + hg38_coordinates = load_coordinates("hg38",hgnc_ids) + hg19_coordinates = load_coordinates("hg19",hgnc_ids) + print ("Loaded %s hg38 and %s hg19 HGNC IDs" % (len(hg38_coordinates), len(hg19_coordinates))) + # For hg19 and hg38, go through coordinates and add the G2P data + for db, coords in zip(database, [hg19_coordinates, hg38_coordinates]): + outputFile = "%s_g2p_all.bed" % db + outputBigBed = "%s_g2p.bb" % db + chromSizes = "/gbdb/%s/%s.2bit" % (db,db) + join_and_write(g2p_data, coords, outputFile) + print("Output written to %s" % outputFile) + # Create the bigBeds + bash("bedToBigBed -type=bed9+20 -tab -sort -as=./g2p.as -sizesIs2Bit -extraIndex=name,g2p_id,gene_mim,hgnc_id %s %s %s" % (outputFile,chromSizes,outputBigBed)) + + +###################### +# End of Python script + +########################################### +# Create symlinks + +ln -S /hive/data/outside/g2p/hg38_g2p.bb /gbdb/hg38/g2p/g2p.bb +ln -S /hive/data/outside/g2p/hg19_g2p.bb /gbdb/hg19/g2p/g2p.bb