17b7d3c37be41135afaf8e91e365e3847af96ca5 lrnassar Mon Jun 22 10:56:56 2026 -0700 Add TAD (topologically associating domains) track set on hg19, hg38, mm10, mm39. refs #21599 New "tads" superTrack collecting published TAD calls, alpha-gated via include tad.ra alpha in each assembly's trackDb.ra. hg38 (all five sources): Dixon 2012 domains, Schmitt 2016 boundaries, McArthur & Capra 2021 boundary stability, ENCODE contact domains (faceted composite over 117 biosamples), and 3D Genome Browser 2.0 domains (faceted composite over 464 datasets). hg19: the three sources with hg19-compatible data (Dixon, Schmitt, McArthur). mm10/mm39 (domains only; the boundary sources have no mouse data): Dixon, ENCODE (faceted, 16 biosamples), and 3D Genome Browser (faceted, 30 datasets); mm39 lifted from mm10, lift noted in the long labels. Faceted composites are organ-colored from a TAD-owned organ_colors.json symlinked into /gbdb//bbi/tad/. Build scripts and autoSql are version-controlled under makeDb/scripts/tad/ and symlinked into the per-source build dirs. Provenance and fetch for every dataset are documented in the makedocs (doc/hg38/tad.txt, doc/mm10/tad.txt, doc/mm39/tad.txt, and the hg19 TAD section in doc/hg19.txt). diff --git src/hg/makeDb/scripts/tad/buildTads3dgb.py src/hg/makeDb/scripts/tad/buildTads3dgb.py new file mode 100644 index 00000000000..d0861fbd556 --- /dev/null +++ src/hg/makeDb/scripts/tad/buildTads3dgb.py @@ -0,0 +1,222 @@ +#!/usr/bin/env python3 +# Build the tads3dgb faceted bigBed composite: all 464 human hg38 3D Genome Browser +# TAD-domain datasets, one subtrack each, faceted by metadata, organ-colored. +# Literal data: only a format normalization (10-col 3DGB bed -> bed4). No re-calling, +# merging, lifting, or recurrence scoring. 
# refs #21599
#
# Reads:  3DGB source beds + datasets_api.json + classification.tsv (project dir)
#         organ_colors.json (TAD-owned /hive/data/outside/tad copy; has Colon/Bladder/Cervix added)
# Writes: build/<asm>/tads3dgb/<id>.bb, tads3dgb_metadata.tsv, tads3dgb.ra

import collections
import csv
import json
import os
import re
import subprocess
import sys

# Assembly-aware: `python3 buildTads3dgb.py [hg38|mm10]` (default hg38). hg38 iterates the human
# classification.tsv (which carries the curated Condition/Treatment/Provenance facets); mm10 iterates
# the mouse TAD-called datasets straight from the API (no curated facets -> those 3 are omitted).
# mm39 is produced by lifting the mm10 bigBeds (see liftTads3dgbMouse.sh).
ASM = sys.argv[1] if len(sys.argv) > 1 else "hg38"
APIJS = "/hive/users/lrnassar/claude/RM21599/3dgenome/datasets_api.json"
CLASS = "/hive/users/lrnassar/claude/RM21599/3dgenome/classification.tsv"
COLORS = "/hive/data/outside/tad/organ_colors.json"  # TAD-owned (symlinked into /gbdb/<asm>/bbi/tad/)

# Per-assembly inputs/outputs. `default_on` holds the dataset ids (as strings) whose
# subtracks are emitted as "on" in the generated trackDb stanza.
ASCFG = {
    "hg38": dict(beds="/hive/users/lrnassar/claude/RM21599/3dgenome/tad_beds",
                 chroms="/hive/data/genomes/hg38/chrom.sizes",
                 out="/hive/data/outside/tad/3dgenome/build/hg38", species="human",
                 default_on={"63", "67", "122", "103"}),  # GM12878, H1-ESC, IMR-90, HMEC
    "mm10": dict(beds="/hive/data/outside/tad/3dgenome/mouse/tad_beds",
                 chroms="/hive/data/genomes/mm10/chrom.sizes",
                 out="/hive/data/outside/tad/3dgenome/build/mm10", species="mouse",
                 default_on=set()),  # mouse ships all-off (per user)
}

# Fail with a usage message rather than a bare KeyError when the assembly argument is not
# one this script builds directly (e.g. mm39, which is lifted from mm10 elsewhere).
if ASM not in ASCFG:
    sys.exit("usage: buildTads3dgb.py [%s] (got %r)" % ("|".join(sorted(ASCFG)), ASM))

C = ASCFG[ASM]
BEDS = C["beds"]
CHROMS = C["chroms"]
OUT = C["out"]
SPECIES = C["species"]
DEFAULT_ON = C["default_on"]
HAS_CTP = (ASM == "hg38")  # Condition/Treatment/Provenance facets (human curation only)
GBDB = "/gbdb/%s/bbi/tad" % ASM
BBDIR = os.path.join(OUT, "tads3dgb")
TSV = os.path.join(OUT, "tads3dgb_metadata.tsv")
RA = os.path.join(OUT, "tads3dgb.ra")

# 3DGB organ value -> organ_colors.json key (case/vocabulary normalization).
# The 5 case-only fixes map to existing keys; Colon/Bladder/Cervix were added to the JSON.
ORGAN_NORM = {
    "Blood Vessel": "Blood vessel",
    "Lymphoid Tissue": "Lymphoid tissue",
    "Small Intestine": "Small intestine",
    "Adrenal Gland": "Adrenal gland",
    "Connective Tissue": "Connective tissue",
}


def hex2rgb(h):
    """Convert a 6-digit hex color (leading "#" optional) to an "R,G,B" decimal string."""
    h = h.lstrip("#")
    return "%d,%d,%d" % (int(h[0:2], 16), int(h[2:4], 16), int(h[4:6], 16))


def clean(s):
    """Stringify *s* and collapse every whitespace run (tabs included) to one space.

    Returns "" for None or whitespace-only input. Callers append `or <fallback>` so that a
    blank cell never reaches the metadata TSV (faceted JS throws on blank cells).
    """
    s = "" if s is None else str(s)
    s = re.sub(r"\s+", " ", s.replace("\t", " ")).strip()
    return s


# Connective words that must not be left dangling at the end of a truncated shortLabel.
STOP = {"the", "of", "a", "an", "in", "on", "to", "and", "for", "with", "at", "by"}


def shortlab(name, maxlen=22):
    """A concise, sensical shortLabel: underscores->spaces, truncate only at whole-word
    boundaries (never mid-word), and drop a trailing connective word/punctuation."""
    s = re.sub(r"\s+", " ", name.replace("_", " ")).strip()
    if " of " in s.lower():  # "A of B" -> organ-first "B A" (avoids dangling-preposition cuts)
        i = s.lower().find(" of ")
        head, tail = s[:i].strip(), s[i + 4:].strip()
        if head and tail:
            # Lower-case a plainly capitalized head word ("Cortex"), but leave
            # acronym-like heads ("HiC", "RAD21") alone.
            if head[:1].isupper() and head[1:2].islower():
                head = head[0].lower() + head[1:]
            s = tail + " " + head
    if len(s) > maxlen:
        out = ""
        for t in s.split(" "):
            if len((out + " " + t).strip()) <= maxlen:
                out = (out + " " + t).strip()
            else:
                break
        # A single word longer than maxlen is the only case that is cut mid-word.
        s = out if out else s[:maxlen]
    toks = s.split(" ")
    while len(toks) > 1 and toks[-1].lower().strip(",;-") in STOP:
        toks.pop()
    return " ".join(toks).rstrip(" ,;-")


def scap(s):
    """Sentence-case: capitalize the first letter only if the first word is all-lowercase
    (so acronyms/proper tokens like GM12878, Hi-C, RAD21 are preserved)."""
    if not s:
        return s
    first = s.split(" ", 1)[0]
    if first[:1].isalpha() and first == first.lower():
        return s[:1].upper() + s[1:]
    return s


# trackDb stanza for the faceted composite parent.
# Fields: dataset count, species, gbdb dir (metadata TSV), gbdb dir (organ colors).
_COMPOSITE_STANZA = """track tads3dgb
parent tads
priority 3
compositeTrack faceted
shortLabel 3D Genome Browser
longLabel TAD domains across %d %s Hi-C/Micro-C datasets (3D Genome Browser 2.0)
type bigBed 4
group regulation
visibility hide
metaDataUrl %s/tads3dgb_metadata.tsv
primaryKey DatasetId
colorSettingsUrl %s/organ_colors.json
maxCheckboxes 50
html tads3dgb

"""

# trackDb stanza for one subtrack.
# Fields: id, on/off, shortLabel, longLabel, gbdb dir, id, "R,G,B", mouseOver.
_SUBTRACK_STANZA = """    track tads3dgb_%s
    parent tads3dgb %s
    shortLabel %s
    longLabel %s
    type bigBed 4
    bigDataUrl %s/tads3dgb/%s.bb
    color %s
    visibility dense
    mouseOver %s

"""


def _load_json(path):
    """Parse the JSON file at *path*, closing the handle when done."""
    with open(path) as fh:
        return json.load(fh)


def _load_rows():
    """Return the per-dataset rows to iterate; each has at least "id" and "bed"."""
    if ASM == "hg38":
        with open(CLASS) as fh:
            return [{k: (v.strip() if v else "") for k, v in r.items()}
                    for r in csv.DictReader(fh, delimiter="\t")]
    # mm10: the iteration source is the set of mouse TAD-called datasets (one <id>.bed per
    # dataset in BEDS); all per-dataset metadata (year, refNo, organ, ...) comes from the API.
    have = sorted(int(f[:-4]) for f in os.listdir(BEDS) if f.endswith(".bed"))
    return [{"id": str(i), "bed": "%d.bed" % i} for i in have]


def _bed_to_bigbed(bed, label, tmp, bb):
    """Sort *bed*, rewrite it as bed4 named *label*, and run bedToBigBed into *bb*.

    Returns the bedToBigBed CompletedProcess (the caller inspects returncode/stderr).
    A failing `sort` still raises CalledProcessError, as before. The scratch file *tmp*
    is removed on every exit path.
    """
    # bedToBigBed expects chromosome names in byte order; force the C locale so the
    # ambient LANG/LC_COLLATE cannot reorder them.
    sorted_bed = subprocess.run(["sort", "-k1,1", "-k2,2n", bed],
                                stdout=subprocess.PIPE, check=True, text=True,
                                env=dict(os.environ, LC_ALL="C"))
    try:
        with open(tmp, "w") as out:
            for line in sorted_bed.stdout.splitlines():
                f = line.split("\t")
                if len(f) < 3:  # not a chrom/start/end record
                    continue
                out.write("%s\t%s\t%s\t%s\n" % (f[0], f[1], f[2], label))
        # -tab: the name column contains spaces.
        return subprocess.run(["bedToBigBed", "-type=bed4", "-tab", tmp, CHROMS, bb],
                              stderr=subprocess.PIPE, text=True)
    finally:
        if os.path.exists(tmp):
            os.remove(tmp)


def main():
    """Build one bigBed per dataset, then write the metadata TSV and the trackDb .ra.

    Datasets whose bed is missing or whose bedToBigBed run fails are reported on stderr
    and left out of both outputs. Exits with an error if nothing converted.
    """
    os.makedirs(BBDIR, exist_ok=True)

    api = {str(r["id"]): r for r in _load_json(APIJS)}
    colors = _load_json(COLORS)["Organ"]
    rows = _load_rows()

    meta = []  # (id, dict of tsv cols, color rgb, shortLabel, longLabel, mouseOver)
    missing_color = collections.Counter()
    fail = []

    for r in rows:
        did = r["id"]
        bed = os.path.join(BEDS, r["bed"])
        if not os.path.exists(bed):
            fail.append((did, "missing bed"))
            continue
        a = api.get(did, {})

        organ_raw = clean(a.get("organ")) or "Unknown"
        organ = ORGAN_NORM.get(organ_raw, organ_raw)
        if organ not in colors:
            missing_color[organ] += 1
        rgb = hex2rgb(colors.get(organ, "#000000"))  # black when the organ has no color key

        name = clean(a.get("name")) or ("dataset " + did)
        cell = scap(clean(a.get("cellType"))) or "(unspecified)"
        assay = scap(clean(a.get("dataType"))) or "Unknown"
        cond = {"normal": "Normal", "cancer": "Cancer"}.get(r.get("normal_cancer", ""), "Unknown")
        treat = {"baseline": "Baseline", "pert": "Perturbation"}.get(r.get("baseline_pert", ""), "Unknown")
        prov = {"have": "Also in another UCSC track", "novel": "Novel to browser"}.get(r.get("have_novel", ""), "Unknown")
        yr = r.get("year") or a.get("year", "")  # classification (human) or API (mouse)
        try:
            year = str(int(float(yr))) if yr else "Unknown"
        except ValueError:
            year = "Unknown"
        study = clean(r.get("refNo") or a.get("refNo")) or "Unknown"
        desc = clean(a.get("description")) or name

        # display name: underscores -> spaces, sentence-cased (logic still keys on id/name)
        dname = scap(name.replace("_", " "))
        short = scap(shortlab(name))
        long = "%s TAD domains (%s, %s, %s)" % (dname, organ, assay, study)
        mouse = "3DGB TAD domain: %s (%s, %s)" % (dname, organ, assay)

        # bed4: format-only. sort -k1,1 -k2,2n, then chrom/start/end + display name.
        bb = os.path.join(BBDIR, did + ".bb")
        tmp = os.path.join(BBDIR, did + ".bed4")
        rc = _bed_to_bigbed(bed, dname, tmp, bb)
        if rc.returncode != 0:
            fail.append((did, rc.stderr.strip()))
            continue

        od = [("DatasetId", did), ("Organ", organ), ("Cell_type", cell), ("Assay", assay)]
        if HAS_CTP:  # human-only curated facets
            od += [("Condition", cond), ("Treatment", treat), ("Provenance", prov)]
        od += [("Year", year), ("Study", study), ("_Description", desc)]
        meta.append((did, collections.OrderedDict(od), rgb, short, long, mouse))

    if fail:
        print("FAILURES (%d):" % len(fail), file=sys.stderr)
        for d, e in fail[:20]:  # cap the listing; the count above is the full total
            print(" ", d, e, file=sys.stderr)
    if missing_color:
        print("ORGANS WITHOUT COLOR KEY:", dict(missing_color), file=sys.stderr)
    print("converted %d / %d datasets" % (len(meta), len(rows)))

    # The column list below is taken from the first converted dataset, so there must be one.
    if not meta:
        sys.exit("no datasets converted; not writing %s or %s" % (TSV, RA))

    # metadata TSV (tab-sep; primaryKey + facet cols + hidden _Description)
    cols = list(meta[0][1].keys())
    with open(TSV, "w") as fh:
        fh.write("\t".join(cols) + "\n")
        for _, m, *_ in meta:
            fh.write("\t".join(m[c] for c in cols) + "\n")

    # faceted composite stanza, then one subtrack stanza per converted dataset
    with open(RA, "w") as fh:
        fh.write(_COMPOSITE_STANZA % (len(meta), SPECIES, GBDB, GBDB))
        for did, m, rgb, short, long, mouse in meta:
            onoff = "on" if did in DEFAULT_ON else "off"
            fh.write(_SUBTRACK_STANZA % (did, onoff, short, long, GBDB, did, rgb, mouse))

    print("wrote %d subtracks -> %s" % (len(meta), RA))
    print("metadata -> %s" % TSV)


if __name__ == "__main__":
    main()