src/hg/makeDb/scripts/tad/buildTadsEncode.py 17b7d3c37be41135afaf8e91e365e3847af96ca5

17b7d3c37be41135afaf8e91e365e3847af96ca5
lrnassar
  Mon Jun 22 10:56:56 2026 -0700
Add TAD (topologically associating domains) track set on hg19, hg38, mm10, mm39. refs #21599

New "tads" superTrack collecting published TAD calls, alpha-gated via include tad.ra
alpha in each assembly's trackDb.ra.

hg38 (all five sources): Dixon 2012 domains, Schmitt 2016 boundaries, McArthur & Capra
2021 boundary stability, ENCODE contact domains (faceted composite over 117 biosamples),
and 3D Genome Browser 2.0 domains (faceted composite over 464 datasets).
hg19: the three sources with hg19-compatible data (Dixon, Schmitt, McArthur).
mm10/mm39 (domains only; the boundary sources have no mouse data): Dixon, ENCODE
(faceted, 16 biosamples), and 3D Genome Browser (faceted, 30 datasets); mm39 lifted
from mm10, lift noted in the long labels.

Faceted composites are organ-colored from a TAD-owned organ_colors.json symlinked into
/gbdb/<asm>/bbi/tad/. Build scripts and autoSql are version-controlled under
makeDb/scripts/tad/ and symlinked into the per-source build dirs. Provenance and fetch
for every dataset are documented in the makedocs (doc/hg38/tad.txt, doc/mm10/tad.txt,
doc/mm39/tad.txt, and the hg19 TAD section in doc/hg19.txt).

diff --git src/hg/makeDb/scripts/tad/buildTadsEncode.py src/hg/makeDb/scripts/tad/buildTadsEncode.py
new file mode 100644
index 00000000000..a75ad07a72b
--- /dev/null
+++ src/hg/makeDb/scripts/tad/buildTadsEncode.py
@@ -0,0 +1,316 @@
+#!/usr/bin/env python3
+# Rebuild tadsEncode as a faceted bigBed composite across ALL 117 single-biosample
+# ENCODE contact-domain datasets (replaces the old 14-subtrack subGroup composite).
+# refs #21599
+#
+# ENCODE's contact-domain files are heterogeneous: the canonical product is the
+# 16-column Juicer/Arrowhead "blocks" bedpe (chr1 x1 x2 chr2 y1 y2 name score
+# strand1 strand2 color cornerScore uVarScore lVarScore upSign loSign), but the
+# pool also contains 12/11-col bedpe variants, 24-col loop files (different anchors,
+# no "chr" prefix), and 5-col hg19-lifted bed. We therefore tier the files and use
+# the best available per biosample:
+#   tier 16 : 16-col bedpe (full Arrowhead scores)            -> 111 biosamples
+#   tier 12 : 12-col bedpe (cornerScore only; others 0)       -> A549
+#   tier 11 : 11-col bedpe (no scores; all 0)
+#   tier  5 : hg19-lifted bed5 (no scores; all 0)             -> 5 biosamples
+# Loop files (anchors differ, x!=y) and non-hg38 chroms are dropped by the parser.
+#
+# Per biosample: pick one representative experiment (best file tier; preferred_default;
+# assay priority intact>in situ>dilution; then total size). Pool that experiment's
+# files of the chosen tier (union; drop a domain whose endpoints both fall within one
+# 5 kb bin of a higher-cornerScore domain). Write bigBed 4+5 (tadDomainEncode).
+# Conversion verified vs the retired tadsEncodeGM12878.bb (scores taken verbatim;
+# duplicate resolution keeps the higher cornerScore).
+
+import json, os, re, gzip, subprocess, sys
+from collections import defaultdict, Counter
+
+# Assembly-aware: `python3 buildTadsEncode.py [hg38|mm10]` (default hg38, the original human build).
+# mm10 is the native mouse build; mm39 is produced by lifting the mm10 bigBeds (see liftEncodeMouse.sh).
+ASM = sys.argv[1] if len(sys.argv) > 1 else "hg38"
+_HUMAN = "/hive/users/lrnassar/claude/RM21599/encode"
+_MOUSE = "/hive/data/outside/tad/encode/mouse"
+CFG = {
+ "hg38": dict(src=_HUMAN+"/contact_domains",
+              manif="/hive/data/outside/tad/encode/source/_cd_select.json",
+              meta="/hive/data/outside/tad/encode/source/encode_meta_all.json",
+              pert="/hive/data/outside/tad/encode/source/encode_perturbed.json",
+              chroms="/hive/data/genomes/hg38/chrom.sizes", out="/hive/data/outside/tad/encode/build/hg38",
+              species="human",
+              default_on={"GM12878","K562","HepG2","HCT116","IMR-90","A549","HL-60/S4",
+                "heart left ventricle","heart right ventricle","dorsolateral prefrontal cortex",
+                "ovary","pancreas","transverse colon","motor neuron"}),
+ "mm10": dict(src=_MOUSE+"/contact_domains", manif=_MOUSE+"/source/_cd_select.json",
+              meta=_MOUSE+"/source/encode_meta_all.json", pert=_MOUSE+"/source/encode_perturbed.json",
+              chroms="/hive/data/genomes/mm10/chrom.sizes", out="/hive/data/outside/tad/encode/build/mm10",
+              species="mouse",
+              default_on={"mouse embryonic stem cell","CH12.LX","heart","left cerebral cortex"}),
+}
+C = CFG[ASM]
+SRC=C["src"]; MANIF=C["manif"]; META=C["meta"]; PERT=C["pert"]; CHROMS=C["chroms"]
+SPECIES=C["species"]; DEFAULT_ON=C["default_on"]
+COLORS = "/hive/data/outside/tad/organ_colors.json"   # TAD-owned (symlinked into /gbdb/<asm>/bbi/tad/)
+ASFILE = "/hive/data/outside/tad/tadDomainEncode.as"
+GBDB   = "/gbdb/%s/bbi/tad" % ASM
+OUT    = C["out"]
+BBDIR  = os.path.join(OUT, "tadsEncode")
+TSV    = os.path.join(OUT, "tadsEncode_metadata.tsv")
+RA     = os.path.join(OUT, "tadsEncode.ra")
+BIN    = 5000
+
+ASSAY_PRIORITY = {"intact Hi-C":0, "in situ Hi-C":1, "dilution Hi-C":2}
+
+SLIM2KEY = {
+ 'bodily fluid':'Blood','blood':'Blood','epithelium':'Epithelium',
+ 'heart':'Heart','pericardium':'Heart','lung':'Lung',
+ 'vasculature':'Blood vessel','blood vessel':'Blood vessel',
+ 'arterial blood vessel':'Blood vessel','vein':'Blood vessel',
+ 'colon':'Colon','large intestine':'Large intestine','intestine':'Large intestine',
+ 'brain':'Brain','spinal cord':'Spinal cord','nerve':'Nerve','eye':'Eye',
+ 'musculature of body':'Muscle','skin of body':'Skin','limb':'Limb',
+ 'mammary gland':'Breast','breast':'Breast',
+ 'embryo':'Embryo','extraembryonic component':'Placenta','placenta':'Placenta',
+ 'kidney':'Kidney','liver':'Liver','pancreas':'Pancreas','esophagus':'Esophagus','stomach':'Stomach',
+ 'connective tissue':'Connective tissue','bone marrow':'Bone marrow','bone element':'Bone',
+ 'prostate gland':'Prostate','uterus':'Uterus','vagina':'Vagina','ovary':'Ovary','testis':'Testis',
+ 'adrenal gland':'Adrenal gland','thyroid gland':'Thyroid','spleen':'Spleen',
+ 'immune organ':'Lymphoid tissue','lymph node':'Lymphoid tissue',
+ 'exocrine gland':'Epithelium','endocrine gland':'Epithelium','gonad':'Epithelium',
+}
+PRIORITY = ['Heart','Lung','Brain','Liver','Kidney','Pancreas','Prostate','Breast','Ovary','Testis',
+ 'Uterus','Vagina','Cervix','Stomach','Esophagus','Colon','Large intestine','Small intestine',
+ 'Spleen','Thyroid','Adrenal gland','Eye','Nerve','Spinal cord','Skin','Blood vessel','Muscle',
+ 'Bone marrow','Bone','Connective tissue','Placenta','Limb','Embryo','Lymphoid tissue',
+ 'Blood','Epithelium']
+PRANK = {k:i for i,k in enumerate(PRIORITY)}
+ORGAN_OVERRIDE = {'GM11168':'Blood','Calu3':'Lung','Ramos':'Blood',
+                  'activated CD8-positive, alpha-beta T cell':'Blood',
+                  # mouse biosamples with empty organ_slims (else fall back to Epithelium):
+                  'CH12F3':'Blood','v-Abl transformed pro-B cells':'Blood',
+                  'mouse trophoblast stem cell':'Placenta'}
+CLASS_LABEL = {'tissue':'Tissue','cell line':'Cell line','primary cell':'Primary cell',
+               'in vitro differentiated cells':'In vitro differentiated'}
+
+def pick_organ(slims, term):
+    keys=[SLIM2KEY[s] for s in slims if s in SLIM2KEY]
+    return ORGAN_OVERRIDE.get(term,'Epithelium') if not keys else min(keys,key=lambda k:PRANK.get(k,999))
+
+def clean(s):
+    s="" if s is None else str(s); return re.sub(r"\s+"," ",s.replace("\t"," ")).strip()
+def scap(s):
+    """Sentence-case: capitalize the first letter only if the first word is all-lowercase
+    (so acronyms/proper tokens like hTERT, GM12878, CD4+, Hi-C are preserved)."""
+    if not s: return s
+    first=s.split(" ",1)[0]
+    if first[:1].isalpha() and first==first.lower(): return s[:1].upper()+s[1:]
+    return s
+def hex2rgb(h):
+    h=h.lstrip("#"); return "%d,%d,%d"%(int(h[0:2],16),int(h[2:4],16),int(h[4:6],16))
+
+# ASCII-safe abbreviations for long ENCODE biosample names, applied before truncation
+ENC_ABBR=[(r'CD4-positive','CD4+'),(r'CD8-positive','CD8+'),(r'CD34-positive','CD34+'),
+          (r'CD14-positive','CD14+'),(r'thymus-derived ',''),(r', alpha-beta',''),
+          (r' myocardium',''),(r'-positive','+'),(r'-negative','-')]
+STOP={"the","of","a","an","in","on","to","and","for","with","at","by"}
+def shortlab(name, maxlen=22, abbr=None):
+    """Concise, sensical shortLabel: abbreviate, truncate only at whole-word boundaries
+    (never mid-word), drop a trailing connective word/punctuation."""
+    s=name.replace("_"," ")
+    for pat,rep in (abbr or []): s=re.sub(pat,rep,s)
+    s=re.sub(r"\s+"," ",s).strip()
+    if " of " in s.lower():        # "A of B" -> organ-first "B A" (avoids dangling-preposition cuts)
+        i=s.lower().find(" of "); head,tail=s[:i].strip(),s[i+4:].strip()
+        if head and tail:
+            if head[:1].isupper() and head[1:2].islower(): head=head[0].lower()+head[1:]
+            s=tail+" "+head
+    if len(s)>maxlen:
+        out=""
+        for t in s.split(" "):
+            if len((out+" "+t).strip())<=maxlen: out=(out+" "+t).strip()
+            else: break
+        s=out if out else s[:maxlen]
+    toks=s.split(" ")
+    while len(toks)>1 and toks[-1].lower().strip(",;-") in STOP: toks.pop()
+    return " ".join(toks).rstrip(" ,;-")
+
+CHROMSZ={l.split('\t')[0]: int(l.split('\t')[1]) for l in open(CHROMS)}
+
+def file_tier(acc, fmt):
+    """First domain row's column count -> tier; 0 if not usable."""
+    p=os.path.join(SRC, acc+('.bedpe.gz' if fmt=='bedpe' else '.bed.gz'))
+    if not os.path.exists(p): return 0,None
+    with gzip.open(p,'rt') as fh:
+        for line in fh:
+            if line.startswith('#'): continue
+            c=line.rstrip('\n').split('\t')
+            if len(c)>1 and c[1]=='x1': continue
+            n=len(c)
+            if fmt=='bed' and n>=5: return 5,p
+            # exact column counts only: 16-col Arrowhead blocks (full scores), 12/11-col
+            # variants. 24-col files are loops (different anchors) -> tier 0, excluded.
+            if fmt=='bedpe' and n==16: return 16,p
+            if fmt=='bedpe' and n==12: return 12,p
+            if fmt=='bedpe' and n==11: return 11,p
+            return 0,p
+    return 0,p
+
+def parse_domains(path, fmt, tier, name):
+    """Yield bed4+5 rows: domain intervals only, chrom-normalized, scores per tier."""
+    out=[]
+    with gzip.open(path,'rt') as fh:
+        for line in fh:
+            if line.startswith('#'): continue
+            c=line.rstrip('\n').split('\t')
+            if len(c)<3 or c[1]=='x1': continue
+            ch=c[0] if c[0].startswith('chr') else 'chr'+c[0]
+            if ch not in CHROMSZ: continue
+            if fmt=='bedpe':
+                if len(c)<6: continue
+                # domain row: same chrom both anchors, same interval (x==y); else it's a loop
+                if c[1]!=c[4] or c[2]!=c[5]: continue
+                s,e=c[1],c[2]
+                if tier==16: sc=[c[11],c[12],c[13],c[14],c[15]]
+                elif tier==12: sc=[c[11],"0","0","0","0"]
+                else: sc=["0","0","0","0","0"]
+            else:  # bed5
+                s,e=c[1],c[2]; sc=["0","0","0","0","0"]
+            si,ei=int(s),int(e)
+            if si<0 or ei>CHROMSZ[ch] or si>=ei: continue   # drop out-of-bounds/degenerate
+            out.append([ch,si,ei,name]+sc+[float(sc[0])])
+    return out
+
+def main():
+    os.makedirs(BBDIR, exist_ok=True)
+    recs=json.load(open(MANIF)); recs=recs if isinstance(recs,list) else recs.get("@graph",recs)
+    meta=json.load(open(META)); colors=json.load(open(COLORS))["Organ"]
+    pert=json.load(open(PERT))   # experiment accession -> {perturbed, summary}
+
+    bybs=defaultdict(lambda: defaultdict(list))
+    for r in recs:
+        bo=r.get("biosample_ontology")
+        if not isinstance(bo,dict): continue
+        if "/aggregate" in str(r.get("dataset","")): continue
+        bybs[bo["term_name"]][r["dataset"]].append(r)
+
+    # annotate each file with its tier + path (cache)
+    for term,exps in bybs.items():
+        for ds,fs in exps.items():
+            for f in fs:
+                f["_tier"],f["_path"]=file_tier(f["accession"],f["file_format"])
+
+    def exp_rank(ds, efiles):
+        besttier=max((f["_tier"] for f in efiles), default=0)
+        encsr=ds.strip("/").split("/")[-1]
+        perturbed=1 if pert.get(encsr,{}).get("perturbed") else 0  # prefer unperturbed
+        has_pref=any(f.get("preferred_default") for f in efiles)
+        assay=min((ASSAY_PRIORITY.get(f.get("assay_title"),9) for f in efiles), default=9)
+        size=sum(f.get("file_size",0) for f in efiles if f["_tier"]==besttier)
+        return (-besttier, perturbed, 0 if has_pref else 1, assay, -size)
+
+    subtracks=[]; fail=[]
+    for term,exps in bybs.items():
+        exp,efiles=min(exps.items(), key=lambda kv:(exp_rank(kv[0],kv[1]), kv[0]))
+        encsr=exp.strip("/").split("/")[-1]
+        tier=max(f["_tier"] for f in efiles)
+        if tier==0: fail.append((term,"no usable file")); continue
+        use=[f for f in efiles if f["_tier"]==tier]
+        fmt=use[0]["file_format"]
+        assay=scap(sorted(use,key=lambda f:ASSAY_PRIORITY.get(f.get("assay_title"),9))[0].get("assay_title"))
+        dterm=scap(term)   # sentence-cased display name (term kept original for logic/lookups)
+
+        rows=[]
+        for f in use:
+            rows+=parse_domains(f["_path"], fmt, tier, dterm)
+        if not rows: fail.append((term,"no domain rows")); continue
+
+        kept=[]; bychrom=defaultdict(list)
+        for r in rows: bychrom[r[0]].append(r)
+        for ch,rs in bychrom.items():
+            rs.sort(key=lambda r:-r[-1]); acc=[]
+            for r in rs:
+                if any(abs(r[1]-k[1])<=BIN and abs(r[2]-k[2])<=BIN for k in acc): continue
+                acc.append(r)
+            kept.extend(acc)
+
+        symbol=re.sub("[^A-Za-z0-9]","",term)
+        m=meta.get(encsr,{})
+        organ=pick_organ(m.get("organ_slims",[]),term)
+        if organ not in colors: organ="Epithelium"
+        rgb=hex2rgb(colors[organ])
+        btype=CLASS_LABEL.get(m.get("classification",""),"Unknown")
+        ls=m.get("life_stage",[]); life="Unknown" if not ls else (ls[0].capitalize() if len(ls)==1 else "Mixed")
+        lifted=(tier==5)
+        calls="Lifted from hg19" if lifted else "Arrowhead (%s)"%ASM
+        summary=clean(m.get("summary")) or term
+
+        bb=os.path.join(BBDIR,symbol+".bb"); tmp=os.path.join(BBDIR,symbol+".bed")
+        kept.sort(key=lambda r:(r[0],r[1],r[2]))
+        with open(tmp,"w") as o:
+            for r in kept: o.write("\t".join([r[0],str(r[1]),str(r[2]),r[3],r[4],r[5],r[6],r[7],r[8]])+"\n")
+        rc=subprocess.run(["bedToBigBed","-type=bed4+5","-tab","-as="+ASFILE,tmp,CHROMS,bb],
+                          stderr=subprocess.PIPE,text=True)
+        os.remove(tmp)
+        if rc.returncode!=0: fail.append((term,rc.stderr.strip())); continue
+
+        subtracks.append({"symbol":symbol,"term":term,"dterm":dterm,"organ":organ,"rgb":rgb,"encsr":encsr,
+            "assay":assay,"btype":btype,"life":life,"calls":calls,
+            "summary":summary,"lifted":lifted,"tier":tier,"n":len(kept),
+            "on":("on" if term in DEFAULT_ON else "off")})
+
+    if fail:
+        print("FAILURES (%d):"%len(fail),file=sys.stderr)
+        for t,e in fail: print("  ",t,e,file=sys.stderr)
+    print("built %d / %d biosamples"%(len(subtracks),len(bybs)))
+    print("default-on:",sum(1 for s in subtracks if s["on"]=="on"))
+    print("tiers:",Counter(s["tier"] for s in subtracks))
+
+    # primaryKey = the representative ENCODE experiment accession (linked via subtrackUrls,
+    # like the wgEncodeReg4 reference). _Biosample (readable name) and _Description (the full
+    # ENCODE biosample summary) are shown in the metadata table but excluded from faceting.
+    cols=["Accession","_Biosample","Organ","Biosample_type","Assay","Life_stage","Calls","_Description"]
+    with open(TSV,"w") as fh:
+        fh.write("\t".join(cols)+"\n")
+        for s in sorted(subtracks,key=lambda s:s["encsr"]):
+            fh.write("\t".join([s["encsr"],s["dterm"],s["organ"],s["btype"],s["assay"],
+                                s["life"],s["calls"],s["summary"]])+"\n")
+
+    with open(RA,"w") as fh:
+        fh.write(
+"""track tadsEncode
+parent tads
+priority 2
+compositeTrack faceted
+shortLabel ENCODE TADs
+longLabel ENCODE contact domains (TADs) across %d %s biosamples (Arrowhead/Hi-C)
+type bigBed 4 + 5
+group regulation
+visibility pack
+metaDataUrl %s/tadsEncode_metadata.tsv
+primaryKey Accession
+colorSettingsUrl %s/organ_colors.json
+subtrackUrls Accession=https://www.encodeproject.org/experiments/$$/
+maxCheckboxes 50
+html tadsEncode
+
+""" % (len(subtracks), SPECIES, GBDB, GBDB))
+        for s in sorted(subtracks,key=lambda s:(s["on"]!="on", s["term"].lower())):
+            short=scap(shortlab(s["term"],22,ENC_ABBR))
+            long="ENCODE TADs in %s (%s)"%(s["dterm"],s["encsr"])
+            mouse=("Biosample: %s | TAD lifted from hg19 (no Arrowhead score)"%s["dterm"] if s["lifted"]
+                   else "Biosample: %s | Arrowhead corner score: $cornerScore"%s["dterm"])
+            fh.write(
+"""    track tadsEncode_%s
+    parent tadsEncode %s
+    shortLabel %s
+    longLabel %s
+    type bigBed 4 + 5
+    bigDataUrl %s/tadsEncode/%s.bb
+    color %s
+    visibility dense
+    mouseOver %s
+
+"""%(s["encsr"],s["on"],short,long,GBDB,s["symbol"],s["rgb"],mouse))
+    print("wrote %d subtracks -> %s"%(len(subtracks),RA))
+
+if __name__=="__main__":
+    main()