66604eefdbb0971ed43ee81bd6f18cc3a409e304 markd Sat Jan 10 13:18:44 2026 -0800 color-code clsLongReadRna modles (#36908) diff --git src/hg/makeDb/outside/clsLongReadRna/clsTrackTool src/hg/makeDb/outside/clsLongReadRna/clsTrackTool index 7daf3633cc9..88a4ae34e36 100755 --- src/hg/makeDb/outside/clsLongReadRna/clsTrackTool +++ src/hg/makeDb/outside/clsLongReadRna/clsTrackTool @@ -1,53 +1,56 @@ #!/usr/bin/env python3 import sys import os.path as osp import re import argparse import glob -from collections import namedtuple, defaultdict +from collections import namedtuple, defaultdict, Counter + ### # DO NOT CODE REVIEW THIS. IT WILL NEVER BE USED AGAIN. SPEND OUR FUNDING ON # SOMETHING IMPORTANT OR JUST REVIEW THE TRACKS. ### sys.path.insert(0, '/hive/groups/browser/pycbio/lib') from pycbio.sys import cmdOps, fileOps from pycbio.tsv import TsvReader +from pycbio.hgdata.autoSql import strArraySplit from pycbio.hgdata.bed import Bed, BedReader from pycbio.hgdata.autoSql import strArrayJoin def parseArgs(): desc = """convert CLS BED and attributes to a BED with attributes. Input files create with: zcat Hv3_masterTable_refined.gtf.gz | gffread -F | gff3ToGenePred -attrsOut=Hv3_masterTable_refined.attrs.tab stdin stdout |\\ genePredToBed stdin stdout | tawk '{$7=$2; print}' | sort -k1,1 -k2,2n |gzip -9 > Hv3_masterTable_refined.bed.gz CLS metadata documented at: https://github.com/guigolab/gencode-cls-master-table other info: https://github.com/guigolab/CLS3_GENCODE """ parser = argparse.ArgumentParser(description=desc) subparsers = parser.add_subparsers(dest="command", required=True) # make-beds subcommand beds_parser = subparsers.add_parser("make-beds") beds_parser.add_argument("clsMetadataTab") beds_parser.add_argument("masterBedFile") beds_parser.add_argument("transcriptMetaTsv") + beds_parser.add_argument("clsGencodeAssocTsv") beds_parser.add_argument("trackDir") # make-trackdb subcommand trackdb_parser = subparsers.add_parser("make-trackdb") trackdb_parser.add_argument("--parent") trackdb_parser.add_argument("clsMetadataTab") trackdb_parser.add_argument("trackDir") trackdb_parser.add_argument("trackDb") # check-bams subcommand check_parser = subparsers.add_parser("check-bams") check_parser.add_argument("clsMetadataTab") check_parser.add_argument("trackDir") return cmdOps.parse(parser) @@ -242,87 +245,158 @@ return clsMetaToSampleDesc(clsMeta, short) + " " + platform + " " + capture ## # GTF meta ## def parseSidList(sids): return tuple(sorted(sids.split(','))) def loadTranscriptMeta(transcriptMetaTsv): # transcriptId samplesMetadata artifact typeMap = { "samplesMetadata": parseSidList } return {row.transcriptId: row for row in TsvReader(transcriptMetaTsv, typeMap=typeMap)} +## +# CLS to GENCODE +## +class ClsGencodeAssoc: + def __init__(self): + self.by_anchTM = dict() + + def add(self, row, modelRefCounts): + # Some anchTM are duplicated in the give row, + # so sort to remove them. + for anchTM in sorted(set(row.CLS3_anchTM)): + self.by_anchTM[anchTM] = row + modelRefCounts[anchTM] += 1 + + def get(self, anchTM): + return self.by_anchTM.get(anchTM) + +def reportDupModelRefs(modelRefCounts): + multiCnt = 0 + for cnt in modelRefCounts.values(): + if cnt > 1: + multiCnt += 1 + if multiCnt > 0: + print(f"Note: {multiCnt} or {len(modelRefCounts)} models are mapped to multiple GENCODE transcripts", + file=sys.stderr) + + +def loadClsGencodeAssoc(clsGencodeAssocTsv): + clsGencodeAssoc = ClsGencodeAssoc() + # map to common column names + columnNameMap = { + "geneID_v47": "geneID", + "transcriptID_v47": "transcriptID", + "v47_biotype": "biotype", + "geneID_vM36": "geneID", + "transcriptID_vM36": "transcriptID", + "vM36_biotype": "biotype", + } + + typeMap = {"CLS3_anchTM": strArraySplit} + + # a small number are in multiple times, which we report, + # but only track one + modelRefCounts = Counter() + for row in TsvReader(clsGencodeAssocTsv, typeMap=typeMap, + columnNameMapper=lambda c: columnNameMap.get(c, c)): + clsGencodeAssoc.add(row, modelRefCounts) + reportDupModelRefs(modelRefCounts) + return clsGencodeAssoc ## # BED generation ## +colorMap = {"protein_coding": "12,12,120", + "lncRNA": "0,100,0", + "transcribed_unprocessed_pseudogene": "255,51,255", + "TEC": "254,0,0", + "": "255,160,122"} + + +def getBiotypeColor(gencodeBioType): + return colorMap[gencodeBioType] + def allModelsFile(trackDir, ext): return osp.join(trackDir, "cls-models" + ext) -def makeTranscriptBed(bed, clsMetaBySid, transcriptMeta, allBeds, bedsBySid, - bedsByStageTissue): +def updateTranscriptBed(bed, clsMetaBySid, transcriptMeta, clsGencodeAssoc, + allBeds, bedsBySid, bedsByStageTissue): + # must match clsModelBed.as sids = transcriptMeta.samplesMetadata sampleDesc = [clsMetaToDesc(clsMetaBySid[sid]) for sid in sids] - bed.extraCols = [len(sids), strArrayJoin(sampleDesc), strArrayJoin(sids)] - + gencodeGeneId = gencodeTranscriptId = gencodeBioType = "" + gencodeMeta = clsGencodeAssoc.get(bed.name) + if gencodeMeta is not None: + gencodeGeneId = gencodeMeta.geneID + gencodeTranscriptId = gencodeMeta.transcriptID + gencodeBioType = gencodeMeta.biotype + + inGencode = "yes" if gencodeGeneId != "" else "no" + + bed.extraCols = [len(sids), strArrayJoin(sampleDesc), strArrayJoin(sids), + inGencode, gencodeGeneId, gencodeTranscriptId, gencodeBioType] + bed.itemRgb = getBiotypeColor(gencodeBioType) allBeds.append(bed) for sid in sids: bedsBySid[sid].append(bed) stageTissues = set([stageTissueName(clsMetaBySid[sid]) for sid in sids]) for stageTissue in stageTissues: bedsByStageTissue[stageTissue].append(bed) -def processTranscript(bed, clsMetaBySid, transcriptMetas, allBeds, - bedsBySid, bedsByStageTissue): +def processTranscript(bed, clsMetaBySid, transcriptMetas, clsGencodeAssoc, + allBeds, bedsBySid, bedsByStageTissue): transcriptMeta = transcriptMetas[bed.name] if transcriptMeta.artifact == 'no': - makeTranscriptBed(bed, clsMetaBySid, transcriptMeta, allBeds, bedsBySid, - bedsByStageTissue) + updateTranscriptBed(bed, clsMetaBySid, transcriptMeta, clsGencodeAssoc, + allBeds, bedsBySid, bedsByStageTissue) -def buildBeds(masterBedFile, clsMetaBySid, transcriptMetas): +def buildBeds(masterBedFile, clsMetaBySid, transcriptMetas, clsGencodeAssoc): allBeds = [] bedsBySid = defaultdict(list) bedsByStageTissue = defaultdict(list) for bed in BedReader(masterBedFile): - processTranscript(bed, clsMetaBySid, transcriptMetas, allBeds, bedsBySid, - bedsByStageTissue) + processTranscript(bed, clsMetaBySid, transcriptMetas, clsGencodeAssoc, + allBeds, bedsBySid, bedsByStageTissue) return allBeds, bedsBySid, bedsByStageTissue def write_bed_file(beds, bedfile): beds.sort(key=Bed.genome_sort_key) fileOps.ensureFileDir(bedfile) with fileOps.opengz(bedfile, 'w') as fh: for bed in beds: bed.write(fh) def write_bed_files(clsMetaBySid, allBeds, bedsBySid, bedsByStageTissue, trackDir): write_bed_file(allBeds, allModelsFile(trackDir, '.bed.gz')) for sid, beds in bedsBySid.items(): write_bed_file(beds, osp.join(trackDir, sampleIdToBedPath(clsMetaBySid[sid]))) for stageTissue, beds in bedsByStageTissue.items(): write_bed_file(beds, osp.join(trackDir, stageTissueToBedPath(stageTissue))) -def makeBeds(opts, clsMetadataTab, masterBedFile, transcriptMetaTsv, trackDir): +def makeBeds(opts, clsMetadataTab, masterBedFile, transcriptMetaTsv, clsGencodeAssocTsv, trackDir): clsMetaBySid = loadClsMetadata(clsMetadataTab) transcriptMetas = loadTranscriptMeta(transcriptMetaTsv) - allBeds, bedsBySid, bedsByStageTissue = buildBeds(masterBedFile, clsMetaBySid, transcriptMetas) + clsGencodeAssoc = loadClsGencodeAssoc(clsGencodeAssocTsv) + allBeds, bedsBySid, bedsByStageTissue = buildBeds(masterBedFile, clsMetaBySid, transcriptMetas, clsGencodeAssoc) write_bed_files(clsMetaBySid, allBeds, bedsBySid, bedsByStageTissue, trackDir) ## # trackDb generate ## COMPOSITE_TDB = """\ track clsLongReadRnaTrack {parent}\ compositeTrack on shortLabel CLS long-read RNAs longLabel Capture long-seq long-read lncRNAs type bigBed 12 visibility pack subGroup1 view Views \\ targets_view=Targets \\ @@ -377,55 +451,57 @@ track models_view view models_view parent clsLongReadRnaTrack shortLabel Models visibility pack type bigBed noScoreFilter on track cls_gene_models parent models_view on subGroups view=models_view sample=combined type=models shortLabel CLS transcripts longLabel CLS transcript models type bigBed 12 + bigDataUrl {bigDataUrl} - color 12,12,120 visibility pack priority 2 + itemRgb on + # bigBed filters don't work with composites + #filterType.inGencode singleList + #filterValues.inGencode yes,no """ SAMPLE_MODELS_VIEW_TDB = """\ track sample_models_view view sample_models_view parent clsLongReadRnaTrack shortLabel Sample models visibility squish type bigBed noScoreFilter on """ SAMPLE_MODELS_TDB = """\ track {sample}_models parent sample_models_view on shortLabel {shortLabel} longLabel {longLabel} bigDataUrl {bigDataUrl} type bigBed 12 + - color 100,22,180 visibility squish subGroups view=sample_models_view sample={sample} type=models """ EXPR_READS_VIEW_TDB = """\ track per_expr_reads_view view per_expr_reads_view parent clsLongReadRnaTrack on shortLabel Reads visibility squish type bam """ @@ -448,33 +524,36 @@ shortLabel Models visibility dense type bigBed noScoreFilter on """ EXPR_MODELS_TDB = """\ track {track} parent per_expr_models_view off shortLabel {shortLabel} longLabel {longLabel} bigDataUrl {bigDataUrl} type bigBed 12 + noScoreFilter on - color 100,22,180 visibility hide subGroups view=per_expr_models_view sample={sample} type={type} + itemRgb on + # bigBed filters don't work with composites + #filterType.inGencode singleList + #filterValues.inGencode yes,no """ def sampleGrpName(clsMeta): return clsMeta.stage.lower() + "_" + clsMeta.tissue.lower() def buildSampleSubGroupDef(clsMeta): name = sampleGrpName(clsMeta) desc = clsMetaToSampleDesc(clsMeta).replace(' ', '_') return f" {name}={desc}" def buildSampleSubGroupDefs(clsMetas): defs = [] done = set() for clsMeta in clsMetas: @@ -602,24 +681,24 @@ def checkBams(opts, clsMetadataTab, trackDir): clsMetaBySid = loadClsMetadata(clsMetadataTab) bams = listBams(trackDir, "pre-capture") | listBams(trackDir, "post-capture") errCnt, doneBams = checkSamplesWithBams(clsMetaBySid, trackDir, bams) errCnt += checkReferencedBams(bams, doneBams) if errCnt > 0: print(f"{errCnt} errors detected", file=sys.stderr) exit(1) print(f"{len(doneBams)} BAMs validated", file=sys.stderr) def main(): opts, args = parseArgs() if args.command == 'make-beds': - makeBeds(opts, args.clsMetadataTab, args.masterBedFile, args.transcriptMetaTsv, args.trackDir) + makeBeds(opts, args.clsMetadataTab, args.masterBedFile, args.transcriptMetaTsv, args.clsGencodeAssocTsv, args.trackDir) elif args.command == 'make-trackdb': makeTrackDb(opts, args.clsMetadataTab, args.trackDir, args.trackDb, args.parent) elif args.command == 'check-bams': checkBams(opts, args.clsMetadataTab, args.trackDir) else: assert False, f"subcommand not handled: {args.command}" main()