a54f86a21a62394f88021eefccba67f18a4a27e6 max Thu Apr 30 05:45:35 2026 -0700 Adding new "DRACH motif sites" track under rnaMod superTrack on hg38: every occurrence of the m6A consensus motif (DRACH) in MANE Select v1.5 transcripts, projected onto the genome via pslMap. New scripts under makeDb/scripts/rnaMod, makeDoc under makeDb/doc/hg38/rnaMod.txt. Refreshed the parent rnaMod.html. Track is alpha-only (rnaMod.ra is already alpha-included), refs #36613 Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> diff --git src/hg/makeDb/scripts/rnaMod/drachBedToBigBed.py src/hg/makeDb/scripts/rnaMod/drachBedToBigBed.py new file mode 100755 index 00000000000..7caf490ed60 --- /dev/null +++ src/hg/makeDb/scripts/rnaMod/drachBedToBigBed.py @@ -0,0 +1,149 @@ +#!/usr/bin/env python3 +"""Decorate genome-mapped DRACH BED12 with motif/gene/region metadata. + +Inputs: + bed12 BED12 produced by `pslToBed`. The `name` column is + 'tx|txPos|motif' (forward-engineered by drachFromFasta.py). + genePred MANE genePred (from gtfToGenePred). Used to compute CDS start/end + in transcript coordinates and the transcript length, so we can + classify each motif as 5'UTR/CDS/3'UTR. + tx2gene.tsv tx<tab>gene_symbol from drachFromFasta.py + +Output: BED 12+5 on stdout. Fields (tab-separated): + chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb + blockCount blockSizes chromStarts motif gene transcript txPos region + +`name` is left empty per project convention. score=0. itemRgb assigned per motif +sub-class. +""" +import sys + +# Color palette: highlight the four most-common DRACH variants reported in m6A +# consensus surveys; everything else gets a neutral gray. Flat solid colors. +MOTIF_COLOR = { + 'GGACT': '220,20,60', # crimson + 'GGACA': '30,144,255', # dodger blue + 'GAACT': '255,140,0', # dark orange + 'AGACT': '60,179,113', # medium sea green +} +DEFAULT_COLOR = '128,128,128' + + +def read_tx2gene(path): + out = {} + with open(path) as fh: + for line in fh: + tx, sym = line.rstrip('\n').split('\t', 1) + out[tx] = sym + return out + + +def read_gp_cds(path): + """Return {tx: (txLen, cdsStartTx, cdsEndTx)} from a genePred file. + + cdsStartTx/cdsEndTx are None for non-coding transcripts. + """ + out = {} + with open(path) as fh: + for line in fh: + f = line.rstrip('\n').split('\t') + if len(f) < 10: + continue + tx = f[0] + strand = f[2] + cdsStart = int(f[5]) + cdsEnd = int(f[6]) + exonCount = int(f[7]) + exonStarts = [int(x) for x in f[8].rstrip(',').split(',')[:exonCount]] + exonEnds = [int(x) for x in f[9].rstrip(',').split(',')[:exonCount]] + exons = sorted(zip(exonStarts, exonEnds)) + txLen = sum(e - s for s, e in exons) + + def g2tx(g): + cum = 0 + for s, e in exons: + if s <= g < e: + plus_tx = cum + (g - s) + if strand == '+': + return plus_tx + return txLen - 1 - plus_tx + cum += e - s + return None + + if cdsStart == cdsEnd: + cds_s_tx = cds_e_tx = None + elif strand == '+': + cds_s_tx = g2tx(cdsStart) + last = g2tx(cdsEnd - 1) + cds_e_tx = None if last is None else last + 1 + else: + cds_s_tx = g2tx(cdsEnd - 1) + last = g2tx(cdsStart) + cds_e_tx = None if last is None else last + 1 + out[tx] = (txLen, cds_s_tx, cds_e_tx) + return out + + +def classify(txPos, cds_s, cds_e): + if cds_s is None: + return 'ncRNA' + if txPos < cds_s: + return "5'UTR" + if txPos < cds_e: + return 'CDS' + return "3'UTR" + + +def main(): + if len(sys.argv) < 4: + sys.stderr.write( + 'usage: drachBedToBigBed.py <bed12> <genePred> <tx2gene.tsv>\n') + sys.exit(1) + bed_in, gp_in, t2g_in = sys.argv[1:4] + + tx2gene = read_tx2gene(t2g_in) + tx_cds = read_gp_cds(gp_in) + + n_in = 0 + n_out = 0 + n_multi_block = 0 + n_missing = 0 + out = sys.stdout + with open(bed_in) as fh: + for line in fh: + f = line.rstrip('\n').split('\t') + if len(f) < 12: + continue + n_in += 1 + chrom, cs, ce, name, score, strand, ts, te, rgb, bc, bs, bst = f[:12] + try: + tx, txPos_s, motif = name.split('|') + except ValueError: + n_missing += 1 + continue + txPos = int(txPos_s) + gene = tx2gene.get(tx, '') + tx_info = tx_cds.get(tx) + if tx_info is None: + region = 'unknown' + else: + _, cds_s, cds_e = tx_info + region = classify(txPos, cds_s, cds_e) + color = MOTIF_COLOR.get(motif, DEFAULT_COLOR) + if int(bc) > 1: + n_multi_block += 1 + out.write('\t'.join([ + chrom, cs, ce, '', '0', strand, cs, ce, color, + bc, bs, bst, + motif, gene, tx, str(txPos), region, + ]) + '\n') + n_out += 1 + + sys.stderr.write(f'input rows: {n_in}\n') + sys.stderr.write(f'output rows: {n_out}\n') + sys.stderr.write(f'multi-block (junction-spanning): {n_multi_block}\n') + sys.stderr.write(f'name-parse failures: {n_missing}\n') + + +if __name__ == '__main__': + main()