a54f86a21a62394f88021eefccba67f18a4a27e6
max
  Thu Apr 30 05:45:35 2026 -0700
Adding new "DRACH motif sites" track under rnaMod superTrack on hg38: every occurrence of the m6A consensus motif (DRACH) in MANE Select v1.5 transcripts, projected onto the genome via pslMap. New scripts under makeDb/scripts/rnaMod, makeDoc under makeDb/doc/hg38/rnaMod.txt. Refreshed the parent rnaMod.html. Track is alpha-only (rnaMod.ra is already alpha-included), refs #36613

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

diff --git src/hg/makeDb/scripts/rnaMod/makeDrach.sh src/hg/makeDb/scripts/rnaMod/makeDrach.sh
new file mode 100755
index 00000000000..f116c2bbb7a
--- /dev/null
+++ src/hg/makeDb/scripts/rnaMod/makeDrach.sh
@@ -0,0 +1,91 @@
+#!/bin/bash
+# Build the DRACH (m6A consensus) motif track for hg38.
+#
+# Pipeline:
+#   1. fetch MANE Ensembl rna fasta + genomic GTF (release 1.5)
+#   2. scan transcripts for [AGT][AG]AC[ACT] -> transcript-coord BED
+#   3. lift via bedToPsl + pslMap through the MANE transcript->genome PSL
+#   4. decorate to bigBed 12+5 with motif/gene/region metadata
+#
+# Run from the data directory (default /hive/data/genomes/hg38/bed/rnaMod/drach).
+
+set -euo pipefail
+
+DATADIR="${DATADIR:-/hive/data/genomes/hg38/bed/rnaMod/drach}"
+SCRIPTS="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+CHROMSIZES="/hive/data/genomes/hg38/chrom.sizes"
+MANE_BASE="https://ftp.ncbi.nlm.nih.gov/refseq/MANE/MANE_human/release_1.5"
+RNA_FA="MANE.GRCh38.v1.5.ensembl_rna.fna.gz"
+GTF="MANE.GRCh38.v1.5.ensembl_genomic.gtf.gz"
+
+mkdir -p "$DATADIR"
+cd "$DATADIR"
+
+echo "[$(date +%T)] working in $DATADIR"
+
+# 1. download
+if [ ! -s "$RNA_FA" ]; then
+    echo "[$(date +%T)] curl $RNA_FA"
+    curl -sSf -o "$RNA_FA" "$MANE_BASE/$RNA_FA"
+fi
+if [ ! -s "$GTF" ]; then
+    echo "[$(date +%T)] curl $GTF"
+    curl -sSf -o "$GTF" "$MANE_BASE/$GTF"
+fi
+
+# 2. scan for DRACH in transcript coords
+echo "[$(date +%T)] scan DRACH motifs"
+python3 "$SCRIPTS/drachFromFasta.py" "$RNA_FA" \
+    drach.tx.bed tx.sizes tx2gene.tsv
+
+# 3a. transcripts -> genePred -> transcript->genome PSL
+echo "[$(date +%T)] gtfToGenePred"
+gunzip -c "$GTF" > MANE.gtf
+gtfToGenePred -genePredExt MANE.gtf MANE.gp
+echo "[$(date +%T)] genePredToPsl"
+genePredToPsl "$CHROMSIZES" MANE.gp MANE.tx2genome.psl
+
+# 3b. motif BED -> motif PSL (in transcript space)
+echo "[$(date +%T)] bedToPsl (motifs in transcript space)"
+bedToPsl tx.sizes drach.tx.bed drach.tx.psl
+
+# 3c. project motifs onto the genome
+echo "[$(date +%T)] pslMap"
+pslMap -mapInfo=drach.mapInfo drach.tx.psl MANE.tx2genome.psl drach.genome.psl
+
+# 4a. PSL -> BED12 (preserves multi-block features at splice junctions)
+echo "[$(date +%T)] pslToBed"
+pslToBed drach.genome.psl drach.bed12.tmp
+
+# 4b. decorate to bed12+5
+echo "[$(date +%T)] decorate to bed12+5"
+python3 "$SCRIPTS/drachBedToBigBed.py" \
+    drach.bed12.tmp MANE.gp tx2gene.tsv > drach.bed
+
+# 4c. sort + bigBed
+echo "[$(date +%T)] sort + bedToBigBed"
+sort -k1,1 -k2,2n drach.bed > drach.sorted.bed
+bedToBigBed -tab -type=bed12+5 -as="$SCRIPTS/drach.as" \
+    drach.sorted.bed "$CHROMSIZES" drach.bb
+
+# 5. summary report
+echo
+echo "===== DRACH track build summary ====="
+n_tx=$(wc -l < tx.sizes)
+n_motif_in=$(wc -l < drach.tx.bed)
+n_motif_mapped=$(wc -l < drach.bed12.tmp)
+n_final=$(wc -l < drach.sorted.bed)
+n_multi=$(awk -F'\t' '$10>1' drach.sorted.bed | wc -l)
+echo "MANE transcripts processed:           $n_tx"
+echo "DRACH motifs found (transcript):      $n_motif_in"
+echo "Motifs mapped to genome (pslToBed):   $n_motif_mapped"
+echo "Final features in drach.bb:           $n_final"
+echo "Multi-block (splice junction):        $n_multi"
+if [ "$n_motif_in" != "$n_motif_mapped" ]; then
+    dropped=$((n_motif_in - n_motif_mapped))
+    echo "WARN: $dropped motifs were not mapped to the genome." >&2
+    echo "      See drach.mapInfo for details." >&2
+fi
+echo
+
+echo "[$(date +%T)] done. Output: $DATADIR/drach.bb"