2373cd564a862f870831fc66bb4687e422848ffa
markd
  Sat Jun 7 14:11:06 2025 -0700
Import of CLS Long Read Data from GENCODE / CRG (aka what I did in Barcelona)

diff --git src/hg/makeDb/doc/mm10.clsLongReadRna.txt src/hg/makeDb/doc/mm10.clsLongReadRna.txt
new file mode 100644
index 00000000000..cb8dd85a23f
--- /dev/null
+++ src/hg/makeDb/doc/mm10.clsLongReadRna.txt
@@ -0,0 +1,66 @@
+####
+# 2025-06-07 CLS long-read RNA  (Mark Diekhans <markd@ucsc.edu>)
+#
+# See also hg38/clsLongReadRna.txt
+####
+
+clsSrc=$HOME/compbio/browser/dev/kent/src/hg/makeDb/outside/clsLongReadRna
+clsTrackTool=${clsSrc}/clsTrackTool
+clsModelBedAs=${clsSrc}/clsModelBed.as 
+clsTargetBedAs=${clsSrc}/clsTargetBed.as
+gff_attrs_extract=${clsSrc}/gff-attrs-extract.awk
+
+cd /hive/data/genomes/mm10/bed/clsLongReadRna
+
+# Obtained human and mouse BAMs and indexes; see:
+  https://public-docs.crg.es/rguigo/Data/scarbonell/gencode_paper2024_figures/browser_tracks/hub/path2LyRicBAMs.tsv
+
+  Copied the BAMs directly from CRG via my account, then split them into
+  pre-capture and post-capture directories.
+
+
+# download LyRic models and metadata:
+  metadata documented here:
+  https://github.com/guigolab/gencode-cls-master-table
+  cd lyric-data
+  wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/latest/download/Mv2_masterTable_refined.gtf.gz
+  wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/GencodeCLS_v1.0/Mv2_metadata.tsv.gz
+  wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/Supplementary/Mv2_CLS3_targetDesign_mergedRegions.gtf.gz
+
+# check BAMs vs metadata (run from the clsLongReadRna directory, not lyric-data)
+  $clsTrackTool check-bams lyric-data/Mv2_metadata.tsv.gz .
+
+# process GTF
+  # convert to BED via GTF -> GFF3 -> genePred -> BED
+  # need to go via GFF3 to get attributes in a file
+  cd lyric-data/
+  zcat Mv2_masterTable_refined.gtf.gz | gffread -F | gff3ToGenePred -attrsOut=Mv2_masterTable_refined.attrs.tab stdin stdout |\
+    genePredToBed stdin stdout | tawk '{$7=$2; print}' | sort -k1,1 -k2,2n |gzip -9 > Mv2_masterTable_refined.bed.gz 
+
+  # extract tags of interest to TSV
+  tawk -f  $gff_attrs_extract Mv2_masterTable_refined.attrs.tab |pigz -c  >Mv2_transcript-meta.tsv.gz
+  rm Mv2_masterTable_refined.attrs.tab
+
+  # build target BEDs (discards ERCC-0* entries)
+  zcat Mv2_CLS3_targetDesign_mergedRegions.gtf.gz | ../../target-gff-to-bed | pigz -c  >../cls-targets.bed.gz
+
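+# make model BEDs (run from the clsLongReadRna directory, not lyric-data):
+  one for each sample+phase+platform, one for each sample, and one with all.
+  make-beds is started in the background below; wait for it to finish before
+  running the bedToBigBed loop over the model BEDs.
+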
+  $clsTrackTool make-beds lyric-data/Mv2_metadata.tsv.gz lyric-data/Mv2_masterTable_refined.bed.gz lyric-data/Mv2_transcript-meta.tsv.gz . &
+  bedToBigBed -tab -type=bed6+2 -as=$clsTargetBedAs -sizesIsChromAliasBb cls-targets.bed.gz /hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb cls-targets.bb
+  for bed in cls-models*.bed.gz cls-models.bed.gz *capture/*.bed.gz ; do
+     bedToBigBed -tab -type=bed12+3 -as=$clsModelBedAs -sizesIsChromAliasBb $bed /hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb $(dirname $bed)/$(basename $bed .bed.gz).bb
+  done
+
+# make trackDb
+  $clsTrackTool make-trackdb --parent=long_read_transcripts lyric-data/Mv2_metadata.tsv.gz /gbdb/mm10/clsLongReadRna ~/kent/src/hg/makeDb/trackDb/mouse/mm10/clsLongReadRna.ra
+
+  # edit to make member of long_read_transcripts.ra:
+    include clsLongReadRna.ra alpha
+
+# link to gbdb
+  mkdir -p /gbdb/mm10/clsLongReadRna/{post-capture,pre-capture}
+  for bb in $(find . -name '*.bb' -o  -name '*.bam*') ; do ln -sf $(realpath $bb) /gbdb/mm10/clsLongReadRna/$bb ; done