66604eefdbb0971ed43ee81bd6f18cc3a409e304 markd Sat Jan 10 13:18:44 2026 -0800 color-code clsLongReadRna modles (#36908) diff --git src/hg/makeDb/doc/mm10.clsLongReadRna.txt src/hg/makeDb/doc/mm10.clsLongReadRna.txt index cb8dd85a23f..8dab04f1c41 100644 --- src/hg/makeDb/doc/mm10.clsLongReadRna.txt +++ src/hg/makeDb/doc/mm10.clsLongReadRna.txt @@ -1,66 +1,90 @@ #### # 2025-06-07 CLS long-read RNA (Mark Diekhans <markd@ucsc.edu>) # # See also hg38/clsLongReadRna.txt #### clsSrc=$HOME/compbio/browser/dev/kent/src/hg/makeDb/outside/clsLongReadRna clsTrackTool=${clsSrc}/clsTrackTool clsModelBedAs=${clsSrc}/clsModelBed.as clsTargetBedAs=${clsSrc}/clsTargetBed.as gff_attrs_extract=${clsSrc}/gff-attrs-extract.awk cd /hive/data/genomes/mm10/bed/clsLongReadRna # Obtained human and mouse BAMs and indexes https://public-docs.crg.es/rguigo/Data/scarbonell/gencode_paper2024_figures/browser_tracks/hub/path2LyRicBAMs.tsv copy BAMs directly from CRG via my account, split BAMs into pre-capture and post-capture directories # download LyRic models and metadata: metadata documented here: https://github.com/guigolab/gencode-cls-master-table cd lyric-data wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/latest/download/Mv2_masterTable_refined.gtf.gz wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/GencodeCLS_v1.0/Mv2_metadata.tsv.gz wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/Supplementary/Mv2_CLS3_targetDesign_mergedRegions.gtf.gz # check BAMs vs metadata $clsTrackTool check-bams lyric-data/Mv2_metadata.tsv.gz . 
# process GTF # convert to BED via GTF -> GFF3 -> genePred -> BED # need to go via GFF3 to get attributes in a file cd lyric-data/ zcat Mv2_masterTable_refined.gtf.gz | gffread -F | gff3ToGenePred -attrsOut=Mv2_masterTable_refined.attrs.tab stdin stdout |\ genePredToBed stdin stdout | tawk '{$7=$2; print}' | sort -k1,1 -k2,2n |gzip -9 > Mv2_masterTable_refined.bed.gz # extract tags of interest to TSV tawk -f $gff_attrs_extract Mv2_masterTable_refined.attrs.tab |pigz -c >Mv2_transcript-meta.tsv.gz rm Mv2_masterTable_refined.attrs.tab # build target BEDs (discards ERCC-0* entries) zcat Mv2_CLS3_targetDesign_mergedRegions.gtf.gz | ../../target-gff-to-bed | pigz -c >../cls-targets.bed.gz # make model BEDs one for each sample+phase+platform, one for each sample one with all $clsTrackTool make-beds lyric-data/Mv2_metadata.tsv.gz lyric-data/Mv2_masterTable_refined.bed.gz lyric-data/Mv2_transcript-meta.tsv.gz . & bedToBigBed -tab -type=bed6+2 -as=$clsTargetBedAs -sizesIsChromAliasBb cls-targets.bed.gz /hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb cls-targets.bb for bed in cls-models*.bed.gz cls-models.bed.gz *capture/*.bed.gz ; do bedToBigBed -tab -type=bed12+3 -as=$clsModelBedAs -sizesIsChromAliasBb $bed /hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb $(dirname $bed)/$(basename $bed .bed.gz).bb done # make trackDb $clsTrackTool make-trackdb --parent=long_read_transcripts lyric-data/Mv2_metadata.tsv.gz /gbdb/mm10/clsLongReadRna ~/kent/src/hg/makeDb/trackDb/mouse/mm10/clsLongReadRna.ra # edit to make member of long_read_transcripts.ra: include clsLongReadRna.ra alpha # link to gbdb mkdir -p /gbdb/mm10/clsLongReadRna/{post-capture,pre-capture} for bb in $(find . 
-name '*.bb' -o -name '*.bam*') ; do ln -sf $(realpath $bb) /gbdb/mm10/clsLongReadRna/$bb ; done + +# 2026-01-09 markd + Based on paper review feedback, adding color coding and filters to indicate if + transcript model was added to GENCODE and if so, indicate the assigned transcript + type. + + # obtain mappings + https://github.com/guigolab/cls3-final-files-description?tab=readme-ov-file#v47-CLS3-Mappings-detailed + + cd lyric-data + wget -nv https://public-docs.crg.es/rguigo/Data/gkaur/CLS3_finalFiles/vM36-CLS3mapping_status.txt + pigz vM36-CLS3mapping_status.txt + cd .. + + # generate BEDs + $clsTrackTool make-beds lyric-data/Mv2_metadata.tsv.gz lyric-data/Mv2_masterTable_refined.bed.gz lyric-data/Mv2_transcript-meta.tsv.gz lyric-data/vM36-CLS3mapping_status.txt.gz . & + Note: 539 of 282296 models are mapped to multiple GENCODE transcripts + + for bed in cls-models*.bed.gz cls-models.bed.gz *capture/*.bed.gz ; do + (bedToBigBed -tab -type=bed12+3 -as=$clsModelBedAs -sizesIsChromAliasBb $bed /hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb $(dirname $bed)/$(basename $bed .bed.gz).bb &) + done + + # make trackDb + $clsTrackTool make-trackdb --parent=long_read_transcripts lyric-data/Mv2_metadata.tsv.gz /gbdb/mm10/clsLongReadRna ~/kent/src/hg/makeDb/trackDb/mouse/mm10/clsLongReadRna.ra