# Provenance (git log entry this document was imported from):
#   2373cd564a862f870831fc66bb4687e422848ffa markd Sat Jun 7 14:11:06 2025 -0700
#   Import of CLS Long Read Data from GENCODE / CRG (aka what I did in Barcelona)
#   diff --git src/hg/makeDb/doc/mm10.clsLongReadRna.txt src/hg/makeDb/doc/mm10.clsLongReadRna.txt
#   new file mode 100644
#   index 00000000000..cb8dd85a23f
#   --- /dev/null
#   +++ src/hg/makeDb/doc/mm10.clsLongReadRna.txt
#   @@ -0,0 +1,66 @@

####
# 2025-06-07 CLS long-read RNA (Mark Diekhans)
#
# See also hg38/clsLongReadRna.txt
####

# Locations of the helper tool, autoSql files and awk script used by the
# steps in this document; all of them live under $clsSrc.
clsSrc=$HOME/compbio/browser/dev/kent/src/hg/makeDb/outside/clsLongReadRna
clsTrackTool=${clsSrc}/clsTrackTool
clsModelBedAs=${clsSrc}/clsModelBed.as
clsTargetBedAs=${clsSrc}/clsTargetBed.as
gff_attrs_extract=${clsSrc}/gff-attrs-extract.awk

# All relative paths below are relative to this track build directory.
cd /hive/data/genomes/mm10/bed/clsLongReadRna

# Obtained human and mouse BAMs and indexes
#   https://public-docs.crg.es/rguigo/Data/scarbonell/gencode_paper2024_figures/browser_tracks/hub/path2LyRicBAMs.tsv
#
#   copy BAMs directly from CRG via my account, split BAMs into pre-capture and
#   post-capture directories


# download LyRic models and metadata:
#   metadata documented here:
#   https://github.com/guigolab/gencode-cls-master-table
#
# The downloads run in a subshell so the working directory is still the track
# build directory afterwards; the check-bams step refers to
# lyric-data/... and "." relative to that directory.
mkdir -p lyric-data
(
    cd lyric-data || exit 1
    wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/latest/download/Mv2_masterTable_refined.gtf.gz
    wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/GencodeCLS_v1.0/Mv2_metadata.tsv.gz
    wget -nv https://github.com/guigolab/gencode-cls-master-table/releases/download/Supplementary/Mv2_CLS3_targetDesign_mergedRegions.gtf.gz
)

# check BAMs vs metadata
"$clsTrackTool" check-bams lyric-data/Mv2_metadata.tsv.gz .
# process GTF
#
# Everything that works inside lyric-data/ runs in a subshell, so the working
# directory is back at the track build directory for the later steps, which
# refer to lyric-data/... by relative path.
(
    cd lyric-data/ || exit 1

    # convert to BED via GTF -> GFF3 -> genePred -> BED
    # need to go via GFF3 to get attributes in a file
    # (the tawk step copies column 2 into column 7 of each BED row)
    zcat Mv2_masterTable_refined.gtf.gz \
        | gffread -F \
        | gff3ToGenePred -attrsOut=Mv2_masterTable_refined.attrs.tab stdin stdout \
        | genePredToBed stdin stdout \
        | tawk '{$7=$2; print}' \
        | sort -k1,1 -k2,2n \
        | gzip -9 > Mv2_masterTable_refined.bed.gz

    # extract tags of interest to TSV
    tawk -f "$gff_attrs_extract" Mv2_masterTable_refined.attrs.tab | pigz -c > Mv2_transcript-meta.tsv.gz
    rm Mv2_masterTable_refined.attrs.tab

    # build target BEDs (discards ERCC-0* entries)
    # NOTE(review): ../../target-gff-to-bed resolves two levels above
    # lyric-data/, while the output goes one level up -- confirm the tool path.
    zcat Mv2_CLS3_targetDesign_mergedRegions.gtf.gz | ../../target-gff-to-bed | pigz -c > ../cls-targets.bed.gz
)

# make model BEDs
#   one for each sample+phase+platform,
#   one for each sample
#   one with all
chromAliasBb=/hive/data/genomes/mm10/bed/chromAlias/p6/mm10.chromAlias.bb

# make-beds runs in the background while the target bigBed is built; its PID
# is kept so we can wait for it before touching the BEDs it writes.
"$clsTrackTool" make-beds lyric-data/Mv2_metadata.tsv.gz lyric-data/Mv2_masterTable_refined.bed.gz lyric-data/Mv2_transcript-meta.tsv.gz . &
make_beds_pid=$!

bedToBigBed -tab -type=bed6+2 -as="$clsTargetBedAs" -sizesIsChromAliasBb cls-targets.bed.gz "$chromAliasBb" cls-targets.bb

# The loop below globs the files make-beds produces, so it must have finished.
wait "$make_beds_pid"

# cls-models*.bed.gz already matches cls-models.bed.gz, so it is listed once.
for bed in cls-models*.bed.gz *capture/*.bed.gz ; do
    # skip a pattern that matched nothing (it is left as a literal string)
    [ -e "$bed" ] || continue
    # output goes next to the input: foo.bed.gz -> foo.bb
    bedToBigBed -tab -type=bed12+3 -as="$clsModelBedAs" -sizesIsChromAliasBb "$bed" "$chromAliasBb" "${bed%.bed.gz}.bb"
done

# make trackDb
"$clsTrackTool" make-trackdb --parent=long_read_transcripts lyric-data/Mv2_metadata.tsv.gz /gbdb/mm10/clsLongReadRna ~/kent/src/hg/makeDb/trackDb/mouse/mm10/clsLongReadRna.ra

# edit to make member of long_read_transcripts.ra:
#   include clsLongReadRna.ra alpha

# link to gbdb
mkdir -p /gbdb/mm10/clsLongReadRna/{post-capture,pre-capture}
# NUL-delimited read so paths are never word-split; the parentheses keep the
# -o alternation together now that -print0 is an explicit action.
find . \( -name '*.bb' -o -name '*.bam*' \) -print0 \
    | while IFS= read -r -d '' path ; do
        ln -sf "$(realpath "$path")" "/gbdb/mm10/clsLongReadRna/$path"
    done