994049f75b593264f0ff047f88903468bf1c2031 jeltje.van.baren Fri Apr 4 13:35:35 2025 -0700 mm10 and hg38 encode4 diff --git src/hg/makeDb/doc/mm10.txt src/hg/makeDb/doc/mm10.txt index af6e055ba93..62e197a744f 100644 --- src/hg/makeDb/doc/mm10.txt +++ src/hg/makeDb/doc/mm10.txt @@ -19472,15 +19472,61 @@ \ -target2bit="/hive/data/genomes/asmHubs/GCF/016/772/045/GCF_016772045.2/GCF_016772045.2.2bit" \ -targetSizes="/hive/data/genomes/asmHubs/GCF/016/772/045/GCF_016772045.2/GCF_016772045.2.chrom.sizes.txt" \ GCF_016772045.2 mm10) > rbest.log 2>&1 grep -w real rbest.log | sed -e 's/^/ # /;' # real 190m23.152s sed -e 's/^/ # /;' fb.GCF_016772045.2.chainRBest.Mm10.txt # 658848325 bases of 2654063983 (24.824%) in intersection real 663m45.533s user 0m3.053s sys 0m2.728s ############################################################################## +############################################################################## +# ENCODE4 triplets - Jeltje April 2025 + +mkdir -p april2025 +#wget -O april2025/mouse_ucsc_transcripts.gtf "https://zenodo.org/records/15116042/files/mouse_ucsc_transcripts.gtf?download=1" +#wget -O april2025/filt_ab_tpm_mouse.tsv "https://zenodo.org/records/15116042/files/filt_ab_tpm_mouse.tsv?download=1" +#wget -O april2025/mouse_protein_summary.tsv "https://zenodo.org/records/15116042/files/mouse_protein_summary.tsv?download=1" +#wget -O april2025/mouse_sample_info.tsv "https://zenodo.org/records/15116042/files/lr_mouse_library_data_summary.tsv?download=1" + +gtfFile='april2025/mouse_ucsc_transcripts.gtf' +quantFile='april2025/filt_ab_tpm_mouse.tsv' # really counts per million since every read is full length +protFile='april2025/mouse_protein_summary.tsv' +sampleFile='april2025/mouse_sample_info.tsv' +# this outputs bed12 + extra ID fields, topval expressions for mouseover and an expression HTML table +./gtfToBed.py $gtfFile $quantFile $protFile $sampleFile transcripts.bed > missing.ids +bedSort transcripts.bed transcripts.bed + +cat << '_EOF_' > 
encode4.as +table encode4 +"Bed 12+8 file with annotation source and values per sample in an HTML table." + ( + string chrom; "Chromosome (or contig, scaffold, etc.)" + uint chromStart; "Start position in chromosome" + uint chromEnd; "End position in chromosome" + string name; "Name of item" + uint score; "Score from 0-1000" + char[1] strand; "+ or -" + uint thickStart; "Start of where display should be thick (start codon)" + uint thickEnd; "End of where display should be thick (stop codon)" + uint reserved; "Used as itemRgb as of 2004-11-22" + int blockCount; "Number of blocks" + int[blockCount] blockSizes; "Comma separated list of block sizes" + int[blockCount] chromStarts; "Start positions relative to chromStart" + string source; "Annotation source" + string gene_id; "gene ID" + string gene_name; "gene name" + string transcript_id; "transcript ID" + string transcript_name; "transcript name" + float maxScore; "Highest expression score (counts per million)" + lstring maxScoreHtml; "Highest expression score and sample(s)" + lstring expr_table; "Expression values per sample in TPM (i.e. counts per million; reads are full length)" + ) +_EOF_ + +bedToBigBed -type=bed12+8 -as=encode4.as -tab transcripts.bed /hive/data/genomes/mm10/chrom.sizes encode4.bb +