0b5b8713618f911dd69626d3e27df3f97918c28c markd Mon Jun 7 22:27:02 2021 -0700 added notes on how 241-mammalian-2020v2.ucsc.bigMaf was built diff --git src/hg/makeDb/doc/hg38/cactus241way.txt src/hg/makeDb/doc/hg38/cactus241way.txt index 8c2344e..e731ec3 100644 --- src/hg/makeDb/doc/hg38/cactus241way.txt +++ src/hg/makeDb/doc/hg38/cactus241way.txt @@ -1,210 +1,226 @@ ######################################################################### # Zoonomia phyloP for 241-way from (2021-04-24 MarkD) cd /hive/data/genomes/hg38/bed/cactus241way/phyloP # Obtain phyloP scores computed by Michael Dong (Uppsala U). Since Broad # has had their ftp and http sharing of files turned off, they were placed # in google drive (LoL) # download from https://drive.google.com/drive/u/0/folders/1Xc215oxu0cvBZOarxFv2L3HAjPmiZ-0A # to directory uppsala # format is as follows, so convert with awk chr1 10074 10075 id-1 0.053000 chr1 10075 10076 id-2 0.064000 chr1 10076 10077 id-3 0.064000 # convert to wig and wib zcat uppsala/chr*.bed.gz | tawk '{print $1,$2,$3,$5}' | wigEncode stdin phyloP241way.wig phyloP241way.wib ln -s $(pwd)/phyloP241way.wib /gbdb/hg38/cactus241way/ hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus241way hg38 phyloP241way phyloP241way.wig ############################################################################ ## adding frames ( DONE - 2021-04-29 - Hiram ) mkdir /hive/data/genomes/hg38/bed/cactus241way/frames cd /hive/data/genomes/hg38/bed/cactus241way/frames mkdir genes hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/hg38.gp.gz genePredCheck -db=hg38 genes/hg38.gp.gz # checked: 19328 failed: 0 ls ../ucscNames | sed -e 's/.maf//;' > chr.list ls genes | sed -e 's/.gp.gz//;' > gene.list printf '#!/bin/bash set -beEu -o pipefail export C=$1 export G=$2 cat ../ucscNames/${C}.maf | genePredToMafFrames hg38 stdin stdout \ "${G}" genes/${G}.gp.gz | gzip > 
parts/${C}.${G}.mafFrames.gz ' > runOne chmod +x runOne printf '#LOOP ./runOne $(root1) $(root2) parts/$(root1).$(root2).mafFrames.gz #ENDLOOP ' > template gensub2 chr.list gene.list template perl.jobList time ($HOME/kent/src/hg/utils/automation/perlPara.pl 4 perl.jobList) \ >> do.log 2>&1 & tail do.log # Completed: 454 of 454 jobs # CPU time in finished jobs: 91822s 1530.37m 25.51h 1.06d 0.003y # Average job time: 202s 3.37m 0.06h 0.00d # Longest finished job: 7570s 126.17m 2.10h 0.09d # real 417m27.701s time find ./parts -type f | while read F do echo "${F}" 1>&2 zcat ${F} done | sort -k1,1 -k2,2n | gzip -c > cactus241wayFrames.bed.gz # real 0m3.178s hgLoadMafFrames hg38 cactus241wayFrames cactus241wayFrames.bed.gz featureBits -countGaps hg38 cactus241wayFrames # 33621579 bases of 3272116950 (1.028%) in intersection ############################################################################ ############################################################################# # construct download files for 241-way (TBD - 2015-04-15 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way mkdir /hive/data/genomes/hg38/bed/cactus241way/downloads cd /hive/data/genomes/hg38/bed/cactus241way/downloads mkdir maf for F in `ls ../ucscNames` do gzip -c ../ucscNames/$F > maf/${F}.gz done # running 10 of those simultaneously in an hgwdev perlPara.pl job: # real 537m54.983s time md5sum *.maf.gz > md5sum.txt real 46m43.423s user 39m29.707s sys 6m41.942s ln -s ../cactus241way.nh hg38.cactus241way.nh ln -s ../before.nameChange.nh hg38.cactus241way.scientificNames.nh # convert the phyloP bed files to bedGraph file: zcat ../phyloP/uppsala/*.bed.gz \ | awk '{printf "%s\t%d\t%d\t%s\n", $1,$2,$3,$5}' \ | sort -k1,1 -k2,2n > phyloP.bedGraph # Convert the bedGraph phyloP data to wigFixedStep via a perl script: #!/usr/bin/env perl use strict; use warnings; my $chr = ""; my $start = 0; my $prevStart = 0; open (FH, "<phyloP.bedGraph") or die "can not read phyloP.bedGraph"; while (my $line = <FH>) { chomp $line; my ($name, $chrStart, $chrEnd, $value) = 
split('\s+', $line); if ( ($name ne $chr) || (($chrStart - $prevStart) > 1) ) { printf "fixedStep chrom=%s start=%d step=1 span=1\n", $name, $chrStart+1; } printf "%s\n", $value; $prevStart = $chrStart; $chr = $name; } close (FH); time (./phyloP.sh > phyloP.bedGraph) >> phyloP.log 2>&1 # real 232m54.489s # verify the wigEncode and wigToBigWig have the same result: wigEncode cactus241way.phyloP.wigFix.gz hg38.cactus241way.wig \ hg38.cactus241way.wib # Converted cactus241way.phyloP.wigFix.gz, upper limit 9.28, # lower limit -20.00 # -rw-rw-r-- 1 2852623265 May 10 12:07 hg38.cactus241way.wib # -rw-rw-r-- 1 307726912 May 10 12:07 hg38.cactus241way.wig wigToBigWig cactus241way.phyloP.wigFix.gz ../../../chrom.sizes \ cactus241way.phyloP.bw # -rw-rw-r-- 1 9644660543 May 10 12:28 cactus241way.phyloP.bw bigWigInfo cactus241way.phyloP.bw | sed -e 's/^/ # /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 7,469,351,053 # primaryIndexSize: 89,516,524 # zoomLevels: 10 # chromCount: 24 # basesCovered: 2,852,623,265 # mean: 0.128264 # min: -20.000000 # max: 9.280000 # std: 1.252659 # compared to the loaded table: time wigTableStats.sh hg38 phyloP241way # db.table min max mean count sumData stdDev # hg38.phyloP241way -20 9.28 0.128264 2852623265 3.6589e+08 1.25266 # viewLimits=-6.13503:6.39156 # real 0m15.185s md5sum hg38.cactus* > md5sum.txt # construct a README.txt file # link to download: mkdir -p /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/maf ln -s `pwd`/hg38.*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/ ln -s `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/ ln -s `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/ ln -s `pwd`/hg38*.wigFix.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/ ln -s `pwd`/maf/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/maf # file list for redmine 27519 
/usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/*.txt /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/hg38.cactus241way.* /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/maf/* /gbdb/hg38/cactus241way/maf/* /gbdb/hg38/cactus241way/phyloP241way.wib # table list for redmine 27519 hg38.cactus241way hg38.cactus241wayFrames hg38.cactus241waySummary hg38.phyloP241way + +############################################################################# +# markd 2020/12/23 + +The bigMaf was built from files in: + /hive/data/genomes/hg38/bed/cactus241way/ucscNames/ + +creating + /hive/data/genomes/hg38/bed/cactus241way/241-mammalian-2020v2.ucsc.bigMaf + +Exact commands were not recorded, since this was not originally intended for release. +The build consisted of concatenating the MAFs in sorted order (by file name) and +then doing a standard bigMaf build. As I recall, it took about five days. + + +#############################################################################