e941783cbc9cd3ec6af5f75b6969554fba9282e1 hiram Thu Apr 29 23:05:28 2021 -0700 adding frames procedure refs #11636
diff --git src/hg/makeDb/doc/hg38/cactus241way.txt src/hg/makeDb/doc/hg38/cactus241way.txt
index 48805f9..95d3fcc 100644
--- src/hg/makeDb/doc/hg38/cactus241way.txt
+++ src/hg/makeDb/doc/hg38/cactus241way.txt
@@ -1,948 +1,1013 @@
#########################################################################
# Zoonomia phyloP for 241-way (2021-04-24 MarkD)

    cd /hive/data/genomes/hg38/bed/cactus241way/phyloP

    # Obtain phyloP scores computed by Michael Dong (Uppsala U).  Since Broad
    # has had their ftp and http sharing of files turned off, the files were
    # placed in google drive (LoL)
    # download from https://drive.google.com/drive/u/0/folders/1Xc215oxu0cvBZOarxFv2L3HAjPmiZ-0A
    # to directory uppsala

    # the format is five columns, as below, so convert with awk:
    #   chr1    10074   10075   id-1    0.053000
    #   chr1    10075   10076   id-2    0.064000
    #   chr1    10076   10077   id-3    0.064000

    # convert to wig and wib
    zcat uppsala/chr*.bed.gz | tawk '{print $1,$2,$3,$5}' \
        | wigEncode stdin phyloP241way.wig phyloP241way.wib
    ln -s $(pwd)/phyloP241way.wib /gbdb/hg38/cactus241way/
    hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus241way hg38 phyloP241way phyloP241way.wig

########################################################################
# phastCons 241-way (TBD - 2015-05-07 - Hiram)

    # split 241way mafs into 3M chunks and generate sufficient statistics
    # files for phastCons
    ssh ku
    mkdir -p /hive/data/genomes/hg38/bed/cactus241way/cons/ss
    mkdir -p /hive/data/genomes/hg38/bed/cactus241way/cons/msa.split
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/msa.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg38/bed/cactus241way/ucscNames/$c.maf
set WINDOWS = /hive/data/genomes/hg38/bed/cactus241way/cons/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
        $MAF -i MAF -o SS -r $WINDOWS/$c -w 3000000,0 -I 300 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    printf '#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
' > template

    # do the easy ones first to see some immediate results
    ls -1S -r ../../ucscNames | sed -e "s/.maf//;" > maf.list

    # all can finish OK at a 64Gb memory limit
    gensub2 maf.list single template jobList
    para -ram=64g create jobList
    para try ... check ... etc
    para push
# Completed: 54 of 54 jobs
# CPU time in finished jobs:      24295s     404.91m     6.75h    0.28d  0.001 y
# IO & Wait Time:                  1111s      18.52m     0.31h    0.01d  0.000 y
# Average job time:                 470s       7.84m     0.13h    0.01d
# Longest finished job:            2724s      45.40m     0.76h    0.03d
# Submission to last job:          2765s      46.08m     0.77h    0.03d

    # Run phastCons
    #   This job is I/O intensive in its output files, beware where this
    #   takes place or do not run too many at once.
    ssh ku
    mkdir -p /hive/data/genomes/hg38/bed/cactus241way/cons/run.cons
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/run.cons

    # This is setup for multiple runs based on subsets, but only running
    # the 'all' subset here.  It triggers off of the current working
    # directory $cwd:t which is the "grp" in this script.
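    # (csh note, for illustration: the :t modifier takes the tail of a path,
    #  so with $cwd == .../cons/all, $cwd:t expands to "all", which selects
    #  the all.mod tree model used below)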
    # Running: all and vertebrates

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/hg38/bed/cactus241way/cons
set tmp = $cons/tmp/$f
mkdir -p $tmp
set ssSrc = $cons/ss
set useGrp = "$grp.mod"
if (-s $cons/$grp/$grp.non-inf) then
    ln -s $cons/$grp/$grp.mod $tmp
    ln -s $cons/$grp/$grp.non-inf $tmp
    ln -s $ssSrc/$c/$f.ss $tmp
else
    ln -s $ssSrc/$c/$f.ss $tmp
    ln -s $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
    $PHASTBIN/phastCons $f.ss $useGrp \
        --rho $rho --expected-length $len --target-coverage $cov --quiet \
        --not-informative `cat $grp.non-inf` \
        --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
    $PHASTBIN/phastCons $f.ss $useGrp \
        --rho $rho --expected-length $len --target-coverage $cov --quiet \
        --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhast.csh

    # this template will serve for all runs
    # root1 == chrom name, file1 == ss file name without .ss suffix
    printf '#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
' > template

    ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list
    wc -l ss.list
    # 930 ss.list

    # Create parasol batch and run it
    # run for all species
    cd /hive/data/genomes/hg38/bed/cactus241way/cons
    mkdir -p all
    cd all
    # Using the .mod tree
    cp -p ../../4d/all.mod ./all.mod

    gensub2 ../run.cons/ss.list single ../run.cons/template jobList

    # beware overwhelming the cluster with these fast-running, high-I/O jobs
    para -ram=32g create jobList
    para try ... check ...
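    # -maxJob limits the number of simultaneously running jobs, which
    # protects the filesystem from the heavy output I/O of this batch: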
    para -maxJob=16 push
# Completed: 930 of 930 jobs
# CPU time in finished jobs:      22874s     381.23m     6.35h    0.26d  0.001 y
# IO & Wait Time:                  6698s     111.64m     1.86h    0.08d  0.000 y
# Average job time:                  32s       0.53m     0.01h    0.00d
# Longest finished job:               49s       0.82m     0.01h    0.00d
# Submission to last job:            200s       3.33m     0.06h    0.00d

    # create Most Conserved track
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/all
    time cut -f1 ../../../../chrom.sizes | while read C
do
    echo $C 1>&2
    ls -d bed/${C} 2> /dev/null | while read D
    do
        cat ${D}/${C}*.bed
    done | sort -k1,1 -k2,2n \
        | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}'
done > tmpMostConserved.bed
    # real    0m43.805s
    # -rw-rw-r-- 1 164925477 Dec 22 14:02 tmpMostConserved.bed

    time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \
        > mostConserved.bed
    # real    0m26.890s
    # -rw-rw-r-- 1 169120947 Dec 22 14:04 mostConserved.bed

    # load into database
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/all
    time hgLoadBed hg38 phastConsElements241way mostConserved.bed
    # Read 4829247 elements of size 5 from mostConserved.bed
    # real    0m36.697s

    # Try for 5% overall cov, and 70% CDS cov
    # --rho 0.3 --expected-length 45 --target-coverage 0.3
    time featureBits hg38 -enrichment ncbiRefSeq:cds phastConsElements241way
    # ncbiRefSeq:cds 1.408%, phastConsElements241way 5.679%, both 0.993%, cover 70.51%, enrich 12.42x
    # real    0m31.631s

    time featureBits hg38 -enrichment refGene:cds phastConsElements241way
    # refGene:cds 1.333%, phastConsElements241way 5.679%, both 0.972%, cover 72.87%, enrich 12.83x
    # real    0m24.183s

    # Create merged posterior probability file and wiggle track data files
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/all
    mkdir downloads

    time for D in `ls -d pp/chr* | sed -e 's#pp/##'`
do
    echo "working: $D" 1>&2
    find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
        | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phastCons241way.wigFix.gz
done
    # real    32m29.089s

    # encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
        | wigEncode stdin phastCons241way.wig phastCons241way.wib)
    # Converted stdin, upper limit 1.00, lower limit 0.00
    # real    15m40.010s

    du -hsc *.wi?
    # 2.8G    phastCons241way.wib
    # 283M    phastCons241way.wig

    # encode into a bigWig file:
    # (warning: the wigToBigWig process may be too large for memory limits
    #  in bash; to avoid the 32 Gb memory limit, set 180 Gb here:)
    export sizeG=188743680
    ulimit -d $sizeG
    ulimit -v $sizeG

    time (zcat downloads/*.wigFix.gz \
        | wigToBigWig -verbose=2 stdin \
            ../../../../chrom.sizes phastCons241way.bw) > bigWig.log 2>&1
    egrep "VmPeak|real" bigWig.log
    # pid=37111: VmPeak:    33886864 kB
    # real    42m13.614s
    # -rw-rw-r-- 1 7077152013 Nov  6 08:52 phastCons241way.bw

    bigWigInfo phastCons241way.bw
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 5,097,637,987
# primaryIndexSize: 93,372,648
# zoomLevels: 10
# chromCount: 355
# basesCovered: 2,955,660,600
# mean: 0.128025
# min: 0.000000
# max: 1.000000
# std: 0.247422

    # if you wanted to use the bigWig file, load the bigWig table
    # (but we don't use the bigWig file here):
    mkdir /gbdb/hg38/bbi
    ln -s `pwd`/phastCons241way.bw /gbdb/hg38/bbi
    hgsql hg38 -e 'drop table if exists phastCons241way; \
        create table phastCons241way (fileName varchar(255) not null); \
        insert into phastCons241way values ("/gbdb/hg38/bbi/phastCons241way.bw");'

    # Load gbdb and database with wiggle.
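    # (note: the .wig table rows reference offsets into the binary .wib data
    #  file, so the .wib must be visible under /gbdb, matching -pathPrefix,
    #  before the table is loaded)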
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/all
    ln -s `pwd`/phastCons241way.wib /gbdb/hg38/cactus241way/phastCons241way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus241way hg38 \
        phastCons241way phastCons241way.wig
    # real    0m32.272s

    time wigTableStats.sh hg38 phastCons241way
    # db.table             min max mean     count      sumData
    # hg38.phastCons241way 0   1   0.128025 2955660600 3.78397e+08
    #     stdDev   viewLimits
    #   0.247422   viewLimits=0:1
    # real    0m13.507s

    #  Create histogram to get an overview of all the data
    ssh hgwdev
    cd /hive/data/genomes/hg38/bed/cactus241way/cons/all
    time hgWiggle -doHistogram -db=hg38 \
        -hBinSize=0.001 -hBinCount=300 -hMinVal=0.0 -verbose=2 \
            phastCons241way > histogram.data 2>&1
    # real    2m38.952s

    # create plot of histogram:
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human hg38 Histogram phastCons241way track"
set xlabel " phastCons241way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
' | gnuplot > histo.png

    # take a look to see if it is sane:
    display histo.png &

#########################################################################
# phyloP for 241-way (TBD - 2017-11-06 - Hiram)
#
    # split SS files into 1M chunks, this business needs smaller files
    # to complete

    ssh ku
    mkdir /hive/data/genomes/hg38/bed/cactus241way/consPhyloP
    cd /hive/data/genomes/hg38/bed/cactus241way/consPhyloP
    mkdir ss run.split
    cd run.split

    printf '#!/bin/csh -ef
set c = $1
set MAF = /hive/data/genomes/hg38/bed/cactus241way/ucscNames/$c.maf
set WINDOWS = /hive/data/genomes/hg38/bed/cactus241way/consPhyloP/ss/$c
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif

date >> $2.running

rm -fr $WINDOWS
mkdir -p $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
        $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
' > doSplit.csh

    chmod +x doSplit.csh

    # do the easy ones first to see some immediate results
    ls -1S -r ../../ucscNames | sed -e "s/.maf//;" > maf.list

    # this needs a {check out line+ $(root1).done} test for verification:
    printf '#LOOP
./doSplit.csh $(root1) $(root1).done
#ENDLOOP
' > template

    gensub2 maf.list single template jobList

    # all can complete successfully at the 64Gb memory limit
    para -ram=64g create jobList
    para try ... check ... push ... etc...
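    # parasol batch summary, from 'para time':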
# Completed: 54 of 54 jobs
# CPU time in finished jobs:      25321s     422.01m     7.03h    0.29d  0.001 y
# IO & Wait Time:                   843s      14.06m     0.23h    0.01d  0.000 y
# Average job time:                 485s       8.08m     0.13h    0.01d
# Longest finished job:            2873s      47.88m     0.80h    0.03d
# Submission to last job:          2931s      48.85m     0.81h    0.03d

    # run phyloP with score=LRT
    ssh ku
    mkdir /cluster/data/hg38/bed/cactus241way/consPhyloP
    cd /cluster/data/hg38/bed/cactus241way/consPhyloP
    mkdir run.phyloP
    cd run.phyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACK ../../4d/all.mod
    # BACKGROUND: 0.219062 0.338786 0.207231 0.234921

    grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    # 0.546
    /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \
        ../../4d/all.mod 0.546 > all.mod
    # verify, the BACKGROUND should now be paired up:
    grep BACK all.mod
    # BACKGROUND: 0.217500 0.282500 0.282500 0.217500

    printf '#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set f = $1
set ssFile = $1:t
set out = $2
set cName = $f:h
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/hg38/bed/cactus241way/consPhyloP
set tmp = $cons/tmp/$grp/$f
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
set ssSrc = "$cons/ss/$cName/$ssFile"
set useGrp = "$grp.mod"
/bin/ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
echo source: $ssSrc.ss
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $ssFile.wigFix
popd > /dev/null
/bin/mkdir -p $out:h
sleep 4
/bin/touch $out:h
/bin/mv $tmp/$ssFile.wigFix $out
/bin/rm -fr $tmp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp
/bin/rmdir --ignore-fail-on-non-empty $cons/tmp
' > doPhyloP.csh

    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list
    # make sure the list looks good
    wc -l ss.list
    # 3308 ss.list

    # Create template file
    # file1 == $chr/$chunk/file name without .ss suffix
    printf '#LOOP
../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
' > template

    ######################   Running all species   ######################
    # setup run for all species
    mkdir /hive/data/genomes/hg38/bed/cactus241way/consPhyloP/all
    cd /hive/data/genomes/hg38/bed/cactus241way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList

    # beware overloading the cluster with these quick and high I/O jobs
    para -ram=32g create jobList
    para try ... check ...
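    # as with the phastCons batch, cap the number of concurrent jobs: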
    para -maxJob=16 push
    para time > run.time
# Completed: 3308 of 3308 jobs
# CPU time in finished jobs:     647954s   10799.23m   179.99h    7.50d  0.021 y
# IO & Wait Time:                 22374s     372.90m     6.22h    0.26d  0.001 y
# Average job time:                 203s       3.38m     0.06h    0.00d
# Longest finished job:             349s       5.82m     0.10h    0.00d
# Submission to last job:          3226s      53.77m     0.90h    0.04d

    mkdir downloads

    time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'`
do
    echo "working: $D" 1>&2
    find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
        | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
        | gzip -c > downloads/${D}.phyloP241way.wigFix.gz
done
    # real    48m50.219s

    du -hsc downloads
    # 4.6G   downloads

    # check integrity of data with wigToBigWig
    time (zcat downloads/*.wigFix.gz \
        | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \
            phyloP241way.bw) > bigWig.log 2>&1
    egrep "real|VmPeak" bigWig.log
    # pid=66292: VmPeak:    33751268 kB
    # real    43m40.194s

    bigWigInfo phyloP241way.bw | sed -e 's/^/# /;'
# version: 4
# isCompressed: yes
# isSwapped: 0
# primaryDataSize: 6,304,076,591
# primaryIndexSize: 93,404,704
# zoomLevels: 10
# chromCount: 355
# basesCovered: 2,955,660,581
# mean: 0.097833
# min: -20.000000
# max: 1.312000
# std: 0.727453

    # encode those files into wiggle data
    time (zcat downloads/*.wigFix.gz \
        | wigEncode stdin phyloP241way.wig phyloP241way.wib)
    # Converted stdin, upper limit 1.31, lower limit -20.00
    # real    17m36.880s

    # -rw-rw-r-- 1 2955660581 Nov  6 14:10 phyloP241way.wib
    # -rw-rw-r-- 1  304274846 Nov  6 14:10 phyloP241way.wig

    du -hsc *.wi?
    # 2.8G   phyloP241way.wib
    # 291M   phyloP241way.wig

    # Load gbdb and database with wiggle.
    ln -s `pwd`/phyloP241way.wib /gbdb/hg38/cactus241way/phyloP241way.wib
    time hgLoadWiggle -pathPrefix=/gbdb/hg38/cactus241way hg38 \
        phyloP241way phyloP241way.wig
    # real    0m30.538s

    # use this to set trackDb.ra entries for wiggle min and max,
    # and to verify the table is loaded correctly
    wigTableStats.sh hg38 phyloP241way
    # db.table          min max   mean      count      sumData
    # hg38.phyloP241way -20 1.312 0.0978331 2955660581 2.89162e+08
    #     stdDev   viewLimits
    #   0.727453   viewLimits=-3.53943:1.312

    # that range is: 20 + 1.312 = 21.312, hence for 1000 bins
    # hBinSize = 21.312/1000 = 0.021312

    #  Create histogram to get an overview of all the data
    time hgWiggle -doHistogram \
        -hBinSize=0.021312 -hBinCount=1000 -hMinVal=-20 -verbose=2 \
            -db=hg38 phyloP241way > histogram.data 2>&1
    # real    2m43.313s

    # xaxis range:
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \
        | sed -e 's/^/# /;'
    # Q1 -10.953050
    # median -6.861155
    # Q3 -2.769245
    # average -6.875971
    # min -20.000000
    # max 1.312000
    # count 768
    # total -5280.745380
    # standard deviation 4.757034

    # find out the range for the 2:5 graph
    grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \
        | sed -e 's/^/# /;'
    # Q1 0.000000
    # median 0.000001
    # Q3 0.000140
    # average 0.001302
    # min 0.000000
    # max 0.023556
    # count 768
    # total 0.999975
    # standard deviation 0.003490

    # create plot of histogram:
    printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \
"/usr/share/fonts/default/Type1/n022004l.pfb"
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Human hg38 Histogram phyloP241way track"
set xlabel " phyloP241way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set xrange [-5:1.5]
set yrange [0:0.04]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
"histogram.data" using 2:7 axes x1y2 title " CRF" with lines
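# (impulses: per-bin relative frequency on the left axis;
#  CRF: cumulative relative frequency, accumulating to 1.0 on the y2 axis)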
' | gnuplot > histo.png

    # verify it looks sane
    display histo.png &

#############################################################################
# construct download files for 241-way (TBD - 2015-04-15 - Hiram)
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons241way
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP241way
    mkdir /hive/data/genomes/hg38/bed/cactus241way/downloads
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads
    mkdir cactus241way phastCons241way phyloP241way

#########################################################################
## create upstream gene maf files (ncbiRefSeq, per the result files below)
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/cactus241way

    # bash script
#!/bin/sh
export geneTbl="ncbiRefSeq"
for S in 300 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits hg38 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | /cluster/bin/$MACHTYPE/mafFrags hg38 cactus241way \
            stdin stdout \
            -orgs=/hive/data/genomes/hg38/bed/cactus241way/species.list \
        | gzip -c > upstream${S}.${geneTbl}.maf.gz
    echo "done upstream${S}.${geneTbl}.maf.gz"
done
    # real    88m40.730s

    # -rw-rw-r-- 1   52659159 Nov  6 11:46 upstream300.ncbiRefSeq.maf.gz
    # -rw-rw-r-- 1  451126665 Nov  6 12:15 upstream2000.ncbiRefSeq.maf.gz
    # -rw-rw-r-- 1 1080533794 Nov  6 12:55 upstream5000.ncbiRefSeq.maf.gz

######################################################################
## compress the maf files
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/cactus241way
    mkdir maf
    rsync -a -P ../../ucscNames/ ./maf/
    du -hsc maf/
    # 156G   maf

    cd maf
    time gzip *.maf &
    # real    135m1.784s
    cd ..

    du -hscL maf ../../ucscNames/
    # 18G    maf

    cd maf
    md5sum *.maf.gz *.nh > md5sum.txt

    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/maf
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/maf
    cd ..

    ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/

###########################################################################
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/cactus241way

    grep TREE ../../4d/all.mod | awk '{print $NF}' \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
        > hg38.241way.nh

    ~/kent/src/hg/utils/phyloTrees/commonNames.sh hg38.241way.nh \
        | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
        > hg38.241way.commonNames.nh

    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh hg38.241way.nh \
        | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
        > hg38.241way.scientificNames.nh

    time md5sum *.nh *.maf.gz > md5sum.txt
    # real    0m3.147s

    ln -s `pwd`/*.maf.gz `pwd`/*.nh \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way

    du -hsc ./maf ../../ucscNames
    # 18G    ./maf
    # 156G   ../../ucscNames

    # obtain the README.txt from hg38/multiz20way and update for this
    # situation
    ln -s `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/cactus241way/

#####################################################################
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/phastCons241way
    mkdir hg38.241way.phastCons
    cd hg38.241way.phastCons
    ln -s ../../../cons/all/downloads/*.wigFix.gz .
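    # record checksums so download users can verify the files: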
    md5sum *.gz > md5sum.txt

    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/phastCons241way
    ln -s ../../cons/all/phastCons241way.bw ./hg38.phastCons241way.bw
    ln -s ../../cons/all/all.mod ./hg38.phastCons241way.mod
    time md5sum *.mod *.bw > md5sum.txt
    # real    0m20.354s

    # obtain the README.txt from hg38/phastCons20way and update for this
    # situation
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons241way/hg38.241way.phastCons
    cd hg38.241way.phastCons
    ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons241way/hg38.241way.phastCons
    cd ..

    ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phastCons241way

#####################################################################
    cd /hive/data/genomes/hg38/bed/cactus241way/downloads/phyloP241way
    mkdir hg38.241way.phyloP
    cd hg38.241way.phyloP
    ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz .
    md5sum *.wigFix.gz > md5sum.txt
    cd ..

    ln -s ../../consPhyloP/run.phyloP/all.mod hg38.phyloP241way.mod
    ln -s ../../consPhyloP/all/phyloP241way.bw hg38.phyloP241way.bw
    md5sum *.mod *.bw > md5sum.txt

    # obtain the README.txt from hg38/phyloP20way and update for this
    # situation
    mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP241way/hg38.241way.phyloP
    cd hg38.241way.phyloP
    ln -s `pwd`/* \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP241way/hg38.241way.phyloP
    cd ..

    ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
        /usr/local/apache/htdocs-hgdownload/goldenPath/hg38/phyloP241way

#############################################################################
# hgPal downloads (TBD - 2017-11-06 - Hiram)
#   FASTA from 241-way for ncbiRefSeq, refGene and knownCanonical

    ssh hgwdev
    screen -S hg38HgPal
    mkdir /hive/data/genomes/hg38/bed/cactus241way/pal
    cd /hive/data/genomes/hg38/bed/cactus241way/pal
    cat ../species.list | tr '[ ]' '[\n]' > order.list

    ### knownCanonical with full CDS
    cd /hive/data/genomes/hg38/bed/cactus241way/pal
    export mz=cactus241way
    export gp=knownCanonical
    export db=hg38
    mkdir protAA protNuc knownCanonical

    time cut -f1 ../../../chrom.sizes | while read C
do
    echo $C 1>&2
    hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed
done

    ls knownCanonical/*.known.bed | while read F
do
    if [ -s $F ]; then
        echo $F | sed -e 's#knownCanonical/##; s/.known.bed//'
    fi
done | while read C
do
    echo "date"
    echo "mafGene -geneBeds=knownCanonical/$C.known.bed -noTrans $db $mz ncbiRefSeq order.list stdout | \
        gzip -c > protNuc/$C.protNuc.fa.gz"
    echo "mafGene -geneBeds=knownCanonical/$C.known.bed $db $mz ncbiRefSeq order.list stdout | \
        gzip -c > protAA/$C.protAA.fa.gz"
done > $gp.$mz.prot.jobs

    time sh -x $gp.$mz.prot.jobs > $gp.$mz.prot.job.log 2>&1
    # 267m58.813s

    rm *.known.bed
    export mz=cactus241way
    export gp=knownCanonical
    export db=hg38
    zcat protAA/c*.gz | gzip -c > $gp.$mz.protAA.fa.gz &
    zcat protNuc/c*.gz | gzip -c > $gp.$mz.protNuc.fa.gz &
    # about 6 minutes

    ### knownCanonical broken up by exon
    cd /hive/data/genomes/hg38/bed/cactus241way/pal
    export mz=cactus241way
    export gp=knownCanonical
    export db=hg38
    mkdir exonAA exonNuc knownCanonical

    time cut -f1 ../../../chrom.sizes | while read C
do
    echo $C 1>&2
    hgsql hg38 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed
done
    # real    0m15.897s

    ls knownCanonical/*.known.bed | while read F
do
    if [ -s $F ]; then
        echo $F | sed -e 's#knownCanonical/##; s/.known.bed//'
    fi
done | while read C
do
    echo "date"
    echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz ncbiRefSeq order.list stdout | \
        gzip -c > exonNuc/$C.exonNuc.fa.gz"
    echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz ncbiRefSeq order.list stdout | \
        gzip -c > exonAA/$C.exonAA.fa.gz"
done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1
    # 267m58.813s

    rm *.known.bed
    export mz=cactus241way
    export gp=knownCanonical
    export db=hg38
    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz &
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz &
    # about 6 minutes

    rm -rf exonAA exonNuc

    export mz=cactus241way
    export gp=knownCanonical
    export db=hg38
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/$gp.$mz.protAA.fa.gz $pd/$gp.protAA.fa.gz
    ln -s `pwd`/$gp.$mz.protNuc.fa.gz $pd/$gp.protNuc.fa.gz
    cd $pd
    md5sum *.fa.gz > md5sum.txt

    # ncbiRefSeq
    cd /hive/data/genomes/hg38/bed/cactus241way/pal
    export mz=cactus241way
    export gp=ncbiRefSeq
    export db=hg38
    export I=0
    export D=0
    mkdir exonAA exonNuc
    for C in `sort -nk2 ../../../chrom.sizes | cut -f1`
do
    I=`echo $I | awk '{print $1+1}'`
    D=`echo $D | awk '{print $1+1}'`
    dNum=`echo $D | awk '{printf "%03d", int($1/300)}'`
    mkdir -p exonNuc/${dNum} > /dev/null
    mkdir -p exonAA/${dNum} > /dev/null
    echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &"
    echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &"
    if [ $I -gt 16 ]; then
        echo "date"
        echo "wait"
        I=0
    fi
done > $gp.jobs
    echo "date" >> $gp.jobs
    echo "wait" >> $gp.jobs

    time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1
    # real    79m18.323s

    export mz=cactus241way
    export gp=ncbiRefSeq
    time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonAA.fa.gz
    # real    1m28.841s

    time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \
        | gzip -c > $gp.$mz.exonNuc.fa.gz
    # real    3m56.370s

    # -rw-rw-r-- 1 397928833 Nov  6 18:44 ncbiRefSeq.cactus241way.exonAA.fa.gz
    # -rw-rw-r-- 1 580377720 Nov  6 18:49 ncbiRefSeq.cactus241way.exonNuc.fa.gz

    export mz=cactus241way
    export gp=ncbiRefSeq
    export db=hg38
    export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
    mkdir -p $pd
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
    ln -s `pwd`/md5sum.txt $pd/
    cd $pd
    md5sum *.fa.gz > md5sum.txt

    rm -rf exonAA exonNuc

#############################################################################
# wiki page for 241-way (TBD - 2017-11-06 - Hiram)
    mkdir /hive/users/hiram/bigWays/hg38.241way
    cd /hive/users/hiram/bigWays
    echo "hg38" > hg38.241way/ordered.list
    awk '{print $1}' /hive/data/genomes/hg38/bed/cactus241way/241way.distances.txt \
        >> hg38.241way/ordered.list

    # sizeStats.sh catches up the cached measurements required for data
    # in the tables.  They are usually already mostly done, only new
    # assemblies will have updates.
    ./sizeStats.sh hg38.241way/ordered.list

    # dbDb.sh constructs hg38.241way/Hg38_241-way_conservation_alignment.html
    # may need to add new assembly references to srcReference.list and
    # urlReference.list
    ./dbDb.sh hg38 241way

    # sizeStats.pl constructs hg38.241way/Hg38_241-way_Genome_size_statistics.html
    # this requires entries in coverage.list for new sequences
    ./sizeStats.pl hg38 241way

    # defCheck.pl constructs Hg38_241-way_conservation_lastz_parameters.html
    ./defCheck.pl hg38 241way

    # this constructs the html pages in hg38.241way/:
    # -rw-rw-r-- 1 6247 May  2 17:07 Hg38_241-way_conservation_alignment.html
    # -rw-rw-r-- 1 8430 May  2 17:09 Hg38_241-way_Genome_size_statistics.html
    # -rw-rw-r-- 1 5033 May  2 17:10 Hg38_241-way_conservation_lastz_parameters.html

    # add those pages to the genomewiki.  Their page names are the
    # names of the .html files without the .html:
    #   Hg38_241-way_conservation_alignment
    #   Hg38_241-way_Genome_size_statistics
    #   Hg38_241-way_conservation_lastz_parameters

    # when you view the first one you enter, it will have links to the
    # missing two.

############################################################################
# pushQ redmine (TBD - 2017-11-07 - Hiram)
    cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38
    find -L `pwd`/cactus241way `pwd`/phastCons241way `pwd`/phyloP241way \
        /gbdb/hg38/cactus241way -type f \
            > /hive/data/genomes/hg38/bed/cactus241way/downloads/redmine.20216.fileList
    wc -l /hive/data/genomes/hg38/bed/cactus241way/downloads/redmine.20216.fileList
    # 1450 /hive/data/genomes/hg38/bed/cactus241way/downloads/redmine.20216.fileList

    cd /hive/data/genomes/hg38/bed/cactus241way/downloads

    hgsql -e 'show tables;' hg38 | grep 241way \
        | sed -e 's/^/hg38./;' > redmine.20216.table.list

############################################################################
+## adding frames ( DONE - 2021-04-29 - Hiram )
+
+    mkdir /hive/data/genomes/hg38/bed/cactus241way/frames
+    cd /hive/data/genomes/hg38/bed/cactus241way/frames
+    mkdir genes
+
+    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg38 \
+      | genePredSingleCover stdin stdout | gzip -2c \
+        > genes/hg38.gp.gz
+
+    genePredCheck -db=hg38 genes/hg38.gp.gz
+    # checked: 19328 failed: 0
+
+    ls ../ucscNames | sed -e 's/.maf//;' > chr.list
+    ls genes | sed -e 's/.gp.gz//;' > gene.list
+
+    printf '#!/bin/bash
+
+set -beEu -o pipefail
+
+export C=$1
+export G=$2
+
+cat ../ucscNames/${C}.maf | genePredToMafFrames hg38 stdin stdout \
+    "${G}" genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz
+
+' > runOne
+
+    chmod +x runOne
+
+    printf '#LOOP
+./runOne $(root1) $(root2) parts/$(root1).$(root2).mafFrames.gz
+#ENDLOOP
+' > template
+
+    mkdir parts    # output directory written by the runOne jobs
+    gensub2 chr.list gene.list template perl.jobList
+
+    time ($HOME/kent/src/hg/utils/automation/perlPara.pl 4 perl.jobList) \
+        >> do.log 2>&1 &
+
+    tail do.log
+# Completed: 454 of 454 jobs
+# CPU time in finished jobs:      91822s    1530.37m    25.51h    1.06d  0.003y
+# Average job time:                 202s       3.37m     0.06h    0.00d
+# Longest finished job:            7570s     126.17m     2.10h    0.09d
+    # real    417m27.701s
+
+    time find ./parts -type f | while read F
+do
+    echo "${F}" 1>&2
+    zcat ${F}
+done | sort -k1,1 -k2,2n | gzip -c > cactus241wayFrames.bed.gz
+    # real    0m3.178s
+
+    hgLoadMafFrames hg38 cactus241wayFrames cactus241wayFrames.bed.gz
+
+    featureBits -countGaps hg38 cactus241wayFrames
+    # 33621579 bases of 3272116950 (1.028%) in intersection
+
+############################################################################