066250f183599972b357caf9d3261aad59451ad9 hiram Mon Aug 29 16:19:11 2022 -0700 with phyloP histogram refs #29898 diff --git src/hg/makeDb/doc/hg38/multiz470way.txt src/hg/makeDb/doc/hg38/multiz470way.txt index 2d63cee..c6c610c 100644 --- src/hg/makeDb/doc/hg38/multiz470way.txt +++ src/hg/makeDb/doc/hg38/multiz470way.txt @@ -166,35 +166,36 @@ mafToBigMaf hg38 "${F}" stdout >> allOne.maf done # real 97m38.303s # user 84m22.503s # sys 11m37.993s # then sort that file: time $HOME/bin/x86_64/gnusort -S1024G --parallel=64 -k1,1 -k2,2n allOne.maf \ > hg38.470way.bigMaf # real 663m58.660s # user 38m28.432s # sys 300m35.371s # and run bedToBigBed on it: time (bedToBigBed -verbose=2 -itemsPerSlot=4 -type=bed3+1 -as=$HOME/kent/src/hg/lib/bigMaf.as -tab hg38.470way.bigMaf /hive/data/genomes/hg38/chrom.sizes hg38.470way.bb) >> toBb.log 2>&1 + # pid=227524: VmPeak: 5221776 kB + # real 3441m36.866s - XXX #### the bedToBigBed is running Thu Aug 18 12:52:16 PDT 2022 # obtained a phylo tree from Michael Hiller with 508 species --rw-rw-r-- 1 22483 Aug 18 10:42 tree508.nh +# -rw-rw-r-- 1 22483 Aug 18 10:42 tree508.nh ~/kent/src/hg/utils/phyloTrees/speciesList.sh tree508.nh \ | sort > 508.species join -v2 nameLists/native.470.list 508.species > not.in.maf.list /cluster/bin/phast/tree_doctor \ --prune `cat not.in.maf.list | xargs echo | tr ' ' ','` \ tree508.nh | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl \ /dev/stdin > hg38.470way.nh # using TreeGraph2 tree editor on the Mac, rearrange to get hg38 # at the top, and attempt to get the others in phylo order: /cluster/bin/phast/all_dists hg38.470way.nh | grep hg38 \ | sed -e "s/hg38.//" | sort -k2n | sed -e 's/^/#\t/;' | head # panTro6 0.011908 # panPan3 0.011944 @@ -206,31 +207,30 @@ # HLtheGel1 0.066537 # HLmacFas6 0.066886 ... tail # HLpseCor1 0.811597 # HLmacFul1 0.812234 # HLnotEug3 0.815561 # HLospRuf1 0.815802 # HLpseOcc1 0.819866 # macEug2 0.821901 # HLantFla1 0.825093 # HLsarHar2 0.826368 # HLornAna3 1.011442 # HLtacAcu1 1.023432 - # what that looks like: head hg38.470way.nh | sed -e 's/^/# /;' # ((((((((((((((((hg38:0.005962, # (panPan3:0.001895, # panTro6:0.001859):0.004087):0.002083, # gorGor6:0.00958):0.008775, # ponAbe3:0.017185):0.002844, # (HLhylMol2:0.00707, # HLnomLeu4:0.006838):0.013694):0.010338, # ((((((rheMac10:0.001889, # HLmacFus1:0.002063):0.001806, # HLmacFas6:0.00211):0.001812, tail hg38.470way.nh | sed -e 's/^/# /;' @@ -262,136 +262,115 @@ # edited db.to.name.txt to change - to _ in some of the names. # e.g. Crab-eating -> Crab_eating, # the Crab-eating didn't survive the tree_doctor /cluster/bin/phast/tree_doctor --rename "`cat db.comName.txt`" hg38.470way.nh \ | sed -e 's/0\+)/)/g; s/0\+,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ | sed -e "s#_x_#'#g;" > hg38.470way.commonNames.nh /cluster/bin/phast/tree_doctor --rename "`cat nameLists/db.to.sciName.txt`" \ hg38.470way.nh \ | sed -e 's/0\+)/)/g; s/0\+,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ | sed -e "s#_x_#'#g;" > hg38.470way.scientificNames.nh - cat hg38.470way.commonNames.nh | sed -e 's/^/# /;' -# ((((((((((((Human:0.00655, -# Chimp:0.00684):0.00122, -# Bonobo:0.00784):0.003, -# Gorilla:0.008964):0.009693, -# Orangutan:0.01894):0.003471, -# Gibbon:0.02227):0.01204, -# (((((Rhesus:0.003991, -# (Crab_eating_macaque:0.002991, -# Pig_tailed_macaque:0.005):0.001):0.001, -# Sooty_mangabey:0.008):0.005, -# Baboon:0.010042):0.01061, -# (Green_monkey:0.027, -# Drill:0.03):0.002):0.003, -# ((Proboscis_monkey:0.0007, -# Angolan_colobus:0.0008):0.0008, -# (Golden_snub:0.0007, -# Black_snub:0.0007):0.0008):0.018):0.02):0.02183, -# (((Marmoset:0.03, -# Squirrel_monkey:0.01035):0.00865, -# White_faced_sapajou:0.04):0.006, -# Ma's_night_monkey:0.04):0.005):0.05209, -# Tarsier:0.1114):0.02052, -# (((Mouse_lemur:0.0556, -# Coquerel's_sifaka:0.05):0.015, -# (Black_lemur:0.01, -# Sclater's_lemur:0.01):0.015):0.015, -# Bushbaby:0.1194):0.02052):0.015494, -# Mouse:0.356483):0.020593, -# Dog:0.165928):0.023664, -# Armadillo:0.176526); + head hg38.470way.commonNames.nh | sed -e 's/^/# /;' +# ((((((((((((((((human:0.005962, +# (pygmy_chimpanzee:0.001895, +# chimpanzee:0.001859):0.004087):0.002083, +# western_lowland_gorilla:0.00958):0.008775, +# Sumatran_orangutan:0.017185):0.002844, +# (silvery_gibbon:0.00707, +# northern_white_cheeked_gibbon:0.006838):0.013694):0.010338, +# ((((((Rhesus_monkey:0.001889, +# Japanese_macaque:0.002063):0.001806, +# crab_eating_macaque:0.00211):0.001812, + + tail hg38.470way.commonNames.nh | sed -e 's/^/# /;' +# (golden_ringtail_possum:0.017988, +# coppery_ringtail_possum:0.017013):0.023448):0.016208):0.028312):0.010708):0.01957, +# (Tasmanian_wolf:0.042212, +# (yellow_footed_antechinus:0.026702, +# Tasmanian_devil:0.027977):0.032656):0.070372):0.030736, +# (North_American_opossum:0.053914, +# (gray_short_tailed_opossum:0.055461, +# Agile_Gracile_Mouse_Opossum:0.042801):0.004824):0.079265):0.23872):0.526739, +# Australian_echidna:0.070786):0.029398, +# platypus:0.029398); + + # this tree is too large to make a convenient image # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/hg38_470way.png ~/kent/src/hg/utils/phyloTrees/asciiTree.pl hg38.470way.nh > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > hg38.470way.scientificNames.nh rm -f t.nh - cat hg38.470way.scientificNames.nh | sed -e 's/^/# /;' -# ((((((((((((Homo_sapiens:0.00655, -# Pan_troglodytes:0.00684):0.00122, -# Pan_paniscus:0.00784):0.003, -# Gorilla_gorilla_gorilla:0.008964):0.009693, -# Pongo_pygmaeus_abelii:0.01894):0.003471, -# Nomascus_leucogenys:0.02227):0.01204, -# (((((Macaca_mulatta:0.003991, -# (Macaca_fascicularis:0.002991, -# Macaca_nemestrina:0.005):0.001):0.001, -# Cercocebus_atys:0.008):0.005, -# Papio_anubis:0.010042):0.01061, -# (Chlorocebus_sabaeus:0.027, -# Mandrillus_leucophaeus:0.03):0.002):0.003, -# ((Nasalis_larvatus:0.0007, -# Colobus_angolensis_palliatus:0.0008):0.0008, -# (Rhinopithecus_roxellana:0.0007, -# Rhinopithecus_bieti:0.0007):0.0008):0.018):0.02):0.02183, -# (((Callithrix_jacchus:0.03, -# Saimiri_boliviensis:0.01035):0.00865, -# Cebus_capucinus_imitator:0.04):0.006, -# Aotus_nancymaae:0.04):0.005):0.05209, -# Tarsius_syrichta:0.1114):0.02052, -# (((Microcebus_murinus:0.0556, -# Propithecus_coquereli:0.05):0.015, -# (Eulemur_macaco:0.01, -# Eulemur_flavifrons:0.01):0.015):0.015, -# Otolemur_garnettii:0.1194):0.02052):0.015494, -# Mus_musculus:0.356483):0.020593, -# Canis_lupus_familiaris:0.165928):0.023664, -# Dasypus_novemcinctus:0.176526); + head hg38.470way.scientificNames.nh | sed -e 's/^/# /;' +# ((((((((((((((((Homo_sapiens:0.005962, +# (Pan_paniscus:0.001895, +# Pan_troglodytes:0.001859):0.004087):0.002083, +# Gorilla_gorilla_gorilla:0.00958):0.008775, +# Pongo_abelii:0.017185):0.002844, +# (Hylobates_moloch:0.00707, +# Nomascus_leucogenys:0.006838):0.013694):0.010338, +# ((((((Macaca_mulatta:0.001889, +# Macaca_fuscata:0.002063):0.001806, +# Macaca_fascicularis:0.00211):0.001812, + + tail hg38.470way.scientificNames.nh | sed -e 's/^/# /;' +# (Pseudochirops_corinnae:0.017988, +# Pseudochirops_cupreus:0.017013):0.023448):0.016208):0.028312):0.010708):0.01957, +# (Thylacinus_cynocephalus:0.042212, +# (Antechinus_flavipes:0.026702, +# Sarcophilus_harrisii:0.027977):0.032656):0.070372):0.030736, +# (Didelphis_virginiana:0.053914, +# (Monodelphis_domestica:0.055461, +# Gracilinanus_agilis:0.042801):0.004824):0.079265):0.23872):0.526739, +# Tachyglossus_aculeatus:0.070786):0.029398, +# Ornithorhynchus_anatinus:0.029398); /cluster/bin/phast/all_dists hg38.470way.nh | grep hg38 \ | sed -e "s/hg38.//" | sort -k2n > 470way.distances.txt # Use this output to create the table below - cat 470way.distances.txt | sed -e 's/^/# /;' -# panTro5 0.013390 -# panPan2 0.015610 -# gorGor5 0.019734 -# ponAbe2 0.039403 -# nomLeu3 0.046204 -# nasLar1 0.075474 -# rhiBie1 0.075474 -# rhiRox1 0.075474 -# colAng1 0.075574 -# macFas5 0.079575 -# rheMac8 0.079575 -# papAnu3 0.079626 -# macNem1 0.081584 -# cerAty1 0.082584 -# saiBol1 0.087804 -# chlSab2 0.087974 -# manLeu1 0.090974 -# aotNan1 0.102804 -# calJac3 0.107454 -# cebCap1 0.108804 -# eulFla1 0.190934 -# eulMac1 0.190934 -# tarSyr2 0.221294 -# proCoq1 0.2470934 -# micMur3 0.236534 -# otoGar3 0.270334 -# canFam3 0.332429 -# dasNov3 0.366691 -# mm10 0.502391 + head 470way.distances.txt | sed -e 's/^/# /;' +# panTro6 0.011908 +# panPan3 0.011944 +# gorGor6 0.017625 +# ponAbe3 0.034005 +# HLnomLeu4 0.040196 +# HLhylMol2 0.040428 +# macNem1 0.066302 +# HLtheGel1 0.066537 +# HLmacFas6 0.066886 +# HLcerMon1 0.067960 + + tail 470way.distances.txt | sed -e 's/^/# /;' +# HLpseCor1 0.811597 +# HLmacFul1 0.812234 +# HLnotEug3 0.815561 +# HLospRuf1 0.815802 +# HLpseOcc1 0.819866 +# macEug2 0.821901 +# HLantFla1 0.825093 +# HLsarHar2 0.826368 +# HLornAna3 1.011442 +# HLtacAcu1 1.023432 printf '#!/usr/bin/env perl use strict; use warnings; open (FH, "<470way.distances.txt") or die "can not read 470way.distances.txt"; my $count = 0; while (my $line = ) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/hg38/bed/lastz.$D/fb.hg38." . @@ -730,31 +709,33 @@ -mergeGap=1500 -maxSize=200000 hg38 multiz470waySummary stdin) # Created 4568973 summary blocks from 850984320 components and 40625470 mafs from stdin # real 49m52.561s -rw-rw-r-- 1 2171190193 Nov 2 16:40 multiz470way.tab -rw-rw-r-- 1 215756735 Nov 2 17:44 multiz470waySummary.tab wc -l multiz470*.tab # 40625470 multiz470way.tab # 4568973 multiz470waySummary.tab rm multiz470way*.tab ####################################################################### -# GAP ANNOTATE MULTIZ470WAY MAF AND LOAD TABLES (DONE - 2017-11-03 - Hiram) +# GAP ANNOTATE MULTIZ470WAY MAF AND LOAD TABLES (DONE - 2022-08-01 - Hiram) + # XXX the files received from Michael Hiller already had the iRows installed + # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/hg38/bed/multiz470way/anno cd /hive/data/genomes/hg38/bed/multiz470way/anno # check for N.bed files everywhere: for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" @@ -822,31 +803,31 @@ | hgLoadMafSummary -verbose=2 -minSize=470000 \ -mergeGap=1500 -maxSize=200000 hg38 multiz470waySummary stdin) # Created 4568973 summary blocks from 850984320 components and 40655883 mafs from stdin # real 59m27.383s # -rw-rw-r-- 1 2177747201 Nov 2 18:27 multiz470way.tab # -rw-rw-r-- 1 224894681 Nov 3 08:12 multiz470waySummary.tab wc -l multiz470way*.tab # 40655883 multiz470way.tab # 4568973 multiz470waySummary.tab rm multiz470way*.tab ############################################################################## -# MULTIZ7WAY MAF FRAMES (DONE - 2017-11-03 - Hiram) +# MULTIZ7WAY MAF FRAMES (DONE - 2022-08-26 - Hiram) ssh hgwdev mkdir /hive/data/genomes/hg38/bed/multiz470way/frames cd /hive/data/genomes/hg38/bed/multiz470way/frames mkdir genes # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`grep -v HL ../species.list.txt`) echo -n "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then @@ -1113,31 +1094,31 @@ time featureBits -countGaps hg38 multiz470wayFrames # 87979395 bases of 3272116950 (2.689%) in intersection # real 2m42.946s # enable the trackDb entries: # frames https://hgdownload.soe.ucsc.edu/goldenPath/hg38/multiz470way/multiz470wayFrames.bb # irows on # zoom to base level in an exon to see codon displays # appears to work OK # do not need this loaded table: hgsql hg38 -e 'drop table multiz470wayFrames;' ######################################################################### -# Phylogenetic tree from 470-way (DONE - 2013-09-13 - Hiram) +# Phylogenetic tree from 470-way (DONE - 2022-08-26 - Hiram) mkdir /hive/data/genomes/hg38/bed/multiz470way/4d cd /hive/data/genomes/hg38/bed/multiz470way/4d # the annotated maf's are in: ../perChrom/*.maf # using ncbiRefSeq for hg38, only transcribed genes and nothing # from the randoms and other misc. hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" hg38 \ | egrep -E -v "chrM|chrUn|random|_alt|_fix" > ncbiRefSeq.gp wc -l *.gp # 129657 ncbiRefSeq.gp # verify it is only on the chroms: cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' @@ -1711,42 +1692,42 @@ # Submission to last job: 12627s 210.45m 3.51h 0.15d # finished those crashed 24 jobs manually, they took hundreds of Gb # of memory and all day to finish # run phyloP with score=LRT ssh ku mkdir /cluster/data/hg38/bed/multiz470way/consPhyloP cd /cluster/data/hg38/bed/multiz470way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACK ../../4d/all.mod - # BACKGROUND: 0.207173 0.3284701 0.237184 0.227343 + # BACKGROUND: 0.226533 0.344573 0.178804 0.250090 grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' - # 0.565 - /cluster/bin/phast.build/cornellCVS/phast.2010-12-470/bin/modFreqs \ - ../../4d/all.mod 0.565 > all.mod + # 0.523 + /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin/modFreqs \ + ../../4d/all.mod 0.523 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod - # BACKGROUND: 0.217500 0.282500 0.282500 0.217500 + # BACKGROUND: 0.238500 0.261500 0.261500 0.238500 printf '#!/bin/csh -fe -set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-470/bin +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2018-03-29/bin set f = $1 set ssFile = $1:t set out = $2 set cName = $f:h set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/hg38/bed/multiz470way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "$cons/ss/$cName/$ssFile" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null echo source: $ssSrc.ss @@ -1756,159 +1737,158 @@ /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$ssFile.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp ' > doPhyloP.csh chmod +x doPhyloP.csh # Create list of chunks find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list # make sure the list looks good wc -l ss.list - # 34708 ss.list + # 3468 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/hg38/bed/multiz470way/consPhyloP/all cd /hive/data/genomes/hg38/bed/multiz470way/consPhyloP/all - rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overloading the cluster with these quick and high I/O jobs - para -ram=32g create jobList + para -ram=16g create jobList para try ... check ... para -maxJob=16 push para time > run.time - +XXX some of the jobs fail with an error in phyloP # Completed: 34708 of 34708 jobs # CPU time in finished jobs: 647954s 10799.23m 179.99h 7.50d 0.021 y # IO & Wait Time: 22374s 372.90m 6.22h 0.26d 0.001 y # Average job time: 203s 3.38m 0.06h 0.00d # Longest finished job: 349s 5.82m 0.10h 0.00d # Submission to last job: 3226s 53.77m 0.90h 0.04d mkdir downloads time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" 1>&2 find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP470way.wigFix.gz done # real 48m50.219s du -hsc downloads - # 4.6G downloads + # 5.8G downloads # check integrity of data with wigToBigWig time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ phyloP470way.bw) > bigWig.log 2>&1 - egrep "real|VmPeak" bigWig.log # pid=66292: VmPeak: 33751268 kB # real 43m40.194s - bigWigInfo phyloP470way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 -# primaryDataSize: 6,4704,076,591 -# primaryIndexSize: 93,404,704 +# primaryDataSize: 8,691,460,678 +# primaryIndexSize: 135,762,716 # zoomLevels: 10 -# chromCount: 355 -# basesCovered: 2,955,660,581 -# mean: 0.097833 +# chromCount: 466 +# basesCovered: 2,828,476,393 +# mean: -0.004698 # min: -20.000000 -# max: 1.312000 -# std: 0.727453 +# max: 11.936000 +# std: 1.758256 + # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP470way.wig phyloP470way.wib) +XXX - running - Mon Aug 29 15:28:59 PDT 2022 +# Converted stdin, upper limit 11.94, lower limit -20.00 +# real 8m32.643s -# Converted stdin, upper limit 1.31, lower limit -20.00 -# real 17m36.880s -# -rw-rw-r-- 1 2955660581 Nov 6 14:10 phyloP470way.wib -# -rw-rw-r-- 1 4704274846 Nov 6 14:10 phyloP470way.wig +# -rw-rw-r-- 1 2828476393 Aug 29 15:37 phyloP470way.wib +# -rw-rw-r-- 1 446336215 Aug 29 15:37 phyloP470way.wig du -hsc *.wi? - # 2.8G phyloP470way.wib - # 291M phyloP470way.wig + # 2.7G phyloP470way.wib + # 426M phyloP470way.wig # Load gbdb and database with wiggle. ln -s `pwd`/phyloP470way.wib /gbdb/hg38/multiz470way/phyloP470way.wib time hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz470way hg38 \ phyloP470way phyloP470way.wig - # real 0m470.538s + # real 0m51.154s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh hg38 phyloP470way # db.table min max mean count sumData -# hg38.phyloP470way -20 1.312 0.0978331 2955660581 2.89162e+08 +# hg38.phyloP470way -20 11.936 -0.00469849 2828476393 -1.32896e+07 +# 1.75826 viewLimits=-8.79598:8.78658 # stdDev viewLimits -# 0.727453 viewLimits=-3.53943:1.312 - # that range is: 20+1.312= 21.312 for hBinSize=0.021312 + # that range is: 20+11.936= 31.936 for hBinSize=0.031936 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ - -hBinSize=0.021312 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -hBinSize=0.031936 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=hg38 phyloP470way > histogram.data 2>&1 - # real 2m43.313s + # real 1m31.526s # xaxis range: grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ | sed -e 's/^/# /;' -# Q1 -10.9547050 -# median -6.861155 -# Q3 -2.769245 -# average -6.875971 +# Q1 -9.157730 +# median -2.115840 +# Q3 4.894110 +# average -2.132855 # min -20.000000 -# max 1.312000 -# count 768 -# total -5280.745380 -# standard deviation 4.757034 +# max 11.936000 +# count 883 +# total -1883.311115 +# standard deviation 8.140902 # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ | sed -e 's/^/# /;' -# Q1 0.000000 -# median 0.000001 -# Q3 0.000140 -# average 0.0014702 +# Q1 0.000014 +# median 0.000064 +# Q3 0.000437 +# average 0.001133 # min 0.000000 -# max 0.023556 -# count 768 -# total 0.999975 -# standard deviation 0.003490 +# max 0.020000 +# count 883 +# total 1.000052 +# standard deviation 0.003053 # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Human hg38 Histogram phyloP470way track" set xlabel " phyloP470way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set xrange [-5:1.5] @@ -2562,53 +2542,59 @@ # count 764 # total 0.999990 # standard deviation 0.003612 # Q1 0.000000 # median 0.000001 # Q3 0.000140 # average 0.0014702 # min 0.000000 # max 0.023556 # count 768 # total 0.999975 # standard deviation 0.003490 # create plot of histogram: - printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ - - printf 'set terminal pngcairo size 900,400 background "#000000" \\ -font "/usr/share/fonts/default/Type1/n022004l.pfb" -set size 1.4, 0.8 -set key left box + # updated for new gnuplot on hgwdev 2018-11-26 (can't get font to change) + printf 'set terminal pngcairo size 1200,600 background "#000000" font "/usr/share/fonts/default/Type1/n022004l.pfb" +set output "hg38.phyloP470.histo.png" +set size 1.0, 1.0 +set style line 1 lt 2 lc rgb "#ff88ff" lw 2 +set style line 2 lt 2 lc rgb "#66ff66" lw 2 +set style line 3 lt 2 lc rgb "#ffff00" lw 2 +set style line 4 lt 2 lc rgb "#ffffff" lw 2 +set border lc rgb "#ffff00" +set key left box ls 3 +set key tc variable set grid noxtics -set grid ytics -set title " Human hg38 Histogram phyloPSSREV470way track" -set xlabel " phyloPSSREV470way score" -set ylabel " Relative Frequency" -set y2label " Cumulative Relative Frequency (CRF)" -set y2range [0:1] -set y2tics -set xrange [-5:1.5] +set grid ytics ls 4 +set y2tics border nomirror out tc rgb "#ffffff" +set ytics border nomirror out tc rgb "#ffffff" +set title " human/hg38 Histogram phyloP470way track" tc rgb "#ffffff" +set xlabel " hg38.phyloP470way score" tc rgb "#ffffff" +set ylabel " Relative Frequency" tc rgb "#ff88ff" +set y2label " Cumulative Relative Frequency (CRF)" tc rgb "#66ff66" +set xrange [-4:2.5] set yrange [0:0.04] +set y2range [0:1] -plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ - "histogram.data" using 2:7 axes x1y2 title " CRF" with lines -' | gnuplot > histo.png +plot "histogram.data" using 2:5 title " RelFreq" with impulses ls 1, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ls 2 +' | gnuplot # verify it looks sane - display histo.png & + display hg38phyloP470.histo.png & ############################################################################# # sequence logo mkdir /hive/data/genomes/hg38/bed/multiz470way/logo cd /hive/data/genomes/hg38/bed/multiz470way/logo mkdir wigs bw ls ../maf/* | awk '{printf "./makeWig.sh %s ../consPhyloP/all/phyloP470way.bw ../../../chrom.sizes\n", $1}' > jobs cat << '_EOF_' > makeWig.sh maf=$1 scale=$2 chromSizes=$3 outRoot=`basename $1 .maf` mafCounts -scale=$scale $maf wigs/$outRoot