3b2dda0ff5910870d3279ffe52d987d5c5a7bb64 hiram Fri Feb 25 10:55:54 2022 -0800 phyloP experiment with SSREV parameter per user request no redmine diff --git src/hg/makeDb/doc/hg38/multiz30way.txt src/hg/makeDb/doc/hg38/multiz30way.txt index fba1544..e8658d6 100644 --- src/hg/makeDb/doc/hg38/multiz30way.txt +++ src/hg/makeDb/doc/hg38/multiz30way.txt @@ -2071,15 +2071,338 @@ ############################################################################ # pushQ readmine (DONE - 2017-11-07 - Hiram) cd /usr/local/apache/htdocs-hgdownload/goldenPath/hg38 find -L `pwd`/multiz30way `pwd`/phastCons30way `pwd`/phyloP30way \ /gbdb/hg38/multiz30way -type f \ > /hive/data/genomes/hg38/bed/multiz30way/downloads/redmine.20216.fileList wc -l /hive/data/genomes/hg38/bed/multiz30way/downloads/redmine.20216.fileList # 1450 /hive/data/genomes/hg38/bed/multiz30way/downloads/redmine.20216.fileList cd /hive/data/genomes/hg38/bed/multiz30way/downloads hgsql -e 'show tables;' hg38 | grep 30way \ | sed -e 's/^/hg38./;' > redmine.20216.table.list ############################################################################ +# SSREV model phyloFit for 30-way (DONE - 2021-10-26 - Hiram) + mkdir /hive/data/genomes/hg38/bed/multiz30way/4dSSREV + cd /hive/data/genomes/hg38/bed/multiz30way/4dSSREV + + # re-use the mfa file from the first 4d calculation: + + /hive/data/genomes/hg38/bed/multiz30way/4dSSREV + ln -s ../4d/4d.all.mfa . + ln -s ../4d/treeCommas.nh . + + # use phyloFit to create tree model (output is phyloFit.mod) + time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod SSREV \ + --tree tree-commas.nh 4d.all.mfa + # real 4m51.712s + + mv phyloFit.mod all.SSREV.mod + + grep TREE all.SSREV.mod +# ((((((((((((hg38:0.0101822,panTro5:0.0025629):0.00183633, +# panPan2:0.00255247):0.00567321,gorGor5:0.00857341):0.00933067, +# ponAbe2:0.0183779):0.00329369,nomLeu3:0.022487):0.0111263, +# (((((rheMac8:0.00266035,(macFas5:0.0021818, +# macNem1:0.00424292):0.00180259):0.00606931,cerAty1:0.00671163):0.00176629, +# papAnu3:0.00691103):0.00179762,(chlSab2:0.0163484, +# manLeu1:0.00699997):0.0016666):0.00933071,((nasLar1:0.00767798, +# colAng1:0.0163858):0.00172077,(rhiRox1:0.00212606, +# rhiBie1:0.00222663):0.00577192):0.0104159):0.0214006):0.020624, +# (((calJac3:0.0358326,saiBol1:0.0323928):0.00177708, +# cebCap1:0.0283119):0.00205848,aotNan1:0.0232272):0.0378502):0.060658, +# tarSyr2:0.142136):0.0112106,(((micMur3:0.056364, +# proCoq1:0.0388123):0.00536667,(eulMac1:0.00218455, +# eulFla1:0.00228788):0.0410379):0.0371557, +# otoGar3:0.132659):0.0335083):0.0179418,mm10:0.344669):0.0240895, +# canFam3:0.163925):0.0880641,dasNov3:0.0880641); + + # compare these calculated lengths to what was calculated before: + + grep TREE ../4d/all.mod | sed -e 's/TREE: //;' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 \ + | sed -e "s/hg38.//;" | sort > original.dists + + grep TREE all.SSREV.mod | sed -e 's/TREE: //;' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep hg38 \ + | sed -e "s/hg38.//;" | sort > SSREV.dists + + # printing out the 'original', the 'new' the 'difference' and + # percent difference/delta + join original.dists SSREV.dists | awk '{ + printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n +# panTro5 0.012747 0.012745 0.000002 0.015692 +# panPan2 0.014424 0.014571 -0.000147 -1.008853 +# gorGor5 0.026112 0.026265 -0.000153 -0.582524 +# ponAbe2 0.045247 0.045400 -0.000153 -0.337004 +# nomLeu3 0.052648 0.052803 -0.000155 -0.293544 +# manLeu1 0.080673 0.080840 -0.000167 -0.206581 +# papAnu3 0.080660 0.080882 -0.000222 -0.274474 +# rhiRox1 0.081014 0.081157 -0.000143 -0.176202 +# rhiBie1 0.081111 0.081257 -0.000146 -0.179677 +# cerAty1 0.082107 0.082449 -0.000342 -0.414802 +# nasLar1 0.082467 0.082658 -0.000191 -0.231073 +# rheMac8 0.084120 0.084467 -0.000347 -0.410811 +# macFas5 0.085357 0.085791 -0.000434 -0.505881 +# macNem1 0.087416 0.087852 -0.000436 -0.496289 +# chlSab2 0.090031 0.090189 -0.000158 -0.175188 +# colAng1 0.091177 0.091365 -0.000188 -0.205768 +# aotNan1 0.122992 0.123144 -0.000152 -0.123433 +# cebCap1 0.130086 0.130287 -0.000201 -0.154275 +# saiBol1 0.135917 0.136145 -0.000228 -0.167469 +# calJac3 0.139357 0.139585 -0.000228 -0.163341 +# eulMac1 0.247615 0.247821 -0.000206 -0.083125 +# eulFla1 0.247716 0.247925 -0.000209 -0.084300 +# proCoq1 0.248499 0.248778 -0.000279 -0.112148 +# tarSyr2 0.264791 0.264860 -0.000069 -0.026051 +# micMur3 0.266045 0.266330 -0.000285 -0.107010 +# otoGar3 0.300022 0.300102 -0.000080 -0.026658 +# canFam3 0.339655 0.339891 -0.000236 -0.069434 +# dasNov3 0.351919 0.352094 -0.000175 -0.049703 +# mm10 0.496188 0.496546 -0.000358 -0.072098 + +######################################################################### +# phyloP SSREV for 30-way (DONE - 2021-10-26 - Hiram) +# + # re-use the split SS files from before + # split SS files into 1M chunks, this business needs smaller files + # to complete + + mkdir /hive/data/genomes/hg38/bed/multiz30way/phyloP.SSREV + cd /hive/data/genomes/hg38/bed/multiz30way/phyloP.SSREV + + # run phyloP with score=LRT + + mkdir run.phyloP + cd run.phyloP + # Adjust model file base composition background and rate matrix to be + # representative of the chromosomes in play + grep BACK ../../4dSSREV/all.SSREV.mod + # BACKGROUND: 0.207173 0.328301 0.237184 0.227343 + + grep BACKGROUND ../../4dSSREV/all.SSREV.mod \ + | awk '{printf "%0.3f\n", $3 + $4}' + # 0.565 + /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ + ../../4dSSREV/all.SSREV.mod 0.565 > all.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.217500 0.282500 0.282500 0.217500 + + printf '#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set f = $1 +set ssFile = $1:t +set out = $2 +set cName = $f:h +set n = $f:r:e +set grp = $cwd:t +set prevCons = /hive/data/genomes/hg38/bed/multiz30way/consPhyloP +set cons = /hive/data/genomes/hg38/bed/multiz30way/phyloP.SSREV +set tmp = $cons/tmp/$grp/$f +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +set ssSrc = "$prevCons/ss/$cName/$ssFile" +set useGrp = "$grp.mod" +/bin/ln -s $cons/run.phyloP/$grp.mod $tmp +pushd $tmp > /dev/null +$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ + -i SS $useGrp $ssSrc.ss > $ssFile.wigFix +popd > /dev/null +/bin/mkdir -p $out:h +sleep 4 +/bin/touch $out:h +/bin/mv $tmp/$ssFile.wigFix $out +/bin/rm -fr $tmp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp +' > doPhyloP.csh + + chmod +x doPhyloP.csh + + # re-use list of chunks + ln -s ../../consPhyloP/run.phyloP/ss.list . + + # make sure the list looks good + wc -l ss.list + # 3308 ss.list + + # Create template file + # file1 == $chr/$chunk/file name without .ss suffix + printf '#LOOP +../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} +#ENDLOOP +' > template + + ###################### Running all species ####################### + # setup run for all species + mkdir /hive/data/genomes/hg38/bed/multiz30way/phyloP.SSREV/all + cd /hive/data/genomes/hg38/bed/multiz30way/phyloP.SSREV/all + rm -fr wigFix + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + # beware overloading the cluster with these quick and high I/O jobs + para -ram=32g create jobList + para try ... check ... + para push + para time > run.time + +# Completed: 3308 of 3308 jobs +# CPU time in finished jobs: 682943s 11382.39m 189.71h 7.90d 0.022 y +# IO & Wait Time: 22200s 369.99m 6.17h 0.26d 0.001 y +# Average job time: 213s 3.55m 0.06h 0.00d +# Longest finished job: 405s 6.75m 0.11h 0.00d +# Submission to last job: 3562s 59.37m 0.99h 0.04d + + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP30way.wigFix.gz +done + # real 31m52.226s + + du -hsc downloads + # 4.2G downloads + + # check integrity of data with wigToBigWig + time (zcat downloads/*.wigFix.gz \ + | wigToBigWig -verbose=2 stdin /hive/data/genomes/hg38/chrom.sizes \ + phyloP30way.bw) > bigWig.log 2>&1 +XXX - working - Tue Oct 26 12:43:42 PDT 2021 + + + egrep "real|VmPeak" bigWig.log + # pid=78042: VmPeak: 33938800 kB + # real 34m4.165s + + bigWigInfo phyloP30way.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 5,762,987,667 +# primaryIndexSize: 93,404,704 +# zoomLevels: 10 +# chromCount: 355 +# basesCovered: 2,955,660,581 +# mean: 0.100336 +# min: -20.000000 +# max: 1.226000 +# std: 0.723021 + + # encode those files into wiggle data + time (zcat downloads/*.wigFix.gz \ + | wigEncode stdin phyloPSSREV30way.wig phyloPSSREV30way.wib) + +# Converted stdin, upper limit 1.23, lower limit -20.00 +# real 13m31.412s +# -rw-rw-r-- 1 2955660581 Oct 27 12:23 phyloPSSREV30way.wib +# -rw-rw-r-- 1 318831430 Oct 27 12:23 phyloPSSREV30way.wig + + du -hsc *.wi? + # 2.8G phyloP30way.wib + # 305M phyloP30way.wig + + # Load gbdb and database with wiggle. + ln -s `pwd`/phyloPSSREV30way.wib /gbdb/hg38/multiz30way/phyloPSSREV30way.wib + time hgLoadWiggle -pathPrefix=/gbdb/hg38/multiz30way hg38 \ + phyloPSSREV30way phyloPSSREV30way.wig + # real 0m18.359s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh hg38 phyloPSSREV30way +# db.table min max mean count sumData +# hg38.phyloPSSREV30way -20 1.226 0.100336 2955660581 2.96558e+08 +# stdDev viewLimits +# 0.723021 viewLimits=-3.51477:1.226 +# hg38.phyloP30way -20 1.312 0.0978331 2955660581 2.89162e+08 +# 0.727453 viewLimits=-3.53943:1.312 + + # that range is: 20+1.226= 21.226 for hBinSize=0.021226 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.021226 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=hg38 phyloPSSREV30way > histogram.data 2>&1 + # real 1m54.621s + + # xaxis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -10.947100 +# median -6.892945 +# Q3 -2.838780 +# average -6.908920 +# min -20.000000 +# max 1.204770 +# count 764 +# total -5278.414760 +# standard deviation 4.715614 + +# Q1 -10.953050 +# median -6.861155 +# Q3 -2.769245 +# average -6.875971 +# min -20.000000 +# max 1.312000 +# count 768 +# total -5280.745380 +# standard deviation 4.757034 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000000 +# median 0.000001 +# Q3 0.000126 +# average 0.001309 +# min 0.000000 +# max 0.022808 +# count 764 +# total 0.999990 +# standard deviation 0.003612 + +# Q1 0.000000 +# median 0.000001 +# Q3 0.000140 +# average 0.001302 +# min 0.000000 +# max 0.023556 +# count 768 +# total 0.999975 +# standard deviation 0.003490 + + # create plot of histogram: + printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ + + printf 'set terminal pngcairo size 900,400 background "#000000" \\ +font "/usr/share/fonts/default/Type1/n022004l.pfb" +set size 1.4, 0.8 +set key left box +set grid noxtics +set grid ytics +set title " Human hg38 Histogram phyloPSSREV30way track" +set xlabel " phyloPSSREV30way score" +set ylabel " Relative Frequency" +set y2label " Cumulative Relative Frequency (CRF)" +set y2range [0:1] +set y2tics +set xrange [-5:1.5] +set yrange [0:0.04] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines +' | gnuplot > histo.png + + # verify it looks sane + display histo.png & + +#############################################################################