be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master diff --git src/hg/makeDb/doc/mm39/multiz35way.txt src/hg/makeDb/doc/mm39/multiz35way.txt new file mode 100644 index 0000000..9cd7aa7 --- /dev/null +++ src/hg/makeDb/doc/mm39/multiz35way.txt @@ -0,0 +1,2015 @@ +############################################################################# +## 35-Way Multiz (TBD - 2020-12-16 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/mm39/bed/multiz35way + cd /hive/data/genomes/mm39/bed/multiz35way + + # from the 220-way in the source tree, select out the 35 used here: + /cluster/bin/phast/tree_doctor \ + --prune-all-but bosTau9,calJac4,canFam4,criGri1,casCan1,cavPor3,danRer11,echTel2,equCab3,eriEur2,galGal6,galVar1,gorGor6,hg38,loxAfr3,manPen1,mm39,monDom5,neoSch1,ochPri3,oryCun2,otoGar3,oviAri4,panPan3,panTro6,petMar3,rheMac10,rn6,sorAra2,speTri2,susScr3,tarSyr2,tupBel1,turTru2,xenTro9 \ + /cluster/home/hiram/kent/src/hg/utils/phyloTrees/220way.nh \ + | sed -e 's/criGri1/GCF_003668045.3/;' \ + > t0.nh + + # using TreeGraph2 tree editor on the Mac, rearrange to get mm39 + # at the top, and attempt to get the others in phylo order: + /cluster/bin/phast/all_dists t.nh | grep mm39 \ + | sed -e "s/mm39.//" | sort -k2n | sed -e 's/^/#\t/;' +# rn6 0.186098 +# GCF_003668045.3 0.332282 +# casCan1 0.412432 +# speTri2 0.417900 +# manPen1 0.485701 +# cavPor3 0.491203 +# galVar1 0.493420 +# calJac4 0.494237 +# gorGor6 0.500585 +# hg38 0.502391 +# panPan3 0.502461 +# panTro6 0.502681 +# tarSyr2 0.503897 +# rheMac10 0.510018 +# otoGar3 0.511897 +# equCab3 0.520098 +# turTru2 0.540150 +# canFam4 0.543004 +# tupBel1 0.549623 +# susScr3 0.549974 +# loxAfr3 0.556386 +# oryCun2 0.556860 +# oviAri4 0.574054 +# neoSch1 0.585546 +# bosTau9 0.599054 +# ochPri3 0.643702 +# eriEur2 0.676481 +# echTel2 0.703393 +# sorAra2 0.724258 +# monDom5 0.921254 +# xenTro9 1.214580 +# galGal6 1.326078 +# petMar3 2.253737 +# danRer11 2.279062 + + # what that looks like: +~/kent/src/hg/utils/phyloTrees/asciiTree.pl t.nh > mm39.35way.nh +~/kent/src/hg/utils/phyloTrees/asciiTree.pl mm39.35way.nh | sed -e 's/^/# /;' + +# ((((((((((((((mm39:0.089509, +# rn6:0.096589):0.072773, +# GCF_003668045.3:0.17):0.08015, +# casCan1:0.17):0.05, +# speTri2:0.125468):0.022992, +# cavPor3:0.175779):0.025746, +# (oryCun2:0.114227, +# ochPri3:0.201069):0.101463):0.015313, +# ((((((((hg38:0.00655, +# panTro6:0.00684):0.00122, +# panPan3:0.00784):0.003, +# gorGor6:0.008964):0.025204, +# rheMac10:0.043601):0.02183, +# calJac4:0.04965):0.05209, +# tarSyr2:0.1114):0.02052, +# otoGar3:0.13992):0.013494, +# (tupBel1:0.136203, +# galVar1:0.08):0.054937):0.002):0.020593, +# (((susScr3:0.12, +# (turTru2:0.064688, +# (bosTau9:0.11, +# oviAri4:0.085):0.013592):0.045488):0.02, +# ((equCab3:0.084397, +# manPen1:0.05):0.017, +# (canFam4:0.054458, +# neoSch1:0.097):0.069845):0.008727):0.011671, +# (eriEur2:0.221785, +# sorAra2:0.269562):0.056393):0.021227):0.023664, +# (loxAfr3:0.098929, +# echTel2:0.245936):0.056717):0.234728, +# monDom5:0.285786):0.181168, +# galGal6:0.509442):0.05, +# xenTro9:0.347944):0.211354, +# danRer11:1.201072):0.2, +# petMar3:0.975747); + + # extract species list from that .nh file + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + mm39.35way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ + | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt + + # construct db to name sed translation list: + + cat species.list.txt | while read DB +do +hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest +done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \ + | sed -e "s#'#_x_#g;" > db.to.name.sed + + printf "s/GCF_003668045.3/Chinese_hamster/;\n" >> db.to.name.sed + + sed -f db.to.name.sed mm39.35way.nh | sed -e "s#_x_#'#g; s#X__#X._#;"\ + > mm39.35way.commonNames.nh + + cat mm39.35way.commonNames.nh | sed -e 's/^/# /;' +# ((((((((((((((Mouse:0.089509, +# Rat:0.096589):0.072773, +# Chinese_hamster:0.17):0.08015, +# Beaver:0.17):0.05, +# Squirrel:0.125468):0.022992, +# Guinea_pig:0.175779):0.025746, +# (Rabbit:0.114227, +# Pika:0.201069):0.101463):0.015313, +# ((((((((Human:0.00655, +# Chimp:0.00684):0.00122, +# Bonobo:0.00784):0.003, +# Gorilla:0.008964):0.025204, +# Rhesus:0.043601):0.02183, +# Marmoset:0.04965):0.05209, +# Tarsier:0.1114):0.02052, +# Bushbaby:0.13992):0.013494, +# (Tree_shrew:0.136203, +# Malayan_flying_lemur:0.08):0.054937):0.002):0.020593, +# (((Pig:0.12, +# (Dolphin:0.064688, +# (Cow:0.11, +# Sheep:0.085):0.013592):0.045488):0.02, +# ((Horse:0.084397, +# Chinese_pangolin:0.05):0.017, +# (Dog:0.054458, +# Hawaiian_monk_seal:0.097):0.069845):0.008727):0.011671, +# (Hedgehog:0.221785, +# Shrew:0.269562):0.056393):0.021227):0.023664, +# (Elephant:0.098929, +# Tenrec:0.245936):0.056717):0.234728, +# Opossum:0.285786):0.181168, +# Chicken:0.509442):0.05, +# X._tropicalis:0.347944):0.211354, +# Zebrafish:1.201072):0.2, +# Lamprey:0.975747); + +# Use this specification in the phyloGif tool: +# http://genome.ucsc.edu/cgi-bin/phyloGif +# to obtain a png image for src/hg/htdocs/images/phylo/mm39_35way.png + + cat species.list.txt | while read DB +do +hgsql -N -e "select name,scientificName from dbDb where name=\"${DB}\";" hgcentraltest +done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \ + | sed -e "s#'#_x_#g;" > db.to.sciName.sed + + printf "s/GCF_003668045.3/Cricetulus_griseus/;\n" >> db.to.sciName.sed + + sed -f db.to.sciName.sed mm39.35way.nh > mm39.35way.scientificNames.nh + + > mm39.35way.commonNames.nh + + | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ + > mm39.35way.scientificNames.nh + cat mm39.35way.scientificNames.nh | sed -e 's/^/# /;' +# ((((((((((((((Mus_musculus:0.089509, +# Rattus_norvegicus:0.096589):0.072773, +# Cricetulus_griseus:0.17):0.08015, +# Castor_canadensis:0.17):0.05, +# Spermophilus_tridecemlineatus:0.125468):0.022992, +# Cavia_porcellus:0.175779):0.025746, +# (Oryctolagus_cuniculus:0.114227, +# Ochotona_princeps:0.201069):0.101463):0.015313, +# ((((((((Homo_sapiens:0.00655, +# Pan_troglodytes:0.00684):0.00122, +# Pan_paniscus:0.00784):0.003, +# Gorilla_gorilla_gorilla:0.008964):0.025204, +# Macaca_mulatta:0.043601):0.02183, +# Callithrix_jacchus:0.04965):0.05209, +# Tarsius_syrichta:0.1114):0.02052, +# Otolemur_garnettii:0.13992):0.013494, +# (Tupaia_belangeri:0.136203, +# Galeopterus_variegatus:0.08):0.054937):0.002):0.020593, +# (((Sus_scrofa:0.12, +# (Tursiops_truncatus:0.064688, +# (Bos_taurus:0.11, +# Ovis_aries:0.085):0.013592):0.045488):0.02, +# ((Equus_caballus:0.084397, +# Manis_pentadactyla:0.05):0.017, +# (Canis_lupus_familiaris:0.054458, +# Neomonachus_schauinslandi:0.097):0.069845):0.008727):0.011671, +# (Erinaceus_europaeus:0.221785, +# Sorex_araneus:0.269562):0.056393):0.021227):0.023664, +# (Loxodonta_africana:0.098929, +# Echinops_telfairi:0.245936):0.056717):0.234728, +# Monodelphis_domestica:0.285786):0.181168, +# Gallus_gallus:0.509442):0.05, +# Xenopus_tropicalis:0.347944):0.211354, +# Danio_rerio:1.201072):0.2, +# Petromyzon_marinus:0.975747); + + /cluster/bin/phast/all_dists mm39.35way.nh | grep mm39 \ + | sed -e "s/mm39.//" | sort -k2n > 35way.distances.txt + # Use this output to create the table below + cat 35way.distances.txt | sed -e 's/^/# /;' +# rn6 0.186098 +# GCF_003668045.3 0.332282 +# casCan1 0.412432 +# speTri2 0.417900 +# manPen1 0.485701 +# cavPor3 0.491203 +# galVar1 0.493420 +# calJac4 0.494237 +# gorGor6 0.500585 +# hg38 0.502391 +# panPan3 0.502461 +# panTro6 0.502681 +# tarSyr2 0.503897 +# rheMac10 0.510018 +# otoGar3 0.511897 +# equCab3 0.520098 +# turTru2 0.540150 +# canFam4 0.543004 +# tupBel1 0.549623 +# susScr3 0.549974 +# loxAfr3 0.556386 +# oryCun2 0.556860 +# oviAri4 0.574054 +# neoSch1 0.585546 +# bosTau9 0.599054 +# ochPri3 0.643702 +# eriEur2 0.676481 +# echTel2 0.703393 +# sorAra2 0.724258 +# monDom5 0.921254 +# xenTro9 1.214580 +# galGal6 1.326078 +# petMar3 2.253737 +# danRer11 2.279062 + + ~/kent/src/hg/makeDb/doc/mm39/sizeStats.pl + +# If you can fill in all the numbers in this table, you are ready for +# the multiple alignment procedure +# count phylo chain synNet rBest synNet v. query +# dist link chain +# 001 0.1861 (% 70.901) (% 66.256) (% 65.494) 6.55 - Rat rn6 +# 002 0.3323 (% 58.000) (% 54.406) (% 00.000) 6.20 - Cricetulus griseus GCF_003668045.3 +# 003 0.4124 (% 36.600) (% 32.404) (% 34.407) 11.46 - Beaver casCan1 +# 004 0.4179 (% 34.147) (% 31.953) (% 32.373) 6.43 - Squirrel speTri2 +# 005 0.4857 (% 27.322) (% 22.622) (% 25.847) 17.20 - Chinese pangolin manPen1 +# 006 0.4912 (% 28.378) (% 26.311) (% 26.932) 7.28 - Guinea pig cavPor3 +# 007 0.4934 (% 35.714) (% 31.386) (% 33.751) 12.12 - Malayan flying lemur galVar1 +# 008 0.4942 (% 33.090) (% 31.297) (% 31.492) 5.42 - Marmoset calJac4 +# 009 0.5006 (% 35.064) (% 33.225) (% 33.360) 5.24 - Gorilla gorGor6 +# 010 0.5024 (% 35.372) (% 33.566) (% 33.646) 5.11 - Human hg38 +# 011 0.5025 (% 35.282) (% 33.492) (% 33.579) 5.07 - Bonobo panPan3 +# 012 0.5027 (% 35.291) (% 33.511) (% 33.608) 5.04 - Chimp panTro6 +# 013 0.5039 (% 32.278) (% 28.908) (% 30.629) 10.44 - Tarsier tarSyr2 +# 014 0.5100 (% 34.838) (% 33.095) (% 33.167) 5.00 - Rhesus rheMac10 +# 015 0.5119 (% 29.729) (% 27.878) (% 28.263) 6.23 - Bushbaby otoGar3 +# 016 0.5201 (% 34.831) (% 33.041) (% 33.077) 5.14 - Horse equCab3 +# 017 0.5402 (% 30.188) (% 24.901) (% 28.769) 17.51 - Dolphin turTru2 +# 018 0.5430 (% 29.376) (% 27.756) (% 27.951) 5.51 - Dog canFam4 +# 019 0.5496 (% 19.691) (% 12.280) (% 18.365) 37.64 - Tree shrew tupBel1 +# 020 0.5500 (% 25.611) (% 23.252) (% 24.292) 9.21 - Pig susScr3 +# 021 0.5564 (% 25.756) (% 23.827) (% 24.416) 7.49 - Elephant loxAfr3 +# 022 0.5569 (% 25.182) (% 23.079) (% 23.777) 8.35 - Rabbit oryCun2 +# 023 0.5741 (% 26.200) (% 24.360) (% 24.783) 7.02 - Sheep oviAri4 +# 024 0.5855 (% 31.257) (% 29.478) (% 29.730) 5.69 - Hawaiian monk seal neoSch1 +# 025 0.5991 (% 26.588) (% 24.846) (% 25.188) 6.55 - Cow bosTau9 +# 026 0.6437 (% 18.559) (% 16.419) (% 17.355) 11.53 - Pika ochPri3 +# 027 0.6765 (% 13.450) (% 09.501) (% 12.323) 29.36 - Hedgehog eriEur2 +# 028 0.7034 (% 14.456) (% 11.161) (% 13.329) 22.79 - Tenrec echTel2 +# 029 0.7243 (% 13.370) (% 10.430) (% 12.301) 21.99 - Shrew sorAra2 +# 030 0.9213 (% 05.382) (% 04.390) (% 04.627) 18.43 - Opossum monDom5 +# 031 1.2146 (% 01.936) (% 00.783) (% 01.354) 59.56 - X. tropicalis xenTro9 +# 032 1.3261 (% 02.537) (% 01.835) (% 02.015) 27.67 - Chicken galGal6 +# 033 2.2537 (% 00.905) (% 00.027) (% 00.517) 97.02 - Lamprey petMar3 +# 034 2.2791 (% 01.416) (% 00.185) (% 00.947) 86.94 - Zebrafish danRer11 +# count phylo chain synNet rBest synNet v. query +# dist link chain + +# None of this concern for distances matters in building the first step, the +# maf files. The distances will be better calibrated later. + + # create species list and stripped down tree for autoMZ + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + mm39.35way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh + + sed 's/[()]//g; s/,/ /g' tree.nh > species.list + cat species.list | fold -s -w 76 | sed -e 's/^/# /;' +# mm39 rn6 GCF_003668045.3 casCan1 speTri2 cavPor3 oryCun2 ochPri3 hg38 +# panTro6 panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 tupBel1 galVar1 +# susScr3 turTru2 bosTau9 oviAri4 equCab3 manPen1 canFam4 neoSch1 eriEur2 +# sorAra2 loxAfr3 echTel2 monDom5 galGal6 xenTro9 danRer11 petMar3 + + # bash shell syntax here ... + cd /hive/data/genomes/mm39/bed/multiz35way + + export H=/hive/data/genomes/mm39/bed + mkdir mafLinks + # good, phylo close assemblies can use syntenic net: + for G in rn6 cavPor3 calJac4 gorGor6 hg38 panPan3 panTro6 rheMac10 \ + equCab3 canFam4 oviAri4 neoSch1 + do + cd $H/multiz35way + rm -fr mafLinks/$G + mkdir mafLinks/$G + cd mafLinks/$G + mafSplit -outDirDepth=0 -byTarget -useFullSequenceName \ + /dev/null . ${H}/lastz.$G/axtChain/mm39.${G}.synNet.maf.gz + gzip *.maf + echo lastz.$G/axtChain/mm39.${G}.synNet.maf.gz + done + + # GCF_003668045.3 is special and was done with a name translation step + # to eliminate dots in the assembly name and in the sequence names + # maf files expect only one dot on the s lines: assembly.sequence + mkdir mafLinks/GCF_003668045v3 + cd mafLinks/GCF_003668045v3 + mafSplit -outDirDepth=0 -byTarget -useFullSequenceName \ + /dev/null . ../../translated/GCF_003668045v3.maf.gz + gzip *.maf + + # assemblies using recip best net: + for G in casCan1 speTri2 manPen1 tarSyr2 otoGar3 turTru2 tupBel1 \ + susScr3 loxAfr3 oryCun2 bosTau9 ochPri3 eriEur2 echTel2 sorAra2 + do + cd $H/multiz35way + rm -fr mafLinks/$G + mkdir mafLinks/$G + cd mafLinks/$G + echo ln -s "lastz.$G/mafRBestNet/"'*' ./mafLinks/$G + ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./ + done + + # and finally, assemblies using the base mafNet + for G in galVar1 monDom5 xenTro9 galGal6 danRer11 petMar3 + do + cd $H/multiz35way + rm -fr mafLinks/$G + mkdir mafLinks/$G + cd mafLinks/$G + echo ln -s "lastz.$G/mafNet/"'*' ./mafLinks/$G + ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./ + done + # verify the symLinks are good, should be 22 primary chromosomes for all: + for D in `ls -d mafLinks/*` + do + printf "%02d\t%s\n" "`ls $D | egrep -v "chrUn|random" | wc -l`" "${D}" + done | sed -e 's/^/#\t/;' +# 22 mafLinks/GCF_003668045.3 +# 22 mafLinks/bosTau9 +# 22 mafLinks/calJac4 +... +# 22 mafLinks/tupBel1 +# 22 mafLinks/turTru2 +# 22 mafLinks/xenTro9 + + # verify the symLinks are good, various other numbers with all chromosomes: + for D in `ls -d mafLinks/*` + do + printf "%02d\t%s\n" "`ls $D | wc -l`" "${D}" + done | sed -e 's/^/#\t/;' | sort -k2,2n +# 25 mafLinks/cavPor3 +# 25 mafLinks/oviAri4 +# 27 mafLinks/canFam4 +... +# 47 mafLinks/speTri2 +# 49 mafLinks/rn6 +# 54 mafLinks/galVar1 + + # Interesting that there are no matches to all the chromosomes by + # any organism. Let's see how many if all are placed together + for D in `ls -d mafLinks/*` + do + ls $D | grep chr | sed -e 's#.*/##;' + done | sort -u | wc -l +# 54 + # seems to be the limit, wonder what is missing: + find ./mafLinks -type f | sed -e 's#.*/##;' \ + | sed -e 's/.maf.gz//;' | sort -u > maf.list.here + comm -23 <(cut -f1 ../../chrom.sizes | sort ) maf.list.here | sed -e 's/^/#\t/;' +# chr1_GL456239v1_random +# chr4_JH584295v1_random +# chrUn_GL456368v1 +# chrUn_GL456370v1 +# chrUn_GL456383v1 +# chrUn_GL456389v1 +# chrUn_GL456390v1 +# chrUn_GL456392v1 +# chrUn_GL456396v1 +# chrUn_MU069435v1 + + # and the sizes of those missed chroms: + comm -23 <(cut -f1 ../../chrom.sizes | sort ) maf.list.here \ +| while read S; do grep "${S}" ../../chrom.sizes; done | sed -e 's/^/#\t/;' +# chr1_GL456239v1_random 40056 +# chr4_JH584295v1_random 1976 +# chrUn_GL456368v1 20208 +# chrUn_GL456370v1 26764 +# chrUn_GL456383v1 38659 +# chrUn_GL456389v1 28772 +# chrUn_GL456390v1 24668 +# chrUn_GL456392v1 23629 +# chrUn_GL456396v1 21240 +# chrUn_MU069435v1 31129 + + # try to do these as full chromosomes without the splitting procedure + # they will work, they just take a really long time for the big chroms + + mkdir /hive/data/genomes/mm39/bed/multiz35way/fullChromRun + cd /hive/data/genomes/mm39/bed/multiz35way/fullChromRun + mkdir maf run + cd run + mkdir penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn + cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn + + find ../../mafLinks -type l | grep ".maf.gz$" | xargs -L 1 basename \ + | sed -e 's/.gz//;' | sort -u > maf.list + + wc -l maf.list + # 54 maf.list + + # set the db and pairs directories here + cat > autoMultiz.csh << '_EOF_' +#!/bin/csh -ef +set db = mm39 +set c = $1 +set result = $2 +set run = `/bin/pwd` +set tmp = /dev/shm/$db/multiz.$c +set pairs = /hive/data/genomes/mm39/bed/multiz35way/mafLinks +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +/usr/bin/sed -e 's/GCF_003668045.3/GCF_003668045v3/;' ../../tree.nh > $tmp/tree.nh +/usr/bin/sed -e 's/GCF_003668045.3/GCF_003668045v3/;' ../../species.list > $tmp/species.list +pushd $tmp > /dev/null +foreach s (`/bin/sed -e "s/$db //" species.list`) + set in = $pairs/$s/$c + set out = $db.$s.sing.maf + if (-e $in.gz) then + /bin/zcat $in.gz > $out + if (! -s $out) then + echo "##maf version=1 scoring=autoMZ" > $out + endif + else if (-e $in) then + /bin/ln -s $in $out + else + echo "##maf version=1 scoring=autoMZ" > $out + endif +end +set path = ($run/penn $path); rehash +$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ + > /dev/null +popd > /dev/null +/bin/rm -f $result +/bin/cp -p $tmp/$c $result +/bin/rm -fr $tmp +'_EOF_' +# << happy emacs + chmod +x autoMultiz.csh + + printf '#LOOP +./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/mm39/bed/multiz35way/fullChromRun/maf/$(root1).maf} +#ENDLOOP +' > template + + ssh ku + cd /hive/data/genomes/mm39/bed/multiz35way/fullChromRun/run + gensub2 maf.list single template jobList + para -ram=64g create jobList +# Completed: 54 of 54 jobs +# CPU time in finished jobs: 1548419s 25806.98m 430.12h 17.92d 0.049 y +# IO & Wait Time: 772s 12.87m 0.21h 0.01d 0.000 y +# Average job time: 28689s 478.15m 7.97h 0.33d +# Longest finished job: 125726s 2095.43m 34.92h 1.46d +# Submission to last job: 125739s 2095.65m 34.93h 1.46d + +####################################################################### +# GAP ANNOTATE MULTIZ30WAY MAF AND LOAD TABLES (TBD - 2017-11-03 - Hiram) + # mafAddIRows has to be run on single chromosome maf files, it does not + # function correctly when more than one reference sequence + # are in a single file. + mkdir -p /hive/data/genomes/mm39/bed/multiz35way/anno + cd /hive/data/genomes/mm39/bed/multiz35way/anno + + # check for N.bed files everywhere: + for DB in `sed -e 's/ GCF_003668045.3//;' ../species.list` +do + if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then + echo "MISS: ${DB}" +# cd /hive/data/genomes/${DB} +# twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed + else + echo " OK: ${DB}" + fi + cd /hive/data/genomes/mm39/bed/multiz35way/anno +done + twoBitInfo -nBed \ +/hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.2bit \ +stdout | sed -e 's/\./v/g;' > GCF_003668045v3.bed + sed -e 's/\./v/g;' /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes \ + > GCF_003668045v3.len + + cd /hive/data/genomes/mm39/bed/multiz35way/anno + for DB in `sed -e 's/ GCF_003668045.3//;' ../species.list` +do + echo "${DB} " + ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed + ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len +done + ls *.bed > nBeds + ls *.len > sizes + # make sure they all are successful symLinks: + ls -ogrtL *.bed | wc -l + # 35 + ls -ogrtL *.len | wc -l + # 35 + wc -l nBeds sizes + # 35 nBeds + # 35 sizes + + screen -S mm39 # use a screen to control this longish job + # do not need to go to ku for this, can run on hgwdev parasol + cd /hive/data/genomes/mm39/bed/multiz35way/anno + mkdir result + + printf '#LOOP +mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/mm39/mm39.2bit {check out line+ result/$(file1)} +#ENDLOOP +' > template + + # tac to get some of the smaller ones to run first so it is easy to verify + # it is running OK + ls ../fullChromRun/maf/*.maf | tac > maf.list + + gensub2 maf.list single template jobList + # good to allow plenty of memory, slows them down and allows the large ones + # to finish + para -ram=64g create jobList + para try ... check ... + para push +# Completed: 54 of 54 jobs +# CPU time in finished jobs: 4649s 77.48m 1.29h 0.05d 0.000 y +# IO & Wait Time: 346s 5.77m 0.10h 0.00d 0.000 y +# Average job time: 93s 1.54m 0.03h 0.00d +# Longest finished job: 382s 6.37m 0.11h 0.00d +# Submission to last job: 1432s 23.87m 0.40h 0.02d + + du -hsc result + # 141G result + + # translate those results back to the GCF hub names: + cd /hive/data/genomes/mm39/bed/multiz35way + mkdir maf + + printf '#LOOP +reverseTrans.sh $(path1) {check out exists+ maf/$(path1).maf} +#ENDLOOP +' > template + + printf '#!/bin/bash + +set -beEu -o pipefail + +export C=$1 +export R=$2 + +~/kent/src/hg/makeDb/mm39/reverseTranslate.pl /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes GCF_003668045.3 anno/result/${C}.maf > ${R} +' > reverseTrans.sh + chmod +x reverseTrans.sh + + ls anno/result | sed -e 's/.maf//;' > chr.result.list + + gensub2 chr.result.list single template reverseTrans.jobList + para create reverseTrans.jobList + para push +# Completed: 54 of 54 jobs +# CPU time in finished jobs: 1784s 29.74m 0.50h 0.02d 0.000 y +# IO & Wait Time: 1880s 31.33m 0.52h 0.02d 0.000 y +# Average job time: 68s 1.13m 0.02h 0.00d +# Longest finished job: 245s 4.08m 0.07h 0.00d +# Submission to last job: 285s 4.75m 0.08h 0.00d + + # Load into database + rm -f /gbdb/mm39/multiz35way/maf/* + cd /hive/data/genomes/mm39/bed/multiz35way/maf + + ln -s `pwd`/*.maf /gbdb/mm39/multiz35way/maf/ + + # this generates an immense multiz35way.tab file in the directory + # where it is running. Best to run this over in scratch. + cd /dev/shm + time hgLoadMaf -pathPrefix=/gbdb/mm39/multiz35way/maf mm39 multiz35way + # Loaded 40655883 mafs in 358 files from /gbdb/mm39/multiz35way/maf + # real 37m27.075s + + # -rw-rw-r-- 1 2177747201 Nov 2 18:27 multiz35way.tab + + time (cat /gbdb/mm39/multiz35way/maf/*.maf \ + | hgLoadMafSummary -verbose=2 -minSize=30000 \ + -mergeGap=1500 -maxSize=200000 mm39 multiz35waySummary stdin) +# Created 4568973 summary blocks from 850984320 components and 40655883 mafs from stdin +# real 59m27.383s + +# -rw-rw-r-- 1 2177747201 Nov 2 18:27 multiz35way.tab +# -rw-rw-r-- 1 224894681 Nov 3 08:12 multiz35waySummary.tab + + wc -l multiz35way*.tab + # 40655883 multiz35way.tab + # 4568973 multiz35waySummary.tab + + rm multiz35way*.tab + +############################################################################## +# MULTIZ7WAY MAF FRAMES (DONE - 2020-12-22 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/mm39/bed/multiz35way/frames + cd /hive/data/genomes/mm39/bed/multiz35way/frames +# survey all the genomes to find out what kinds of gene tracks they have + + printf '#!/bin/csh -fe +foreach db (`sed -e "s/GCF_003668045.3 //;" ../species.list`) + echo -n "# ${db}: " + set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` + foreach table ($tables) + if ($table == "ensGene" || $table == "refGene" || \ + $table == "ncbiRefSeq" || $table == "mgcGenes" || \ + $table == "knownGene" || $table == "xenoRefGene" ) then + set count = `hgsql $db -N -e "select count(*) from $table"` + echo -n "${table}: ${count}, " + endif + end + echo +end +' > showGenes.csh + + chmod +x ./showGenes.csh + time ./showGenes.csh +# mm39: ncbiRefSeq: 130343, refGene: 47741, xenoRefGene: 197343, +# rn6: ensGene: 41078, mgcGenes: 7013, ncbiRefSeq: 69456, refGene: 19160, xenoRefGene: 223156, +# casCan1: ensGene: 38514, ncbiRefSeq: 40013, xenoRefGene: 362113, +# speTri2: ensGene: 33336, ncbiRefSeq: 41886, xenoRefGene: 446239, +# cavPor3: ensGene: 34846, ncbiRefSeq: 44775, refGene: 491, xenoRefGene: 337367 +# oryCun2: ensGene: 51853, ncbiRefSeq: 43655, refGene: 1750, xenoRefGene: 345704 +# ochPri3: ncbiRefSeq: 26253, xenoRefGene: 313971, +# hg38: ensGene: 208239, knownGene: 229647, mgcGenes: 36638, ncbiRefSeq: 170769, refGene: 88819, xenoRefGene: 200365, +# panTro6: ncbiRefSeq: 102471, refGene: 2877, xenoRefGene: 245176, +# panPan3: ncbiRefSeq: 85047, refGene: 218, xenoRefGene: 246746, +# gorGor6: ncbiRefSeq: 61950, refGene: 449, xenoRefGene: 333640, +# rheMac10: ensGene: 64228, ncbiRefSeq: 86732, refGene: 6482, xenoRefGene: 243139, +# calJac4: ncbiRefSeq: 107273, refGene: 237, xenoRefGene: 256337, +# tarSyr2: ensGene: 38314, ncbiRefSeq: 35799, xenoRefGene: 375108, +# otoGar3: ensGene: 28565, ncbiRefSeq: 38532, xenoRefGene: 530019, +# tupBel1: ensGene: 34727, xenoRefGene: 845862, +# galVar1: ncbiRefSeq: 34747, xenoRefGene: 568133, +# susScr3: ensGene: 30585, ncbiRefSeq: 66084, refGene: 5345, xenoRefGene: 477106, +# turTru2: xenoRefGene: 583163, +# bosTau9: ensGene: 43984, ncbiRefSeq: 76369, refGene: 14599, xenoRefGene: 221423, +# oviAri4: ncbiRefSeq: 49730, refGene: 1026, xenoRefGene: 230226, +# equCab3: ensGene: 59087, ncbiRefSeq: 76580, refGene: 1966, xenoRefGene: 311382, +# manPen1: xenoRefGene: 548155, +# canFam4: refGene: 2382, xenoRefGene: 238226, +# neoSch1: ncbiRefSeq: 29897, xenoRefGene: 446526, +# eriEur2: ncbiRefSeq: 29936, xenoRefGene: 315055, +# sorAra2: ncbiRefSeq: 23525, xenoRefGene: 469604, +# loxAfr3: ensGene: 28847, ncbiRefSeq: 46056, refGene: 23, xenoRefGene: 359460, +# echTel2: ncbiRefSeq: 23075, xenoRefGene: 470748, +# monDom5: ensGene: 32358, refGene: 1239, xenoRefGene: 256483, +# galGal6: ensGene: 39288, ncbiRefSeq: 62170, refGene: 7493, xenoRefGene: 151778, +# xenTro9: ensGene: 56323, ncbiRefSeq: 43724, refGene: 8806, xenoRefGene: 164095, +# danRer11: ensGene: 59876, ncbiRefSeq: 65219, refGene: 18968, xenoRefGene: 164771, +# petMar3: xenoRefGene: 200903, + + # and for the hub assembly: GCF_003668045.3_CriGri-PICRH-1.0 + # wc -l trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp + # 61048 trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp + + # from that summary, use these gene sets: + # knownGene - hg38 + # ensGene - tupBel1 monDom5 + # ncbiRefSeq - mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6 +panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9 +oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9 +danRer11 + # xenoRefGene - turTru2 manPen1 canFam4 petMar3 + + + mkdir genes + + # 1. knownGene: hg38 + for DB in hg38 +do + hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > genes/${DB}.gp.gz + genePredCheck -db=${DB} genes/${DB}.gp.gz 2>&1 | sed -e 's/^/ # /;' +done + # checked: 19327 failed: 0 + + # 2. ensGene: tupBel1 monDom5 + for DB in tupBel1 monDom5 +do +hgsql -N -e "select +name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds +from ensGene" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /dev/shm/${DB}.tmp.gz + mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz + echo -n "# ${DB}: " + genePredCheck -db=${DB} genes/${DB}.gp.gz +done +# tupBel1: checked: 29256 failed: 0 +# monDom5: checked: 21033 failed: 0 + + # ncbiRefSeq for the hub: + cut -f1 /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes | while read C +do + printf "s/%s/%s/g;\n" "${C}" "`echo $C | sed -e 's/\./v/;'`" +done > GCF.name.trans.sed + + genePredSingleCover /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/trackData/ncbiRefSeq/process/GCF_003668045.3_CriGri-PICRH-1.0.gp stdout | sed -f GCF.name.trans.sed | gzip -2c > genes/GCF_003668045v3.gp.gz + + sed -e 's/\./v/;' /hive/data/genomes/asmHubs/refseqBuild/GCF/003/668/045/GCF_003668045.3_CriGri-PICRH-1.0/GCF_003668045.3_CriGri-PICRH-1.0.chrom.sizes \ + | genePredCheck -chromSizes=stdin genes/GCF_003668045v3.gp.gz + # checked: 22354 failed: 0 + + # 3. ncbiRefSeq - mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6 +panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9 +oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9 +danRer11 + for DB in mm39 rn6 casCan1 speTri2 cavPor3 oryCun2 ochPri3 panTro6 \ +panPan3 gorGor6 rheMac10 calJac4 tarSyr2 otoGar3 galVar1 susScr3 bosTau9 \ +oviAri4 equCab3 neoSch1 eriEur2 sorAra2 loxAfr3 echTel2 galGal6 xenTro9 \ +danRer11 +do +hgsql -N -e "select +name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds +from ncbiRefSeq" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /dev/shm/${DB}.tmp.gz + mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz + echo -n "# ${DB}: " + genePredCheck -db=${DB} genes/${DB}.gp.gz +done +# mm39: checked: 22051 failed: 0 +# rn6: checked: 22857 failed: 0 +# casCan1: checked: 21652 failed: 0 +# speTri2: checked: 19892 failed: 0 +# cavPor3: checked: 19940 failed: 0 +# oryCun2: checked: 20276 failed: 0 +# ochPri3: checked: 18520 failed: 0 +# panTro6: checked: 21380 failed: 0 +# panPan3: checked: 22261 failed: 0 +# gorGor6: checked: 20581 failed: 0 +# rheMac10: checked: 21021 failed: 0 +# calJac4: checked: 22183 failed: 0 +# tarSyr2: checked: 19968 failed: 0 +# otoGar3: checked: 19536 failed: 0 +# galVar1: checked: 22794 failed: 0 +# susScr3: checked: 24180 failed: 0 +# bosTau9: checked: 21001 failed: 0 +# oviAri4: checked: 20513 failed: 0 +# equCab3: checked: 21079 failed: 0 +# neoSch1: checked: 18783 failed: 0 +# eriEur2: checked: 19258 failed: 0 +# sorAra2: checked: 19160 failed: 0 +# loxAfr3: checked: 21061 failed: 0 +# echTel2: checked: 18790 failed: 0 +# galGal6: checked: 17412 failed: 0 +# xenTro9: checked: 21265 failed: 0 +# danRer11: checked: 32676 failed: 0 + + # 3. xenoRefGene - turTru2 manPen1 canFam4 petMar3 + + for DB in turTru2 manPen1 canFam4 petMar3 +do + hgsql -N -e "select +name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds +from xenoRefGene" ${DB} \ + | genePredSingleCover stdin stdout | gzip -2c \ + > /dev/shm/${DB}.tmp.gz + mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz + echo -n "# ${DB}: " + genePredCheck -db=${DB} genes/${DB}.gp.gz +done +# turTru2: checked: 36500 failed: 0 +# manPen1: checked: 30879 failed: 0 +# canFam4: checked: 20127 failed: 0 +# petMar3: checked: 10819 failed: 0 + + + # verify counts for genes are reasonable: + for T in genes/*.gz +do + echo -n "# $T: " + zcat $T | cut -f1 | sort | uniq -c | wc -l +done +# genes/GCF_003668045v3.gp.gz: 22354 +# genes/bosTau9.gp.gz: 20992 +# genes/calJac4.gp.gz: 22183 +# genes/canFam4.gp.gz: 19154 +# genes/casCan1.gp.gz: 21652 +# genes/cavPor3.gp.gz: 19938 +# genes/danRer11.gp.gz: 29321 +# genes/echTel2.gp.gz: 18790 +# genes/equCab3.gp.gz: 21079 +# genes/eriEur2.gp.gz: 19258 +# genes/galGal6.gp.gz: 17404 +# genes/galVar1.gp.gz: 22794 +# genes/gorGor6.gp.gz: 20579 +# genes/hg38.gp.gz: 19310 +# genes/loxAfr3.gp.gz: 21061 +# genes/manPen1.gp.gz: 26152 +# genes/mm39.gp.gz: 22051 +# genes/monDom5.gp.gz: 21033 +# genes/neoSch1.gp.gz: 18783 +# genes/ochPri3.gp.gz: 18520 +# genes/oryCun2.gp.gz: 20271 +# genes/otoGar3.gp.gz: 19536 +# genes/oviAri4.gp.gz: 20508 +# genes/panPan3.gp.gz: 22261 +# genes/panTro6.gp.gz: 21376 +# genes/petMar3.gp.gz: 9982 +# genes/rheMac10.gp.gz: 21020 +# genes/rn6.gp.gz: 22854 +# genes/sorAra2.gp.gz: 19160 +# genes/speTri2.gp.gz: 19892 +# genes/susScr3.gp.gz: 24043 +# genes/tarSyr2.gp.gz: 19968 +# genes/tupBel1.gp.gz: 15407 +# genes/turTru2.gp.gz: 29711 +# genes/xenTro9.gp.gz: 21212 + + # kluster job to annotate each maf file + screen -S mm39 # manage long running procedure with screen + ssh ku + cd /hive/data/genomes/mm39/bed/multiz35way/frames + + printf '#!/bin/csh -fe + +set C = $1 +set G = $2 + +cat ../anno/result/${C}.maf | genePredToMafFrames mm39 stdin stdout \ + ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz +' > runOne + + chmod +x runOne + + ls ../anno/result | sed -e "s/.maf//" > chr.list + ls genes | sed -e "s/.gp.gz//" > gene.list + + printf '#LOOP +runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} +#ENDLOOP +' > template + + mkdir parts + gensub2 chr.list gene.list template jobList + para -ram=64g create jobList + para try ... check ... push +# Completed: 1890 of 1890 jobs +# CPU time in finished jobs: 68240s 1137.33m 18.96h 0.79d 0.002 y +# IO & Wait Time: 62137s 1035.62m 17.26h 0.72d 0.002 y +# Average job time: 69s 1.15m 0.02h 0.00d +# Longest finished job: 372s 6.20m 0.10h 0.00d +# Submission to last job: 1098s 18.30m 0.30h 0.01d + + # collect all results into one file: + cd /hive/data/genomes/mm39/bed/multiz35way/frames + time find ./parts -type f | grep -v GCF | while read F +do + echo "${F}" 1>&2 + zcat ${F} +done | sort -k1,1 -k2,2n > multiz35wayFrames.bed + # real 2m4.953s + time find ./parts -type f | grep GCF | while read F +do + echo "${F}" 1>&2 + zcat ${F} | sed -e 's/GCF_003668045v3/GCF_003668045.3/;' +done | sort -k1,1 -k2,2n >> multiz35wayFrames.bed + + # -rw-rw-r-- 1 592159010 Dec 22 09:24 multiz35wayFrames.bed + + gzip multiz35wayFrames.bed + + # verify there are frames on everything, should be 35 species: + # (count from: ls genes | wc) + zcat multiz35wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ + | sed -e 's/^/# /;' > species.check.list + wc -l species.check.list + # 35 +# 240536 GCF_003668045.3 +# 261904 bosTau9 +# 266491 calJac4 +# 230600 canFam4 +# 232036 casCan1 +# 240423 cavPor3 +# 323912 danRer11 +# 229208 echTel2 +# 274281 equCab3 +# 246825 eriEur2 +# 346334 galGal6 +# 371538 galVar1 +# 245105 gorGor6 +# 237059 hg38 +# 263806 loxAfr3 +# 224191 manPen1 +# 200489 mm39 +# 348971 monDom5 +# 242155 neoSch1 +# 226440 ochPri3 +# 236903 oryCun2 +# 234706 otoGar3 +# 296794 oviAri4 +# 264590 panPan3 +# 263990 panTro6 +# 146858 petMar3 +# 261855 rheMac10 +# 235340 rn6 +# 239030 sorAra2 +# 230420 speTri2 +# 257781 susScr3 +# 231443 tarSyr2 +# 196941 tupBel1 +# 258704 turTru2 +# 322154 xenTro9 + + # load the resulting file + ssh hgwdev + cd /hive/data/genomes/mm39/bed/multiz35way/frames + time hgLoadMafFrames mm39 multiz35wayFrames multiz35wayFrames.bed.gz + # real 1m0.040s + + hgsql -e 'select count(*) from multiz35wayFrames;' mm39 + # +----------+ + # | count(*) | + # +----------+ + # | 8929813 | + # +----------+ + + time featureBits -countGaps mm39 multiz35wayFrames + # 60020650 bases of 2728222451 (2.200%) in intersection + # real 0m43.619s + + # enable the trackDb entries: +# frames multiz35wayFrames +# irows on + # zoom to base level in an exon to see codon displays + # appears to work OK + +######################################################################### +# Phylogenetic tree from 35-way (TBD - 2013-09-13 - Hiram) + mkdir /hive/data/genomes/mm39/bed/multiz35way/4d + cd /hive/data/genomes/mm39/bed/multiz35way/4d + + # the annotated maf's are in: + ../anno/result/*.maf + + # using knownGene for mm39, only transcribed genes and nothing + # from the randoms and other misc. + hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ncbiRefSeq where cdsEnd > cdsStart;" mm39 \ + | egrep -E -v "chrM|chrUn|random|_alt" > ncbiRefSeq.gp + wc -l *.gp + # 95199 ncbiRefSeq.gp + genePredCheck -db=mm39 ncbiRefSeq.gp + # checked: 92405 failed: 0 + + # verify it is only on the chroms: + cut -f2 ncbiRefSeq.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' + # 7748 chr2 + # 7617 chr7 + # 6852 chr11 + # 5889 chr4 + # 5843 chr1 + # 5810 chr5 + # 5514 chr9 + # 4700 chr6 + # 4477 chr8 + # 4357 chr10 + # 4310 chr17 + # 4158 chr3 + # 3615 chr14 + # 3530 chr15 + # 3479 chrX + # 3254 chr12 + # 2968 chr13 + # 2952 chr16 + # 2718 chr19 + # 2301 chr18 + # 313 chrY + + genePredSingleCover ncbiRefSeq.gp stdout | sort > ncbiRefSeqNP.gp + wc -l ncbiRefSeqNP.gp + # 21990 ncbiRefSeqNP.gp + genePredCheck -db=mm39 ncbiRefSeqNP.gp + # checked: 21990 failed: 0 + + ssh ku + mkdir /hive/data/genomes/mm39/bed/multiz35way/4d/run + cd /hive/data/genomes/mm39/bed/multiz35way/4d/run + mkdir ../mfa + + # newer versions of msa_view have a slightly different operation + # the sed of the gp file inserts the reference species in the chr name + cat << '_EOF_' > 4d.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set r = "/hive/data/genomes/mm39/bed/multiz35way" +set c = $1 +set infile = $r/anno/result/$2 +set outfile = $3 +cd /dev/shm +# 'clean' maf, removes all chrom names, leaves only the db name +perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf +awk -v C=$c '$2 == C {print}' $r/4d/ncbiRefSeqNP.gp | sed -e "s/\t$c\t/\tmm39.$c\t/" > $c.gp +set NL=`wc -l $c.gp| gawk '{print $1}'` +if ("$NL" != "0") then + $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss + $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile +else + echo "" > $r/4d/run/$outfile +endif +rm -f $c.gp $c.maf $c.ss +'_EOF_' + # << happy emacs + chmod +x 4d.csh + + ls -1S /hive/data/genomes/mm39/bed/multiz35way/anno/result/*.maf \ + | sed -e "s#.*multiz35way/anno/result/##" \ + | egrep -E -v "chrM|chrUn|random|_alt" > maf.list + + printf '#LOOP +4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + para -ram=64g create jobList + para try ... check ... push ... etc... + para time +# Completed: 21 of 21 jobs +# CPU time in finished jobs: 5711s 95.18m 1.59h 0.07d 0.000 y +# IO & Wait Time: 62s 1.03m 0.02h 0.00d 0.000 y +# Average job time: 275s 4.58m 0.08h 0.00d +# Longest finished job: 470s 7.83m 0.13h 0.01d +# Submission to last job: 471s 7.85m 0.13h 0.01d + + # combine mfa files + ssh hgwdev + cd /hive/data/genomes/mm39/bed/multiz35way/4d + # verify no tiny files: + ls -og mfa | sort -k3nr | tail -2 + # -rw-rw-r-- 1 293090 Dec 22 11:46 chrY.mfa + + #want comma-less species.list + time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ + --aggregate "`sed -e 's/GCF_003668045.3/GCF_003668045v3/;' \ + ../species.list`" mfa/*.mfa | sed s/"> "/">"/ > 4d.all.mfa + # real 0m3.182s + + # check they are all in there: + grep "^>" 4d.all.mfa | wc -l + # 35 + + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + mm39.35way.nh + + sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ + ../mm39.35way.nh | sed -e 's/GCF_003668045.3/GCF_003668045v3/;' \ + > tree-commas.nh + + # use phyloFit to create tree model (output is phyloFit.mod) + time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ + --EM --precision MED --msa-format FASTA --subst-mod REV \ + --tree tree-commas.nh 4d.all.mfa + # real 67m7.237s + + mv phyloFit.mod all.mod + + grep TREE all.mod +TREE: ((((((((((((((mm39:0.0843874,rn6:0.0899985):0.0657921, +GCF_003668045v3:0.106248):0.151018,casCan1:0.135335):0.0230996, +speTri2:0.144923):0.00521622,cavPor3:0.232945):0.026955, +(oryCun2:0.11389,ochPri3:0.195179):0.105379):0.0128213, +((((((((hg38:0.0109653,panTro6:0.00222548):0.00206435, +panPan3:0.00221569):0.00629185,gorGor6:0.00877389):0.0221025, +rheMac10:0.0361164):0.0217292,calJac4:0.0658044):0.0586954, +tarSyr2:0.140238):0.0112568,otoGar3:0.155854):0.0148223, +(tupBel1:0.182689,galVar1:0.106299):0.00938657):0.00641):0.0206, +(((susScr3:0.119945,(turTru2:0.0637542,(bosTau9:0.0311355, +oviAri4:0.0369141):0.0938814):0.0195077):0.0461252, +((equCab3:0.105047,manPen1:0.137597):0.00716862, +(canFam4:0.0869943,neoSch1:0.0695671):0.0650575):0.00523737):0.0112043, +(eriEur2:0.221845,sorAra2:0.280481):0.0694824):0.0210386):0.0285152, +(loxAfr3:0.103064,echTel2:0.240863):0.0478413):0.237009, +monDom5:0.34975):0.214304,galGal6:0.444509):0.287968, +xenTro9:0.766181):0.454886,danRer11:0.852917):0.44051,petMar3:0.44051); + + # compare these calculated lengths to what we started with + + /cluster/bin/phast/all_dists ../mm39.35way.nh | grep mm39 \ + | sed -e "s/mm39.//;" | sort > original.dists + + grep TREE all.mod | sed -e 's/TREE: //;' \ + | /cluster/bin/phast/all_dists /dev/stdin | grep mm39 \ + | sed -e "s/mm39.//; s/GCF_003668045v3/GCF_003668045.3/;" | sort > mm39.dists + + # printing out the 'original', the 'new' the 'difference' and + # percent difference/delta +echo "# sequence old dist new dist difference per-cent diff" + join original.dists mm39.dists | awk '{ + printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%5.2f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' | sort -k4n +# sequence old dist new dist difference per-cent diff + +# rn6 0.186098 0.174386 0.011712 6.72 +# GCF_003668045.3 0.332282 0.256428 0.075854 29.58 +# casCan1 0.412432 0.436532 -0.024100 -5.52 +# speTri2 0.417900 0.469220 -0.051320 -10.94 +# galVar1 0.493420 0.491385 0.002035 0.41 +# panPan3 0.502461 0.512813 -0.010352 -2.02 +# gorGor6 0.500585 0.513080 -0.012495 -2.44 +# panTro6 0.502681 0.514887 -0.012206 -2.37 +# rheMac10 0.510018 0.518320 -0.008302 -1.60 +# hg38 0.502391 0.523627 -0.021236 -4.06 +# calJac4 0.494237 0.526279 -0.032042 -6.09 +# equCab3 0.520098 0.539586 -0.019488 -3.61 +# tarSyr2 0.503897 0.542017 -0.038120 -7.03 +# otoGar3 0.511897 0.546376 -0.034479 -6.31 +# turTru2 0.540150 0.551520 -0.011370 -2.06 +# neoSch1 0.585546 0.561994 0.023552 4.19 +# cavPor3 0.491203 0.562458 -0.071255 -12.67 +# tupBel1 0.549623 0.567775 -0.018152 -3.20 +# loxAfr3 0.556386 0.569310 -0.012924 -2.27 +# manPen1 0.485701 0.572136 -0.086435 -15.11 +# oryCun2 0.556860 0.575737 -0.018877 -3.28 +# canFam4 0.543004 0.579422 -0.036418 -6.29 +# susScr3 0.549974 0.588203 -0.038229 -6.50 +# bosTau9 0.599054 0.612782 -0.013728 -2.24 +# oviAri4 0.574054 0.618561 -0.044507 -7.20 +# ochPri3 0.643702 0.657026 -0.013324 -2.03 +# eriEur2 0.676481 0.702256 -0.025775 -3.67 +# echTel2 0.703393 0.707109 -0.003716 -0.53 +# sorAra2 0.724258 0.760892 -0.036634 -4.81 +# monDom5 0.921254 1.005164 -0.083910 -8.35 +# galGal6 1.326078 1.314227 0.011851 0.90 +# xenTro9 1.214580 1.923867 -0.709287 -36.87 +# danRer11 2.279062 2.465489 -0.186427 -7.56 +# petMar3 2.253737 2.493592 -0.239855 -9.62 + +######################################################################### +# phastCons 35-way (DONE - 2020-12-22 - Hiram) + # split 35way mafs into 10M chunks and generate sufficient statistics + # files for # phastCons + ssh ku + mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/ss + mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/msa.split + cd /hive/data/genomes/mm39/bed/multiz35way/cons/msa.split + + cat << '_EOF_' > doSplit.csh +#!/bin/csh -ef +set c = $1 +set MAF = /hive/data/genomes/mm39/bed/multiz35way/anno/result/$c.maf +set WINDOWS = /hive/data/genomes/mm39/bed/multiz35way/cons/ss/$c +set WC = `cat $MAF | wc -l` +set NL = `grep "^#" $MAF | wc -l` +if ( -s $2 ) then + exit 0 +endif +if ( -s $2.running ) then + exit 0 +endif + +date >> $2.running + +rm -fr $WINDOWS +mkdir $WINDOWS +pushd $WINDOWS > /dev/null +if ( $WC != $NL ) then +/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ + $MAF -i MAF -o SS -r $WINDOWS/$c -w 3000000,0 -I 300 -B 5000 +endif +popd > /dev/null +date >> $2 +rm -f $2.running +'_EOF_' + # << happy emacs + chmod +x doSplit.csh + + printf '#LOOP +doSplit.csh $(root1) {check out line+ $(root1).done} +#ENDLOOP +' > template + +# do the easy ones first to see some immediate results + ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list + # all can finish OK at a 64Gb memory limit + gensub2 maf.list single template jobList + para -ram=64g create jobList + para try ... check ... etc + para push +# Completed: 54 of 54 jobs +# CPU time in finished jobs: 24295s 404.91m 6.75h 0.28d 0.001 y +# IO & Wait Time: 1111s 18.52m 0.31h 0.01d 0.000 y +# Average job time: 470s 7.84m 0.13h 0.01d +# Longest finished job: 2724s 45.40m 0.76h 0.03d +# Submission to last job: 2765s 46.08m 0.77h 0.03d + + # Run phastCons + # This job is I/O intensive in its output files, beware where this + # takes place or do not run too many at once. + ssh ku + mkdir -p /hive/data/genomes/mm39/bed/multiz35way/cons/run.cons + cd /hive/data/genomes/mm39/bed/multiz35way/cons/run.cons + + # This is setup for multiple runs based on subsets, but only running + # the 'all' subset here. + # It triggers off of the current working directory + # $cwd:t which is the "grp" in this script. Running: + # all and vertebrates + + cat << '_EOF_' > doPhast.csh +#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set c = $1 +set f = $2 +set len = $3 +set cov = $4 +set rho = $5 +set grp = $cwd:t +set cons = /hive/data/genomes/mm39/bed/multiz35way/cons +set tmp = $cons/tmp/$f +mkdir -p $tmp +set ssSrc = $cons/ss +set useGrp = "$grp.mod" +if (-s $cons/$grp/$grp.non-inf) then + ln -s $cons/$grp/$grp.mod $tmp + ln -s $cons/$grp/$grp.non-inf $tmp + ln -s $ssSrc/$c/$f.ss $tmp +else + ln -s $ssSrc/$c/$f.ss $tmp + ln -s $cons/$grp/$grp.mod $tmp +endif +pushd $tmp > /dev/null +if (-s $grp.non-inf) then + $PHASTBIN/phastCons $f.ss $useGrp \ + --rho $rho --expected-length $len --target-coverage $cov --quiet \ + --not-informative `cat $grp.non-inf` \ + --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp +else + $PHASTBIN/phastCons $f.ss $useGrp \ + --rho $rho --expected-length $len --target-coverage $cov --quiet \ + --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp +endif +popd > /dev/null +mkdir -p pp/$c bed/$c +sleep 4 +touch pp/$c bed/$c +rm -f pp/$c/$f.pp +rm -f bed/$c/$f.bed +mv $tmp/$f.pp pp/$c +mv $tmp/$f.bed bed/$c +rm -fr $tmp +'_EOF_' + # << happy emacs + chmod +x doPhast.csh + + # this template will serve for all runs + # root1 == chrom name, file1 == ss file name without .ss suffix + printf '#LOOP +../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} +#ENDLOOP +' > template + + ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list + wc -l ss.list + # 930 ss.list + + # Create parasol batch and run it + # run for all species + cd /hive/data/genomes/mm39/bed/multiz35way/cons + mkdir -p all + cd all + # Using the .mod tree + cp -p ../../4d/all.mod ./all.mod + + gensub2 ../run.cons/ss.list single ../run.cons/template jobList + # beware overwhelming the cluster with these fast running high I/O jobs + para -ram=32g create jobList + para try ... check ... + para -maxJob=16 push +# Completed: 930 of 930 jobs +# CPU time in finished jobs: 22874s 381.23m 6.35h 0.26d 0.001 y +# IO & Wait Time: 6698s 111.64m 1.86h 0.08d 0.000 y +# Average job time: 32s 0.53m 0.01h 0.00d +# Longest finished job: 49s 0.82m 0.01h 0.00d +# Submission to last job: 200s 3.33m 0.06h 0.00d + + # create Most Conserved track + cd /hive/data/genomes/mm39/bed/multiz35way/cons/all + time cut -f1 ../../../../chrom.sizes | while read C +do + echo $C 1>&2 + ls -d bed/${C} 2> /dev/null | while read D + do + cat ${D}/${C}*.bed + done | sort -k1,1 -k2,2n \ + | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' +done > tmpMostConserved.bed + # real 0m43.805s + + # -rw-rw-r-- 1 164925477 Dec 22 14:02 tmpMostConserved.bed + + time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ + > mostConserved.bed + # real 0m26.890s + + # -rw-rw-r-- 1 169120947 Dec 22 14:04 mostConserved.bed + + # load into database + ssh hgwdev + cd /hive/data/genomes/mm39/bed/multiz35way/cons/all + time hgLoadBed mm39 phastConsElements35way mostConserved.bed + # Read 4829247 elements of size 5 from mostConserved.bed + # real 0m36.697s + + # Try for 5% overall cov, and 70% CDS cov + # --rho 0.3 --expected-length 45 --target-coverage 0.3 + time featureBits mm39 -enrichment ncbiRefSeq:cds phastConsElements35way +# ncbiRefSeq:cds 1.408%, phastConsElements35way 5.679%, both 0.993%, cover 70.51%, enrich 12.42x +# real 0m31.631s + + + time featureBits mm39 -enrichment refGene:cds phastConsElements35way +# refGene:cds 1.333%, phastConsElements35way 5.679%, both 0.972%, cover 72.87%, enrich 12.83x +# real 0m24.183s + + # Create merged posterier probability file and wiggle track data files + cd /hive/data/genomes/mm39/bed/multiz35way/cons/all + mkdir downloads + + time for D in `ls -d pp/chr* | sed -e 's#pp/##'` +do + echo "working: $D" 1>&2 + find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phastCons35way.wigFix.gz +done + # real 15m34.013s + + # encode those files into wiggle data + time (zcat downloads/*.wigFix.gz \ + | wigEncode stdin phastCons35way.wig phastCons35way.wib) + # Converted stdin, upper limit 1.00, lower limit 0.00 + # real 6m57.150s + + du -hsc *.wi? + # 1.9G phastCons35way.wib + # 223M phastCons35way.wig + + # encode into a bigWig file: + # (warning wigToBigWig process may be too large for memory limits + # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: +export sizeG=188743680 +ulimit -d $sizeG +ulimit -v $sizeG + time (zcat downloads/*.wigFix.gz \ + | wigToBigWig -verbose=2 stdin \ + ../../../../chrom.sizes phastCons35way.bw) > bigWig.log 2>&1 + egrep "VmPeak|real" bigWig.log + # pid=29661: VmPeak: 22610628 kB + # real 21m44.557s + + # -rw-rw-r-- 1 4567105688 Dec 22 16:47 phastCons35way.bw + + bigWigInfo phastCons35way.bw +version: 4 +isCompressed: yes +isSwapped: 0 +primaryDataSize: 3,226,958,317 +primaryIndexSize: 75,088,076 +zoomLevels: 10 +chromCount: 54 +basesCovered: 2,035,330,949 +mean: 0.124503 +min: 0.000000 +max: 1.000000 + + + # if you wanted to use the bigWig file, loading bigWig table: + # but we don't use the bigWig file + mkdir /gbdb/mm39/bbi + ln -s `pwd`/phastCons35way.bw /gbdb/mm39/bbi + hgsql mm39 -e 'drop table if exists phastCons35way; \ + create table phastCons35way (fileName varchar(255) not null); \ + insert into phastCons35way values + ("/gbdb/mm39/bbi/phastCons35way.bw");' + + # Load gbdb and database with wiggle. + ssh hgwdev + cd /hive/data/genomes/mm39/bed/multiz35way/cons/all + ln -s `pwd`/phastCons35way.wib /gbdb/mm39/multiz35way/phastCons35way.wib + time hgLoadWiggle -pathPrefix=/gbdb/mm39/multiz35way mm39 \ + phastCons35way phastCons35way.wig + # real 0m18.870s + + time wigTableStats.sh mm39 phastCons35way +# db.table min max mean count sumData +# mm39.phastCons35way 0 1 0.124503 2035330949 2.53404e+08 +# stdDev viewLimits +# 0.262545 viewLimits=0:1 + # real 0m8.782s + + # Create histogram to get an overview of all the data + ssh hgwdev + cd /hive/data/genomes/mm39/bed/multiz35way/cons/all + time hgWiggle -doHistogram -db=mm39 \ + -hBinSize=0.001 -hBinCount=300 -hMinVal=0.0 -verbose=2 \ + phastCons35way > histogram.data 2>&1 + # real 1m30.048s + + # create plot of histogram: + +XXX - need to update this gnuplot definition: + + printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ +"/usr/share/fonts/default/Type1/n022004l.pfb" +set size 1.4, 0.8 +set key left box +set grid noxtics +set grid ytics +set title " Human Mm39 Histogram phastCons35way track" +set xlabel " phastCons35way score" +set ylabel " Relative Frequency" +set y2label " Cumulative Relative Frequency (CRF)" +set y2range [0:1] +set y2tics +set yrange [0:0.02] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines +' | gnuplot > histo.png + + # take a look to see if it is sane: + + display histo.png & + +######################################################################### +# phyloP for 35-way (DONE - 2020-12-22 - Hiram) +# + # split SS files into 1M chunks, this business needs smaller files + # to complete + + ssh ku + mkdir /hive/data/genomes/mm39/bed/multiz35way/consPhyloP + cd /hive/data/genomes/mm39/bed/multiz35way/consPhyloP + mkdir ss run.split + cd run.split + + printf '#!/bin/csh -ef +set c = $1 +set MAF = /hive/data/genomes/mm39/bed/multiz35way/anno/result/$c.maf +set WINDOWS = /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/ss/$c +set WC = `cat $MAF | wc -l` +set NL = `grep "^#" $MAF | wc -l` +if ( -s $2 ) then + exit 0 +endif +if ( -s $2.running ) then + exit 0 +endif + +date >> $2.running + +rm -fr $WINDOWS +mkdir -p $WINDOWS +pushd $WINDOWS > /dev/null +if ( $WC != $NL ) then +/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ + $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 +endif +popd > /dev/null +date >> $2 +rm -f $2.running +' > doSplit.csh + + chmod +x doSplit.csh + + # do the easy ones first to see some immediate results + ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list + + # this needs a {check out line+ $(root1.done)} test for verification: + printf '#LOOP +./doSplit.csh $(root1) $(root1).done +#ENDLOOP +' > template + + gensub2 maf.list single template jobList + # all can complete successfully at the 64Gb memory limit + para -ram=64g create jobList + para try ... check ... push ... etc... +# Completed: 54 of 54 jobs +# CPU time in finished jobs: 25321s 422.01m 7.03h 0.29d 0.001 y +# IO & Wait Time: 843s 14.06m 0.23h 0.01d 0.000 y +# Average job time: 485s 8.08m 0.13h 0.01d +# Longest finished job: 2873s 47.88m 0.80h 0.03d +# Submission to last job: 2931s 48.85m 0.81h 0.03d + + # run phyloP with score=LRT + ssh ku + mkdir /cluster/data/mm39/bed/multiz35way/consPhyloP + cd /cluster/data/mm39/bed/multiz35way/consPhyloP + + mkdir run.phyloP + cd run.phyloP + # Adjust model file base composition background and rate matrix to be + # representative of the chromosomes in play + grep BACK ../../4d/all.mod + # BACKGROUND: 0.219062 0.338786 0.207231 0.234921 + + grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' + # 0.546 + /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ + ../../4d/all.mod 0.546 > all.mod + # verify, the BACKGROUND should now be paired up: + grep BACK all.mod + # BACKGROUND: 0.217500 0.282500 0.282500 0.217500 + + printf '#!/bin/csh -fe +set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin +set f = $1 +set ssFile = $1:t +set out = $2 +set cName = $f:h +set n = $f:r:e +set grp = $cwd:t +set cons = /hive/data/genomes/mm39/bed/multiz35way/consPhyloP +set tmp = $cons/tmp/$grp/$f +/bin/rm -fr $tmp +/bin/mkdir -p $tmp +set ssSrc = "$cons/ss/$cName/$ssFile" +set useGrp = "$grp.mod" +/bin/ln -s $cons/run.phyloP/$grp.mod $tmp +pushd $tmp > /dev/null +echo source: $ssSrc.ss +$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ + -i SS $useGrp $ssSrc.ss > $ssFile.wigFix +popd > /dev/null +/bin/mkdir -p $out:h +sleep 4 +/bin/touch $out:h +/bin/mv $tmp/$ssFile.wigFix $out +/bin/rm -fr $tmp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp +/bin/rmdir --ignore-fail-on-non-empty $cons/tmp +' > doPhyloP.csh + + chmod +x doPhyloP.csh + + # Create list of chunks + find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list + # make sure the list looks good + wc -l ss.list + # 3308 ss.list + + # Create template file + # file1 == $chr/$chunk/file name without .ss suffix + printf '#LOOP +../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} +#ENDLOOP +' > template + + ###################### Running all species ####################### + # setup run for all species + mkdir /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/all + cd /hive/data/genomes/mm39/bed/multiz35way/consPhyloP/all + rm -fr wigFix + mkdir wigFix + + gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList + # beware overloading the cluster with these quick and high I/O jobs + para -ram=32g create jobList + para try ... check ... + para -maxJob=16 push + para time > run.time +# Completed: 2699 of 2699 jobs +# CPU time in finished jobs: 839140s 13985.67m 233.09h 9.71d 0.027 y +# IO & Wait Time: 17961s 299.35m 4.99h 0.21d 0.001 y +# Average job time: 318s 5.29m 0.09h 0.00d +# Longest finished job: 657s 10.95m 0.18h 0.01d +# Submission to last job: 3669s 61.15m 1.02h 0.04d + + mkdir downloads + time for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` +do + echo "working: $D" 1>&2 + find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ + | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ + | gzip -c > downloads/${D}.phyloP35way.wigFix.gz +done + # real 30m48.598s + + du -hsc downloads + # 3.4G downloads + + # check integrity of data with wigToBigWig + time (zcat downloads/*.wigFix.gz \ + | wigToBigWig -verbose=2 stdin /hive/data/genomes/mm39/chrom.sizes \ + phyloP35way.bw) > bigWig.log 2>&1 + + egrep "real|VmPeak" bigWig.log + # pid=123418: VmPeak: 22609988 kB + # real 22m26.776s + + bigWigInfo phyloP35way.bw | sed -e 's/^/# /;' +# version: 4 +# isCompressed: yes +# isSwapped: 0 +# primaryDataSize: 4,660,484,132 +# primaryIndexSize: 75,089,444 +# zoomLevels: 10 +# chromCount: 53 +# basesCovered: 2,035,330,611 +# mean: 0.110677 +# min: -13.709000 +# max: 4.643000 +# std: 0.833332 + + # encode those files into wiggle data + time (zcat downloads/*.wigFix.gz \ + | wigEncode stdin phyloP35way.wig phyloP35way.wib) +# Converted stdin, upper limit 4.64, lower limit -13.71 +# real 8m41.899s + +# -rw-rw-r-- 1 2035330611 Dec 22 18:40 phyloP35way.wib +# -rw-rw-r-- 1 239980863 Dec 22 18:40 phyloP35way.wig + + du -hsc *.wi? + # 1.9G phyloP35way.wib + # 229M phyloP35way.wig + + # Load gbdb and database with wiggle. + ln -s `pwd`/phyloP35way.wib /gbdb/mm39/multiz35way/phyloP35way.wib + time hgLoadWiggle -pathPrefix=/gbdb/mm39/multiz35way mm39 \ + phyloP35way phyloP35way.wig + # real 0m15.232s + + # use to set trackDb.ra entries for wiggle min and max + # and verify table is loaded correctly + + wigTableStats.sh mm39 phyloP35way +# db.table min max mean count sumData +# mm39.phyloP35way -13.709 4.643 0.110677 2035330611 2.25264e+08 +# stdDev viewLimits +# 0.833332 viewLimits=-4.05598:4.27734 + + # that range is: 13.709+4.643 = 18.352 for hBinSize=0.018352 + + # Create histogram to get an overview of all the data + time hgWiggle -doHistogram \ + -hBinSize=0.018352 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ + -db=mm39 phyloP35way > histogram.data 2>&1 + # real 1m14.295s + + +XXX - ready to graph - Tue Dec 22 21:28:59 PST 2020 + # xaxis range: + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=2 stdin \ + | sed -e 's/^/# /;' +# Q1 -10.953050 +# median -6.861155 +# Q3 -2.769245 +# average -6.875971 +# min -20.000000 +# max 1.312000 +# count 768 +# total -5280.745380 +# standard deviation 4.757034 + + # find out the range for the 2:5 graph + grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ + | sed -e 's/^/# /;' +# Q1 0.000000 +# median 0.000001 +# Q3 0.000140 +# average 0.001302 +# min 0.000000 +# max 0.023556 +# count 768 +# total 0.999975 +# standard deviation 0.003490 + + # create plot of histogram: + printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ +"/usr/share/fonts/default/Type1/n022004l.pfb" +set size 1.4, 0.8 +set key left box +set grid noxtics +set grid ytics +set title " Human mm39 Histogram phyloP35way track" +set xlabel " phyloP35way score" +set ylabel " Relative Frequency" +set y2label " Cumulative Relative Frequency (CRF)" +set y2range [0:1] +set y2tics +set xrange [-5:1.5] +set yrange [0:0.04] + +plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ + "histogram.data" using 2:7 axes x1y2 title " CRF" with lines +' | gnuplot > histo.png + + # verify it looks sane + display histo.png & + +############################################################################# +# construct download files for 35-way (TBD - 2015-04-15 - Hiram) + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way + mkdir /hive/data/genomes/mm39/bed/multiz35way/downloads + cd /hive/data/genomes/mm39/bed/multiz35way/downloads + mkdir multiz35way phastCons35way phyloP35way + + ######################################################################### + ## create upstream refGene maf files + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way + + # Brian had to fix mafFrags so it could work with the assembly GCF + # identifier + + # bash script + +#!/bin/sh +export geneTbl="ncbiRefSeq" +for S in 300 2000 5000 +do + echo "making upstream${S}.maf" + featureBits mm39 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ + | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ + | /cluster/bin/$MACHTYPE/mafFrags mm39 multiz35way \ + stdin stdout \ + -orgs=/hive/data/genomes/mm39/bed/multiz35way/species.list \ + | gzip -c > upstream${S}.${geneTbl}.maf.gz + echo "done upstream${S}.${geneTbl}.maf.gz" +done + # real 117m33.236s + +-rw-rw-r-- 1 96310644 Jan 4 17:04 upstream300.ncbiRefSeq.maf.gz +-rw-rw-r-- 1 948130770 Jan 4 17:41 upstream2000.ncbiRefSeq.maf.gz +-rw-rw-r-- 1 2032740393 Jan 4 18:48 upstream5000.ncbiRefSeq.maf.gz + + # verify sanity: + zgrep "^s " upstream300.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less + + # they all have the same counts, followed by single gene names: + 89641 xenTro9 + 89641 turTru2 + 89641 tupBel1 + 89641 tarSyr2 + 89641 susScr3 +... + 89641 casCan1 + 89641 canFam4 + 89641 calJac4 + 89641 bosTau9 + 89641 GCF_003668045.3 + 1 XM_982184.5 + 1 XM_981747.8 + 1 XM_981711.5 + 1 XM_981599.4 + 1 XM_977914.8 +... etc ... + + # same pattern seen with the 2000 and 5000 upstreams + zgrep "^s " upstream2000.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less + + zgrep "^s " upstream5000.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less + + ###################################################################### + ## compress the maf files + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way + mkdir maf + time rsync -a -P ../../maf/ ./maf/ + # real 12m9.290s + + du -hscL maf/ ../../maf/ + # 141G maf/ + # 141G ../../maf/ + + cd maf + time gzip *.maf & + # about an hour + + du -hscL maf ../../maf/ + # 16G maf + # 141G ../../maf/ + + cd maf + md5sum *.maf.gz > md5sum.txt + + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf + cd maf + ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf + cd -- + ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/ + + ########################################################################### + + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way + grep TREE ../../4d/all.mod | awk '{print $NF}' \ + | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ + > mm39.35way.nh + + sed -f ../../db.to.name.sed mm39.35way.nh \ + | sed -e "s#_x_#'#g; s#X__#X._#;" > mm39.35way.commonNames.nh + + sed -f ../../db.to.sciName.sed mm39.35way.nh \ + > mm39.35way.scientificNames.nh + + time md5sum *.nh *.maf.gz > md5sum.txt + # real 0m3.147s + + ln -s `pwd`/*.maf.gz `pwd`/*.nh \ + /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way + + du -hscL ./maf ../../maf + # 18G ./maf + # 156G ../../maf + + # obtain the README.txt from danRer10/multiz12way and update for this + # situation + ln -s `pwd`/*.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/ + + ##################################################################### + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way + + mkdir mm39.35way.phastCons + cd mm39.35way.phastCons + ln -s ../../../cons/all/downloads/*.wigFix.gz . + md5sum *.gz > md5sum.txt + + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way + ln -s ../../cons/all/phastCons35way.bw ./mm39.phastCons35way.bw + ln -s ../../cons/all/all.mod ./mm39.phastCons35way.mod + time md5sum *.mod *.bw > md5sum.txt + # real 0m20.354s + + # obtain the README.txt from danRer10/phastCons12way and update for this + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons + cd mm39.35way.phastCons + ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons + + cd .. + # situation + ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way + + ##################################################################### + cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phyloP35way + + mkdir mm39.35way.phyloP + cd mm39.35way.phyloP + + ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz . + md5sum *.wigFix.gz > md5sum.txt + + cd .. + + ln -s ../../consPhyloP/run.phyloP/all.mod mm39.phyloP35way.mod + ln -s ../../consPhyloP/all/phyloP35way.bw mm39.phyloP35way.bw + + md5sum *.mod *.bw > md5sum.txt + + # obtain the README.txt from danRer10/phyloP12way and update for this + # situation + mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP + cd mm39.35way.phyloP + ln -s `pwd`/* \ +/usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP + + cd .. + + ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ + /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way + +############################################################################# +# hgPal downloads (DONE - 2020-12-23 - Hiram) +# FASTA from 35-way for ncbiRefSeq, refGene and knownCanonical + + ssh hgwdev + screen -S mm39HgPal + mkdir /hive/data/genomes/mm39/bed/multiz35way/pal + cd /hive/data/genomes/mm39/bed/multiz35way/pal + cat ../species.list | tr '[ ]' '[\n]' > order.list + + # ncbiRefSeq + export mz=multiz35way + export gp=ncbiRefSeq + export db=mm39 + export I=0 + export D=0 + mkdir exonAA exonNuc + for C in `sort -nk2 ../../../chrom.sizes | cut -f1` + do + I=`echo $I | awk '{print $1+1}'` + D=`echo $D | awk '{print $1+1}'` + dNum=`echo $D | awk '{printf "%03d", int($1/300)}'` + mkdir -p exonNuc/${dNum} > /dev/null + mkdir -p exonAA/${dNum} > /dev/null + echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" + echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" + if [ $I -gt 16 ]; then + echo "date" + echo "wait" + I=0 + fi + done > $gp.jobs + echo "date" >> $gp.jobs + echo "wait" >> $gp.jobs + + time (sh -x ./$gp.jobs) > $gp.jobs.log 2>&1 + # real 18m43.962s + + export mz=multiz35way + export gp=ncbiRefSeq + time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ + | gzip -c > $gp.$mz.exonAA.fa.gz + # real 2m0.962s + + time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ + | gzip -c > $gp.$mz.exonNuc.fa.gz + # real 10m12.351s + + # -rw-rw-r-- 1 906052407 Dec 23 16:34 ncbiRefSeq.multiz35way.exonAA.fa.gz + # -rw-rw-r-- 1 1596566489 Dec 23 16:53 ncbiRefSeq.multiz35way.exonNuc.fa.gz + + export mz=multiz35way + export gp=ncbiRefSeq + export db=mm39 + export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments + mkdir -p $pd + md5sum *.fa.gz > md5sum.txt + ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz + ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz + ln -s `pwd`/md5sum.txt $pd/ + + rm -rf exonAA exonNuc + +############################################################################# +# wiki page for 35-way (DONE - 2021-01-11 - Hiram) + mkdir /hive/users/hiram/bigWays/mm39.35way + cd /hive/users/hiram/bigWays + echo "mm39" > mm39.35way/ordered.list +awk '{print $1}' /hive/data/genomes/mm39/bed/multiz35way/35way.distances.txt \ + >> mm39.35way/ordered.list + + # sizeStats.sh catches up the cached measurements required for data + # in the tables. They are usually already mostly done, only new + # assemblies will have updates. + ./sizeStats.sh mm39.35way/ordered.list + # dbDb.sh constructs mm39.35way/XenTro9_35-way_conservation_alignment.html + # may need to add new assembly references to srcReference.list and + # urlReference.list + # Updated this script to work with the GCF accession and finding relevant + # data from the assembly hub files + ./dbDb.sh mm39 35way + + # sizeStats.pl constructs mm39.35way/XenTro9_35-way_Genome_size_statistics.html + # this requires entries in coverage.list for new sequences + ./sizeStats.pl mm39 35way + + # defCheck.pl constructs XenTro9_35-way_conservation_lastz_parameters.html + ./defCheck.pl mm39 35way + + # this constructs the html pages in mm39.35way/: +# -rw-rw-r-- 1 17957 Jan 4 17:56 Mm39_35-way_conservation_alignment.html +# -rw-rw-r-- 1 11991 Jan 4 17:58 Mm39_35-way_conservation_lastz_parameters.html +# -rw-rw-r-- 1 23297 Jan 11 14:58 Mm39_35-way_Genome_size_statistics.html + + # add those pages to the genomewiki. Their page names are the + # names of the .html files without the .html: +# Mm39_35-way_Genome_size_statistics.html +# Mm39_35-way_conservation_alignment.html +# Mm39_35-way_conservation_lastz_parameters.html + + # when you view the first one you enter, it will have links to the + # missing two. + +############################################################################ +# pushQ readmine (DONE - 2021-01-15 - Hiram) + + cd /usr/local/apache/htdocs-hgdownload/goldenPath/mm39 + find -L `pwd`/multiz35way `pwd`/phastCons35way `pwd`/phyloP35way \ + /gbdb/mm39/multiz35way -type f \ + > /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list + wc -l /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list +# 1450 /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.26584.file.list + + cd /hive/data/genomes/mm39/bed/multiz35way/downloads + hgsql -e 'show tables;' mm39 | grep 35way \ + | sed -e 's/^/mm39./;' > redmine.26584.table.list + +############################################################################