a13aa5141f01eff76a3e30cf04b9c2a7710d90b7 hiram Mon Jan 11 16:31:50 2021 -0800 ready for pushQ entry refs #26584 diff --git src/hg/makeDb/doc/mm39/multiz35way.txt src/hg/makeDb/doc/mm39/multiz35way.txt index fbe1705..1893d50 100644 --- src/hg/makeDb/doc/mm39/multiz35way.txt +++ src/hg/makeDb/doc/mm39/multiz35way.txt @@ -1601,36 +1601,34 @@ do echo "working: $D" 1>&2 find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP35way.wigFix.gz done # real 30m48.598s du -hsc downloads # 3.4G downloads # check integrity of data with wigToBigWig time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/mm39/chrom.sizes \ phyloP35way.bw) > bigWig.log 2>&1 -XXX - running - Tue Dec 22 18:30:43 PST 2020 egrep "real|VmPeak" bigWig.log - # pid=66292: VmPeak: 33751268 kB - # real 43m40.194s - + # pid=123418: VmPeak: 22609988 kB + # real 22m26.776s bigWigInfo phyloP35way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 4,660,484,132 # primaryIndexSize: 75,089,444 # zoomLevels: 10 # chromCount: 53 # basesCovered: 2,035,330,611 # mean: 0.110677 # min: -13.709000 # max: 4.643000 # std: 0.833332 @@ -1721,73 +1719,106 @@ # verify it looks sane display histo.png & ############################################################################# # construct download files for 35-way (TBD - 2015-04-15 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way mkdir /hive/data/genomes/mm39/bed/multiz35way/downloads cd /hive/data/genomes/mm39/bed/multiz35way/downloads mkdir multiz35way phastCons35way phyloP35way ######################################################################### ## create upstream refGene maf files cd 
/hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way + + # Brian had to fix mafFrags so it could work with the assembly GCF + # identifier + # bash script #!/bin/sh export geneTbl="ncbiRefSeq" for S in 300 2000 5000 do echo "making upstream${S}.maf" featureBits mm39 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm39 multiz35way \ stdin stdout \ -orgs=/hive/data/genomes/mm39/bed/multiz35way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done -XXX - running - Wed Dec 23 14:36:49 PST 2020 - # real 88m40.730s - --rw-rw-r-- 1 52659159 Nov 6 11:46 upstream300.ncbiRefSeq.maf.gz --rw-rw-r-- 1 451126665 Nov 6 12:15 upstream2000.ncbiRefSeq.maf.gz --rw-rw-r-- 1 1080533794 Nov 6 12:55 upstream5000.ncbiRefSeq.maf.gz + # real 117m33.236s + +-rw-rw-r-- 1 96310644 Jan 4 17:04 upstream300.ncbiRefSeq.maf.gz +-rw-rw-r-- 1 948130770 Jan 4 17:41 upstream2000.ncbiRefSeq.maf.gz +-rw-rw-r-- 1 2032740393 Jan 4 18:48 upstream5000.ncbiRefSeq.maf.gz + + # verify sanity: + zgrep "^s " upstream300.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less + + # they all have the same counts, followed by single gene names: + 89641 xenTro9 + 89641 turTru2 + 89641 tupBel1 + 89641 tarSyr2 + 89641 susScr3 +... + 89641 casCan1 + 89641 canFam4 + 89641 calJac4 + 89641 bosTau9 + 89641 GCF_003668045.3 + 1 XM_982184.5 + 1 XM_981747.8 + 1 XM_981711.5 + 1 XM_981599.4 + 1 XM_977914.8 +... etc ... 
+ + # same pattern seen with the 2000 and 5000 upstreams + zgrep "^s " upstream2000.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less + + zgrep "^s " upstream5000.ncbiRefSeq.maf.gz | awk '{print $2}' \ + | sort | uniq -c | sort -rn | less ###################################################################### ## compress the maf files cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way mkdir maf time rsync -a -P ../../maf/ ./maf/ # real 12m9.290s du -hscL maf/ ../../maf/ # 141G maf/ # 141G ../../maf/ cd maf time gzip *.maf & -XXX - running - Wed Dec 23 14:55:47 PST 2020 - # real 81m10.239s + # about an hour du -hscL maf ../../maf/ - # 18G maf + # 16G maf + # 141G ../../maf/ cd maf - md5sum *.maf.gz *.nh > md5sum.txt + md5sum *.maf.gz > md5sum.txt mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf cd maf ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf cd -- ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/ ########################################################################### cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm39.35way.nh @@ -1795,92 +1826,89 @@ | sed -e "s#_x_#'#g; s#X__#X._#;" > mm39.35way.commonNames.nh sed -f ../../db.to.sciName.sed mm39.35way.nh \ > mm39.35way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m3.147s ln -s `pwd`/*.maf.gz `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way du -hscL ./maf ../../maf # 18G ./maf # 156G ../../maf -XXX # obtain the README.txt from danRer10/multiz12way and update for this # situation ln -s `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/ ##################################################################### cd 
/hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way mkdir mm39.35way.phastCons cd mm39.35way.phastCons ln -s ../../../cons/all/downloads/*.wigFix.gz . md5sum *.gz > md5sum.txt cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way ln -s ../../cons/all/phastCons35way.bw ./mm39.phastCons35way.bw ln -s ../../cons/all/all.mod ./mm39.phastCons35way.mod time md5sum *.mod *.bw > md5sum.txt # real 0m20.354s -XXX - # obtain the README.txt from mm39/phastCons20way and update for this + # obtain the README.txt from danRer10/phastCons12way and update for this mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons cd mm39.35way.phastCons ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons cd .. # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way ##################################################################### cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phyloP35way mkdir mm39.35way.phyloP cd mm39.35way.phyloP ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz . md5sum *.wigFix.gz > md5sum.txt cd .. ln -s ../../consPhyloP/run.phyloP/all.mod mm39.phyloP35way.mod ln -s ../../consPhyloP/all/phyloP35way.bw mm39.phyloP35way.bw md5sum *.mod *.bw > md5sum.txt -XXX - # obtain the README.txt from mm39/phyloP20way and update for this + # obtain the README.txt from danRer10/phyloP12way and update for this + # situation mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP cd mm39.35way.phyloP ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP cd .. 
- # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way ############################################################################# -# hgPal downloads (TBD - 2017-11-06 - Hiram) +# hgPal downloads (DONE - 2020-12-23 - Hiram) # FASTA from 35-way for ncbiRefSeq, refGene and knownCanonical ssh hgwdev screen -S mm39HgPal mkdir /hive/data/genomes/mm39/bed/multiz35way/pal cd /hive/data/genomes/mm39/bed/multiz35way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # ncbiRefSeq export mz=multiz35way export gp=ncbiRefSeq export db=mm39 export I=0 export D=0 mkdir exonAA exonNuc @@ -1919,66 +1947,70 @@ # -rw-rw-r-- 1 1596566489 Dec 23 16:53 ncbiRefSeq.multiz35way.exonNuc.fa.gz export mz=multiz35way export gp=ncbiRefSeq export db=mm39 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# -# wiki page for 35-way (TBD - 2017-11-06 - Hiram) +# wiki page for 35-way (DONE - 2021-01-11 - Hiram) mkdir /hive/users/hiram/bigWays/mm39.35way cd /hive/users/hiram/bigWays echo "mm39" > mm39.35way/ordered.list awk '{print $1}' /hive/data/genomes/mm39/bed/multiz35way/35way.distances.txt \ >> mm39.35way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. 
./sizeStats.sh mm39.35way/ordered.list # dbDb.sh constructs mm39.35way/XenTro9_35-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list + # Updated this script to work with the GCF accession and to find relevant + # data from the assembly hub files ./dbDb.sh mm39 35way + # sizeStats.pl constructs mm39.35way/XenTro9_35-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl mm39 35way # defCheck.pl constructs XenTro9_35-way_conservation_lastz_parameters.html ./defCheck.pl mm39 35way # this constructs the html pages in mm39.35way/: -# -rw-rw-r-- 1 6247 May 2 17:07 XenTro9_35-way_conservation_alignment.html -# -rw-rw-r-- 1 8430 May 2 17:09 XenTro9_35-way_Genome_size_statistics.html -# -rw-rw-r-- 1 5033 May 2 17:10 XenTro9_35-way_conservation_lastz_parameters.html +# -rw-rw-r-- 1 17957 Jan 4 17:56 Mm39_35-way_conservation_alignment.html +# -rw-rw-r-- 1 11991 Jan 4 17:58 Mm39_35-way_conservation_lastz_parameters.html +# -rw-rw-r-- 1 23297 Jan 11 14:58 Mm39_35-way_Genome_size_statistics.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: -# XenTro9_35-way_conservation_alignment -# XenTro9_35-way_Genome_size_statistics -# XenTro9_35-way_conservation_lastz_parameters +# Mm39_35-way_Genome_size_statistics +# Mm39_35-way_conservation_alignment +# Mm39_35-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. 
############################################################################ -# pushQ readmine (TBD - 2017-11-07 - Hiram) +# pushQ redmine (TBD - 2021-01-11 - Hiram) +XXX - ready - Mon Jan 11 16:30:45 PST 2021 cd /usr/local/apache/htdocs-hgdownload/goldenPath/mm39 find -L `pwd`/multiz35way `pwd`/phastCons35way `pwd`/phyloP35way \ /gbdb/mm39/multiz35way -type f \ > /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList wc -l /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList # 1450 /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList cd /hive/data/genomes/mm39/bed/multiz35way/downloads hgsql -e 'show tables;' mm39 | grep 35way \ | sed -e 's/^/mm39./;' > redmine.20216.table.list ############################################################################