a13aa5141f01eff76a3e30cf04b9c2a7710d90b7
hiram
  Mon Jan 11 16:31:50 2021 -0800
ready for pushQ entry refs #26584

diff --git src/hg/makeDb/doc/mm39/multiz35way.txt src/hg/makeDb/doc/mm39/multiz35way.txt
index fbe1705..1893d50 100644
--- src/hg/makeDb/doc/mm39/multiz35way.txt
+++ src/hg/makeDb/doc/mm39/multiz35way.txt
@@ -1601,36 +1601,34 @@
 do
     echo "working: $D" 1>&2
     find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \
 	| sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \
         | gzip -c > downloads/${D}.phyloP35way.wigFix.gz
 done
     # real    30m48.598s
 
     du -hsc downloads
     #   3.4G    downloads
 
     # check integrity of data with wigToBigWig
     time (zcat downloads/*.wigFix.gz \
 	| wigToBigWig -verbose=2 stdin /hive/data/genomes/mm39/chrom.sizes \
 	phyloP35way.bw) > bigWig.log 2>&1
-XXX - running - Tue Dec 22 18:30:43 PST 2020
 
     egrep "real|VmPeak" bigWig.log
-    # pid=66292: VmPeak:    33751268 kB
-    #  real    43m40.194s
-
+    # pid=123418: VmPeak:   22609988 kB
+    # real    22m26.776s
 
     bigWigInfo phyloP35way.bw  | sed -e 's/^/# /;'
 # version: 4
 # isCompressed: yes
 # isSwapped: 0
 # primaryDataSize: 4,660,484,132
 # primaryIndexSize: 75,089,444
 # zoomLevels: 10
 # chromCount: 53
 # basesCovered: 2,035,330,611
 # mean: 0.110677
 # min: -13.709000
 # max: 4.643000
 # std: 0.833332
 
@@ -1721,73 +1719,106 @@
     # verify it looks sane
     display histo.png &
 
 #############################################################################
 # construct download files for 35-way (TBD - 2015-04-15 - Hiram)
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way
     mkdir /hive/data/genomes/mm39/bed/multiz35way/downloads
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads
     mkdir multiz35way phastCons35way phyloP35way
 
     #########################################################################
     ## create upstream refGene maf files
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way
+
+    # Brian had to fix mafFrags so it could work with the assembly GCF
+    # identifier
+
     # bash script
 
 #!/bin/sh
 export geneTbl="ncbiRefSeq"
 for S in 300 2000 5000
 do
     echo "making upstream${S}.maf"
     featureBits mm39 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \
         | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
         | /cluster/bin/$MACHTYPE/mafFrags mm39 multiz35way \
                 stdin stdout \
                 -orgs=/hive/data/genomes/mm39/bed/multiz35way/species.list \
         | gzip -c > upstream${S}.${geneTbl}.maf.gz
     echo "done upstream${S}.${geneTbl}.maf.gz"
 done
-XXX - running - Wed Dec 23 14:36:49 PST 2020
-    #   real    88m40.730s
-
--rw-rw-r-- 1   52659159 Nov  6 11:46 upstream300.ncbiRefSeq.maf.gz
--rw-rw-r-- 1  451126665 Nov  6 12:15 upstream2000.ncbiRefSeq.maf.gz
--rw-rw-r-- 1 1080533794 Nov  6 12:55 upstream5000.ncbiRefSeq.maf.gz
+    #   real    117m33.236s
+
+-rw-rw-r-- 1   96310644 Jan  4 17:04 upstream300.ncbiRefSeq.maf.gz
+-rw-rw-r-- 1  948130770 Jan  4 17:41 upstream2000.ncbiRefSeq.maf.gz
+-rw-rw-r-- 1 2032740393 Jan  4 18:48 upstream5000.ncbiRefSeq.maf.gz
+
+    # verify sanity:
+    zgrep "^s " upstream300.ncbiRefSeq.maf.gz | awk '{print $2}' \
+      | sort | uniq -c | sort -rn | less
+    
+    # they all have the same counts, followed by single gene names:
+  89641 xenTro9
+  89641 turTru2
+  89641 tupBel1
+  89641 tarSyr2
+  89641 susScr3
+...
+  89641 casCan1
+  89641 canFam4
+  89641 calJac4
+  89641 bosTau9
+  89641 GCF_003668045.3
+      1 XM_982184.5
+      1 XM_981747.8
+      1 XM_981711.5
+      1 XM_981599.4
+      1 XM_977914.8
+... etc ...
+
+    # same pattern seen with the 2000 and 5000 upstreams
+    zgrep "^s " upstream2000.ncbiRefSeq.maf.gz | awk '{print $2}' \
+      | sort | uniq -c | sort -rn | less
+
+    zgrep "^s " upstream5000.ncbiRefSeq.maf.gz | awk '{print $2}' \
+      | sort | uniq -c | sort -rn | less
 
     ######################################################################
     ## compress the maf files
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way
     mkdir maf
     time rsync -a -P ../../maf/ ./maf/
     # real    12m9.290s
 
     du -hscL maf/ ../../maf/
     #	141G    maf/
     #	141G    ../../maf/
 
     cd maf
     time gzip *.maf &
-XXX - running - Wed Dec 23 14:55:47 PST 2020
-    # real    81m10.239s
+    # about an hour
 
     du -hscL maf ../../maf/
-    #  18G     maf
+    #	16G     maf
+    #	141G    ../../maf/
 
     cd maf
-    md5sum *.maf.gz *.nh > md5sum.txt
+    md5sum *.maf.gz > md5sum.txt
 
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf
     cd maf
     ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/maf
     cd --
     ln -s `pwd`/*.maf.gz `pwd`/*.nh `pwd`/*.txt \
          /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/
 
     ###########################################################################
 
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/multiz35way
     grep TREE ../../4d/all.mod | awk '{print $NF}' \
       | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
          > mm39.35way.nh
 
@@ -1795,92 +1826,89 @@
        | sed -e "s#_x_#'#g; s#X__#X._#;" > mm39.35way.commonNames.nh
 
     sed -f ../../db.to.sciName.sed mm39.35way.nh \
            > mm39.35way.scientificNames.nh
 
     time md5sum *.nh *.maf.gz > md5sum.txt
     #   real    0m3.147s
 
     ln -s `pwd`/*.maf.gz `pwd`/*.nh \
         /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way
 
     du -hscL ./maf ../../maf
     #  18G     ./maf
     # 156G    ../../maf
 
-XXX
     # obtain the README.txt from danRer10/multiz12way and update for this
     #   situation
     ln -s `pwd`/*.txt \
          /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/multiz35way/
 
     #####################################################################
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way
 
     mkdir mm39.35way.phastCons
     cd mm39.35way.phastCons
     ln -s ../../../cons/all/downloads/*.wigFix.gz .
     md5sum *.gz > md5sum.txt
 
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phastCons35way
     ln -s ../../cons/all/phastCons35way.bw ./mm39.phastCons35way.bw
     ln -s ../../cons/all/all.mod ./mm39.phastCons35way.mod
     time md5sum *.mod *.bw > md5sum.txt
     #   real    0m20.354s
 
-XXX
-    # obtain the README.txt from mm39/phastCons20way and update for this
+    # obtain the README.txt from danRer10/phastCons12way and update for this
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons
     cd mm39.35way.phastCons
     ln -s `pwd`/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way/mm39.35way.phastCons
 
     cd ..
     #   situation
     ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phastCons35way
 
     #####################################################################
     cd /hive/data/genomes/mm39/bed/multiz35way/downloads/phyloP35way
 
     mkdir mm39.35way.phyloP
     cd mm39.35way.phyloP
 
     ln -s ../../../consPhyloP/all/downloads/*.wigFix.gz .
     md5sum *.wigFix.gz > md5sum.txt
 
     cd ..
 
     ln -s ../../consPhyloP/run.phyloP/all.mod mm39.phyloP35way.mod
     ln -s ../../consPhyloP/all/phyloP35way.bw mm39.phyloP35way.bw
 
     md5sum *.mod *.bw > md5sum.txt
 
-XXX 
-    # obtain the README.txt from mm39/phyloP20way and update for this
+    # obtain the README.txt from danRer10/phyloP12way and update for this
+    #   situation
     mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP
     cd mm39.35way.phyloP
     ln -s `pwd`/* \
 /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way/mm39.35way.phyloP
 
     cd ..
 
-    #   situation
     ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \
       /usr/local/apache/htdocs-hgdownload/goldenPath/mm39/phyloP35way
 
 #############################################################################
-# hgPal downloads (TBD - 2017-11-06 - Hiram)
+# hgPal downloads (DONE - 2020-12-23 - Hiram)
 #   FASTA from 35-way for ncbiRefSeq, refGene and knownCanonical
 
     ssh hgwdev
     screen -S mm39HgPal
     mkdir /hive/data/genomes/mm39/bed/multiz35way/pal
     cd /hive/data/genomes/mm39/bed/multiz35way/pal
     cat ../species.list | tr '[ ]' '[\n]' > order.list
 
     # ncbiRefSeq
     export mz=multiz35way
     export gp=ncbiRefSeq
     export db=mm39
     export I=0
     export D=0
     mkdir exonAA exonNuc
@@ -1919,66 +1947,70 @@
     # -rw-rw-r-- 1 1596566489 Dec 23 16:53 ncbiRefSeq.multiz35way.exonNuc.fa.gz
 
     export mz=multiz35way
     export gp=ncbiRefSeq
     export db=mm39
     export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments
     mkdir -p $pd
     md5sum *.fa.gz > md5sum.txt
     ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
     ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz
     ln -s `pwd`/md5sum.txt $pd/
 
     rm -rf exonAA exonNuc
 
 #############################################################################
-# wiki page for 35-way (TBD - 2017-11-06 - Hiram)
+# wiki page for 35-way (DONE - 2021-01-11 - Hiram)
     mkdir /hive/users/hiram/bigWays/mm39.35way
     cd /hive/users/hiram/bigWays
     echo "mm39" > mm39.35way/ordered.list
 awk '{print $1}' /hive/data/genomes/mm39/bed/multiz35way/35way.distances.txt \
        >> mm39.35way/ordered.list
 
     # sizeStats.sh catches up the cached measurements required for data
     # in the tables.  They are usually already mostly done, only new
     # assemblies will have updates.
     ./sizeStats.sh mm39.35way/ordered.list
     # dbDb.sh constructs mm39.35way/XenTro9_35-way_conservation_alignment.html
     # may need to add new assembly references to srcReference.list and
     # urlReference.list
+    # Updated this script to work with the GCF accession and to find relevant
+    # data from the assembly hub files
     ./dbDb.sh mm39 35way
+
     # sizeStats.pl constructs mm39.35way/XenTro9_35-way_Genome_size_statistics.html
     # this requires entries in coverage.list for new sequences
     ./sizeStats.pl mm39 35way
 
     # defCheck.pl constructs XenTro9_35-way_conservation_lastz_parameters.html
     ./defCheck.pl mm39 35way
 
     # this constructs the html pages in mm39.35way/:
-# -rw-rw-r-- 1 6247 May  2 17:07 XenTro9_35-way_conservation_alignment.html
-# -rw-rw-r-- 1 8430 May  2 17:09 XenTro9_35-way_Genome_size_statistics.html
-# -rw-rw-r-- 1 5033 May  2 17:10 XenTro9_35-way_conservation_lastz_parameters.html
+# -rw-rw-r-- 1 17957 Jan  4 17:56 Mm39_35-way_conservation_alignment.html
+# -rw-rw-r-- 1 11991 Jan  4 17:58 Mm39_35-way_conservation_lastz_parameters.html
+# -rw-rw-r-- 1 23297 Jan 11 14:58 Mm39_35-way_Genome_size_statistics.html
 
     # add those pages to the genomewiki.  Their page names are the
     # names of the .html files without the .html:
-#  XenTro9_35-way_conservation_alignment
-#  XenTro9_35-way_Genome_size_statistics
-#  XenTro9_35-way_conservation_lastz_parameters
+# Mm39_35-way_Genome_size_statistics
+# Mm39_35-way_conservation_alignment
+# Mm39_35-way_conservation_lastz_parameters
 
     # when you view the first one you enter, it will have links to the
     # missing two.
 
 ############################################################################
-# pushQ readmine (TBD - 2017-11-07 - Hiram)
+# pushQ redmine (TBD - 2021-01-11 - Hiram)
 
+XXX - ready - Mon Jan 11 16:30:45 PST 2021
   cd /usr/local/apache/htdocs-hgdownload/goldenPath/mm39
   find -L `pwd`/multiz35way `pwd`/phastCons35way `pwd`/phyloP35way \
 	/gbdb/mm39/multiz35way -type f \
     > /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList
   wc -l /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList
 # 1450 /hive/data/genomes/mm39/bed/multiz35way/downloads/redmine.20216.fileList
 
   cd /hive/data/genomes/mm39/bed/multiz35way/downloads
   hgsql -e 'show tables;' mm39 | grep 35way \
 	| sed -e 's/^/mm39./;' > redmine.20216.table.list
 
 ############################################################################