346748ca6fae57c82810e5ae94166c7f5547b940
hiram
  Mon Dec 21 15:39:19 2020 -0800
read to continue with gene annotation refs #26584

diff --git src/hg/makeDb/doc/mm39/multiz35way.txt src/hg/makeDb/doc/mm39/multiz35way.txt
index 30abe82..f4656c4 100644
--- src/hg/makeDb/doc/mm39/multiz35way.txt
+++ src/hg/makeDb/doc/mm39/multiz35way.txt
@@ -83,118 +83,134 @@
 #       (eriEur2:0.221785,
 #       sorAra2:0.269562):0.056393):0.021227):0.023664,
 #      (loxAfr3:0.098929,
 #      echTel2:0.245936):0.056717):0.234728,
 #     monDom5:0.285786):0.181168,
 #    galGal6:0.509442):0.05,
 #   xenTro9:0.347944):0.211354,
 #  danRer11:1.201072):0.2,
 # petMar3:0.975747);
 
     # extract species list from that .nh file
     sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
         mm39.35way.nh | xargs echo | sed 's/ //g; s/,/ /g' \
         | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt
 
-    # construct db to name translation list:
+    # construct db to name sed translation list:
+
     cat species.list.txt | while read DB
 do
 hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest
-done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \
-    | sed -e "s#'#_x_#g;" > db.to.name.txt
+done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \
+    | sed -e "s#'#_x_#g;" > db.to.name.sed
 
-# edited db.to.name.txt to change - to _ in some of the names.
-# e.g. Crab-eating -> Crab_eating,
-# the Crab-eating didn't survive the tree_doctor
+    printf "s/GCF_003668045.3/Chinese_hamster/;\n" >> db.to.name.sed
 
-/cluster/bin/phast/tree_doctor --rename "`cat db.to.name.txt`" mm39.35way.nh \
-   | sed -e 's/0\+)/)/g; s/0\+,/,/g' \
-     | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
-       | sed -e "s#_x_#'#g;" > mm39.35way.commonNames.nh
+    sed -f db.to.name.sed mm39.35way.nh | sed -e "s#_x_#'#g; s#X__#X._#;"\
+       > mm39.35way.commonNames.nh
 
     cat mm39.35way.commonNames.nh | sed -e 's/^/# /;'
-# ((((((((((((Human:0.00655,
+# ((((((((((((((Mouse:0.089509,
+#              Rat:0.096589):0.072773,
+#             Chinese_hamster:0.17):0.08015,
+#            Beaver:0.17):0.05,
+#           Squirrel:0.125468):0.022992,
+#          Guinea_pig:0.175779):0.025746,
+#         (Rabbit:0.114227,
+#         Pika:0.201069):0.101463):0.015313,
+#        ((((((((Human:0.00655,
 #               Chimp:0.00684):0.00122,
 #              Bonobo:0.00784):0.003,
-#          Gorilla:0.008964):0.009693,
-#         Orangutan:0.01894):0.003471,
-#        Gibbon:0.02227):0.01204,
-#       (((((Rhesus:0.003991,
-#           (Crab_eating_macaque:0.002991,
-#           Pig_tailed_macaque:0.005):0.001):0.001,
-#          Sooty_mangabey:0.008):0.005,
-#         Baboon:0.010042):0.01061,
-#        (Green_monkey:0.027,
-#        Drill:0.03):0.002):0.003,
-#       ((Proboscis_monkey:0.0007,
-#        Angolan_colobus:0.0008):0.0008,
-#       (Golden_snub:0.0007,
-#       Black_snub:0.0007):0.0008):0.018):0.02):0.02183,
-#      (((Marmoset:0.03,
-#        Squirrel_monkey:0.01035):0.00865,
-#       White_faced_sapajou:0.04):0.006,
-#      Ma's_night_monkey:0.04):0.005):0.05209,
+#             Gorilla:0.008964):0.025204,
+#            Rhesus:0.043601):0.02183,
+#           Marmoset:0.04965):0.05209,
 #          Tarsier:0.1114):0.02052,
-#    (((Mouse_lemur:0.0556,
-#      Coquerel's_sifaka:0.05):0.015,
-#     (Black_lemur:0.01,
-#     Sclater's_lemur:0.01):0.015):0.015,
-#    Bushbaby:0.1194):0.02052):0.015494,
-#   Mouse:0.356483):0.020593,
-#  Dog:0.165928):0.023664,
-# Armadillo:0.176526);
+#         Bushbaby:0.13992):0.013494,
+#        (Tree_shrew:0.136203,
+#        Malayan_flying_lemur:0.08):0.054937):0.002):0.020593,
+#       (((Pig:0.12,
+#         (Dolphin:0.064688,
+#         (Cow:0.11,
+#         Sheep:0.085):0.013592):0.045488):0.02,
+#        ((Horse:0.084397,
+#         Chinese_pangolin:0.05):0.017,
+#        (Dog:0.054458,
+#        Hawaiian_monk_seal:0.097):0.069845):0.008727):0.011671,
+#       (Hedgehog:0.221785,
+#       Shrew:0.269562):0.056393):0.021227):0.023664,
+#      (Elephant:0.098929,
+#      Tenrec:0.245936):0.056717):0.234728,
+#     Opossum:0.285786):0.181168,
+#    Chicken:0.509442):0.05,
+#   X._tropicalis:0.347944):0.211354,
+#  Zebrafish:1.201072):0.2,
+# Lamprey:0.975747);
 
 #	Use this specification in the phyloGif tool:
 #	http://genome.ucsc.edu/cgi-bin/phyloGif
 #	to obtain a png image for src/hg/htdocs/images/phylo/mm39_35way.png
 
-    ~/kent/src/hg/utils/phyloTrees/asciiTree.pl mm39.35way.nh > t.nh
-    ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \
+    cat species.list.txt | while read DB
+do
+hgsql -N -e "select name,scientificName from dbDb where name=\"${DB}\";" hgcentraltest
+done | sed -e 's#^#s/#;' | sed -e "s#\t#/#; s/ /_/g;" | sed -e 's#$#/;#' | sed -e 's/\./_/g' \
+    | sed -e "s#'#_x_#g;" > db.to.sciName.sed
+
+    printf "s/GCF_003668045.3/Cricetulus_griseus/;\n" >> db.to.sciName.sed
+
+    sed -f db.to.sciName.sed mm39.35way.nh > mm39.35way.scientificNames.nh
+
+       > mm39.35way.commonNames.nh
+
        | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \
           > mm39.35way.scientificNames.nh
-    rm -f t.nh
     cat mm39.35way.scientificNames.nh | sed -e 's/^/# /;'
-# ((((((((((((Homo_sapiens:0.00655,
+# ((((((((((((((Mus_musculus:0.089509,
+#              Rattus_norvegicus:0.096589):0.072773,
+#             Cricetulus_griseus:0.17):0.08015,
+#            Castor_canadensis:0.17):0.05,
+#           Spermophilus_tridecemlineatus:0.125468):0.022992,
+#          Cavia_porcellus:0.175779):0.025746,
+#         (Oryctolagus_cuniculus:0.114227,
+#         Ochotona_princeps:0.201069):0.101463):0.015313,
+#        ((((((((Homo_sapiens:0.00655,
 #               Pan_troglodytes:0.00684):0.00122,
 #              Pan_paniscus:0.00784):0.003,
-#          Gorilla_gorilla_gorilla:0.008964):0.009693,
-#         Pongo_pygmaeus_abelii:0.01894):0.003471,
-#        Nomascus_leucogenys:0.02227):0.01204,
-#       (((((Macaca_mulatta:0.003991,
-#           (Macaca_fascicularis:0.002991,
-#           Macaca_nemestrina:0.005):0.001):0.001,
-#          Cercocebus_atys:0.008):0.005,
-#         Papio_anubis:0.010042):0.01061,
-#        (Chlorocebus_sabaeus:0.027,
-#        Mandrillus_leucophaeus:0.03):0.002):0.003,
-#       ((Nasalis_larvatus:0.0007,
-#        Colobus_angolensis_palliatus:0.0008):0.0008,
-#       (Rhinopithecus_roxellana:0.0007,
-#       Rhinopithecus_bieti:0.0007):0.0008):0.018):0.02):0.02183,
-#      (((Callithrix_jacchus:0.03,
-#        Saimiri_boliviensis:0.01035):0.00865,
-#       Cebus_capucinus_imitator:0.04):0.006,
-#      Aotus_nancymaae:0.04):0.005):0.05209,
+#             Gorilla_gorilla_gorilla:0.008964):0.025204,
+#            Macaca_mulatta:0.043601):0.02183,
+#           Callithrix_jacchus:0.04965):0.05209,
 #          Tarsius_syrichta:0.1114):0.02052,
-#    (((Microcebus_murinus:0.0556,
-#      Propithecus_coquereli:0.05):0.015,
-#     (Eulemur_macaco:0.01,
-#     Eulemur_flavifrons:0.01):0.015):0.015,
-#    Otolemur_garnettii:0.1194):0.02052):0.015494,
-#   Mus_musculus:0.356483):0.020593,
-#  Canis_lupus_familiaris:0.165928):0.023664,
-# Dasypus_novemcinctus:0.176526);
+#         Otolemur_garnettii:0.13992):0.013494,
+#        (Tupaia_belangeri:0.136203,
+#        Galeopterus_variegatus:0.08):0.054937):0.002):0.020593,
+#       (((Sus_scrofa:0.12,
+#         (Tursiops_truncatus:0.064688,
+#         (Bos_taurus:0.11,
+#         Ovis_aries:0.085):0.013592):0.045488):0.02,
+#        ((Equus_caballus:0.084397,
+#         Manis_pentadactyla:0.05):0.017,
+#        (Canis_lupus_familiaris:0.054458,
+#        Neomonachus_schauinslandi:0.097):0.069845):0.008727):0.011671,
+#       (Erinaceus_europaeus:0.221785,
+#       Sorex_araneus:0.269562):0.056393):0.021227):0.023664,
+#      (Loxodonta_africana:0.098929,
+#      Echinops_telfairi:0.245936):0.056717):0.234728,
+#     Monodelphis_domestica:0.285786):0.181168,
+#    Gallus_gallus:0.509442):0.05,
+#   Xenopus_tropicalis:0.347944):0.211354,
+#  Danio_rerio:1.201072):0.2,
+# Petromyzon_marinus:0.975747);
 
     /cluster/bin/phast/all_dists mm39.35way.nh | grep mm39 \
         | sed -e "s/mm39.//" | sort -k2n > 35way.distances.txt
     #	Use this output to create the table below
     cat 35way.distances.txt | sed -e 's/^/# /;'
 # rn6   0.186098
 # GCF_003668045.3       0.332282
 # casCan1       0.412432
 # speTri2       0.417900
 # manPen1       0.485701
 # cavPor3       0.491203
 # galVar1       0.493420
 # calJac4       0.494237
 # gorGor6       0.500585
 # hg38  0.502391
@@ -211,73 +227,31 @@
 # loxAfr3       0.556386
 # oryCun2       0.556860
 # oviAri4       0.574054
 # neoSch1       0.585546
 # bosTau9       0.599054
 # ochPri3       0.643702
 # eriEur2       0.676481
 # echTel2       0.703393
 # sorAra2       0.724258
 # monDom5       0.921254
 # xenTro9       1.214580
 # galGal6       1.326078
 # petMar3       2.253737
 # danRer11      2.279062
 
-    printf '#!/usr/bin/env perl
-
-use strict;
-use warnings;
-
-open (FH, "<35way.distances.txt") or
-        die "can not read 35way.distances.txt";
-
-my $count = 0;
-while (my $line = <FH>) {
-    chomp $line;
-    my ($D, $dist) = split('"'"'\\s+'"'"', $line);
-    my $chain = "chain" . ucfirst($D);
-    my $B="/hive/data/genomes/mm39/bed/lastz.$D/fb.mm39." .
-        $chain . "Link.txt";
-    my $chainLinkMeasure =
-        `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
-    chomp $chainLinkMeasure;
-    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
-    $chainLinkMeasure =~ s/\\%%//;
-    my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm39/fb.${D}.chainMm39Link.txt";
-    my $swapMeasure = "N/A";
-    if ( -s $swapFile ) {
-	$swapMeasure =
-	    `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
-	chomp $swapMeasure;
-	$swapMeasure = 0.0 if (length($swapMeasure) < 1);
-	$swapMeasure =~ s/\\%%//;
-    }
-    my $orgName=
-    `hgsql -N -e "select organism from dbDb where name='"'"'$D'"'"';" hgcentraltest`;
-    chomp $orgName;
-    if (length($orgName) < 1) {
-        $orgName="N/A";
-    }
-    ++$count;
-    printf "# %%02d  %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist,
-        $chainLinkMeasure, $swapMeasure, $orgName, $D;
-}
-close (FH);
-' > sizeStats.pl
-    chmod +x ./sizeStats.pl
-    ./sizeStats.pl
+    ~/kent/src/hg/makeDb/doc/mm39/sizeStats.pl
 
 #	If you can fill in all the numbers in this table, you are ready for
 #	the multiple alignment procedure
 # count phylo    chain      synNet    rBest  synNet v.  query
 #       dist      link                         chain
 # 001 0.1861 (% 70.901) (% 66.256) (% 65.494)  6.55 - Rat rn6
 # 002 0.3323 (% 58.000) (% 54.406) (% 00.000)  6.20 - Cricetulus griseus GCF_003668045.3
 # 003 0.4124 (% 36.600) (% 32.404) (% 34.407) 11.46 - Beaver casCan1
 # 004 0.4179 (% 34.147) (% 31.953) (% 32.373)  6.43 - Squirrel speTri2
 # 005 0.4857 (% 27.322) (% 22.622) (% 25.847) 17.20 - Chinese pangolin manPen1
 # 006 0.4912 (% 28.378) (% 26.311) (% 26.932)  7.28 - Guinea pig cavPor3
 # 007 0.4934 (% 35.714) (% 31.386) (% 33.751) 12.12 - Malayan flying lemur galVar1
 # 008 0.4942 (% 33.090) (% 31.297) (% 31.492)  5.42 - Marmoset calJac4
 # 009 0.5006 (% 35.064) (% 33.225) (% 33.360)  5.24 - Gorilla gorGor6
 # 010 0.5024 (% 35.372) (% 33.566) (% 33.646)  5.11 - Human hg38
@@ -688,46 +662,46 @@
     cd /hive/data/genomes/mm39/bed/multiz35way/maf
     # 97 of them have empty results, they have to be removed
     ls -ogrt | awk '$3 == 0' | awk '{print $NF}' | xargs rm -f
 
 
     # Load into database
     mkdir -p /gbdb/mm39/multiz35way/maf
     cd /hive/data/genomes/mm39/bed/multiz35way/maf
     ln -s `pwd`/*.maf /gbdb/mm39/multiz35way/maf/
 
     # this generates an immense multiz35way.tab file in the directory
     #	where it is running.  Best to run this over in scratch.
     #   This is going to take all day.
     cd /dev/shm
     time hgLoadMaf -pathPrefix=/gbdb/mm39/multiz35way/maf mm39 multiz35way
-    # Loaded 40625470 mafs in 358 files from /gbdb/mm39/multiz35way/maf
-    # real    28m23.045s
+    # Loaded 44558775 mafs in 54 files from /gbdb/mm39/multiz35way/maf
+    # real    21m8.685s
 
     time (cat /gbdb/mm39/multiz35way/maf/*.maf \
         | hgLoadMafSummary -verbose=2 -minSize=30000 \
 	-mergeGap=1500 -maxSize=200000 mm39 multiz35waySummary stdin)
-#  Created 4568973 summary blocks from 850984320 components and 40625470 mafs from stdin
-#  real    49m52.561s
+# Created 7693860 summary blocks from 597609421 components and 44558775 mafs from stdin
+# real    28m44.150s
 
 
--rw-rw-r--   1 2171190193 Nov  2 16:40 multiz35way.tab
--rw-rw-r--   1  215756735 Nov  2 17:44 multiz35waySummary.tab
+# -rw-rw-r--   1 2269241117 Dec 19 15:05 multiz35way.tab
+# -rw-rw-r--   1  379606617 Dec 19 15:48 multiz35waySummary.tab
 
-    wc -l multiz30*.tab
-#    40625470 multiz35way.tab
-#     4568973 multiz35waySummary.tab
+    wc -l multiz35*.tab
+#   44558775 multiz35way.tab
+#    7693860 multiz35waySummary.tab
 
     rm multiz35way*.tab
 
 #######################################################################
 # GAP ANNOTATE MULTIZ30WAY MAF AND LOAD TABLES (TBD - 2017-11-03 - Hiram)
     # mafAddIRows has to be run on single chromosome maf files, it does not
     #	function correctly when more than one reference sequence
     #	are in a single file.
     mkdir -p /hive/data/genomes/mm39/bed/multiz35way/anno
     cd /hive/data/genomes/mm39/bed/multiz35way/anno
 
     # check for N.bed files everywhere:
     for DB in `sed -e 's/ GCF_003668045.3//;' ../species.list`
 do
     if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then
@@ -792,30 +766,31 @@
 # Submission to last job:           451s       7.52m     0.13h    0.01d
 
     du -hsc result
     #  156G    result
 
     # Load into database
     rm -f /gbdb/mm39/multiz35way/maf/*
     cd /hive/data/genomes/mm39/bed/multiz35way/anno/result
 
     ln -s `pwd`/*.maf /gbdb/mm39/multiz35way/maf/
 
     # this generates an immense multiz35way.tab file in the directory
     #	where it is running.  Best to run this over in scratch.
     cd /dev/shm
     time hgLoadMaf -pathPrefix=/gbdb/mm39/multiz35way/maf mm39 multiz35way
+XXX - running - Sat Dec 19 14:47:00 PST 2020
     # Loaded 40655883 mafs in 358 files from /gbdb/mm39/multiz35way/maf
     # real    37m27.075s
 
     # -rw-rw-r-- 1 2177747201 Nov  2 18:27 multiz35way.tab
 
     time (cat /gbdb/mm39/multiz35way/maf/*.maf \
         | hgLoadMafSummary -verbose=2 -minSize=30000 \
 	-mergeGap=1500 -maxSize=200000 mm39 multiz35waySummary stdin)
 # Created 4568973 summary blocks from 850984320 components and 40655883 mafs from stdin
 # real    59m27.383s
 
 # -rw-rw-r--   1 2177747201 Nov  2 18:27 multiz35way.tab
 # -rw-rw-r--   1  224894681 Nov  3 08:12 multiz35waySummary.tab
 
     wc -l multiz35way*.tab