src/hg/makeDb/doc/mm39/initialBuild.txt f6bb7fb8b0af020a2d602824f74d84d516bd96fb

f6bb7fb8b0af020a2d602824f74d84d516bd96fb
hiram
  Thu Jul 30 15:12:36 2020 -0700
liftOver mm10 to mm39 to mm10 completed refs #22271

diff --git src/hg/makeDb/doc/mm39/initialBuild.txt src/hg/makeDb/doc/mm39/initialBuild.txt
index 7659dfc..4c18312 100644
--- src/hg/makeDb/doc/mm39/initialBuild.txt
+++ src/hg/makeDb/doc/mm39/initialBuild.txt
@@ -80,50 +80,66 @@
     181 within_scaffold
 
 # And total size in gaps:
 zgrep -v "^#" *gaps.txt.gz | awk '{print $3-$2+1}' | ave stdin \
   | sed -e 's/^/# /;'
 # Q1 943.000000
 # median 50000.000000
 # Q3 68500.000000
 # average 212105.515850
 # min 10.000000
 # max 2890000.000000
 # count 347
 # total 73600614.000000
 # standard deviation 667296.516291
 
+     # survey the sequence to see if it has IUPAC characters:
+     time zgrep -v "^>" GCA_000001635.9_GRCm39_genomic.fna.gz \
+        | perl -ne '{print join("\n",split(//))}' \
+           | sed -e '/^$/d' | sort | uniq -c | sort -rn | sed -e 's/^/# /;'
+# 482676636 T
+# 482443877 A
+# 361138526 G
+# 361105901 C
+# 292069876 t
+# 291366772 a
+# 191917431 g
+# 191902764 c
+# 73600668 N
+
+# real    29m14.860s
+
 #############################################################################
 # establish config.ra file (DONE - 2020-07-27 - Hiram)
     cd /hive/data/genomes/mm39
     ~/kent/src/hg/utils/automation/prepConfig.pl mm39 mammal mouse \
        genbank/*_assembly_report.txt > mm39.config.ra
 
     # fix commonName:
 commonName House mouse
 to:
 commonName Mouse
     # fix orderKey:
 orderKey 8694
 to
 orderKey 268
     # fix assemblyLabel:
 assemblyLabel Genome Reference Consortium
 to
 assemblyLabel Genome Reference Consortium Mouse Build 39 (GCA_000001635.9)
 
-    # XXX THERE IS NO BIOSAMPLE !!!
+    # XXX THERE IS NO BIOSAMPLE !!!  (actually, there appear to be multiple)
 
     # compare with previous version to see if it is sane:
     diff mm39.config.ra ../mm10/mm10.config.ra
 
     # verify it really does look sane
     cat mm39.config.ra
 # Config parameters for makeGenomeDb.pl:
 db mm39
 clade mammal
 scientificName Mus musculus
 commonName Mouse
 assemblyDate Jun. 2020
 assemblyLabel Genome Reference Consortium Mouse Build 39 (GCA_000001635.9)
 assemblyShortLabel GRCm39
 orderKey 269
@@ -344,31 +360,31 @@
 
     # then finish it off:
     time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
        -fileServer=hgwdev -continue=db mm39.config.ra) > db.log 2>&1
     # real    14m40.115s
 
     # check in the trackDb files created in TemporaryTrackDbCheckout/
     #    and add mm39 to trackDb/makefile   refs #22271
     # fixing up the images reference to mm39.jpg
 
     # temporary symlink until masked sequence is available
     cd /hive/data/genomes/mm39
     ln -s `pwd`/mm39.unmasked.2bit /gbdb/mm39/mm39.2bit
 
 #############################################################################
-# verify gap table vs NCBI gap file (TBD - 2020-07-17 - Hiram)
+# verify gap table vs NCBI gap file (DONE - 2020-07-27 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/gap
     cd /hive/data/genomes/mm39/bed/gap
 
     zgrep -v "^#" ../../genbank/G*_gaps.txt.gz \
 	| awk '{printf "%s\t%d\t%d\t%s_%s\n", $1,$2-1,$3,$5,$6}' \
 	| sort -k1,1 -k2,2n > genbank.gap.bed
 
     # type survey:
     cut -f4 *.bed | sort | uniq -c | sed -e 's/^/# /;'
 #      60 between_scaffolds_na
 #      20 centromere_na
 #      21 short_arm_na
 #      42 telomere_na
 #       4 unknown_inferred_from_sequence
 #      19 unknown_unspecified
@@ -429,67 +445,72 @@
 
 #############################################################################
 # cytoBandIdeo - (DONE - 2020-07-27 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/cytoBand
     cd /hive/data/genomes/mm39/bed/cytoBand
     makeCytoBandIdeo.csh mm39
 
 #############################################################################
 # run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2020-07-27 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/idKeys
     cd /hive/data/genomes/mm39/bed/idKeys
 
     time (doIdKeys.pl \
         -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit \
         -buildDir=`pwd` mm39) > do.log 2>&1 &
-XXX - running - Mon Jul 27 15:15:44 PDT 2020
-    # real    3m22.298s
+    # real    0m45.175s
 
     cat mm39.keySignature.txt
-    #  174191aae5515d1114a9d6320b152b1a
+    #  804f78d880a5a7f049c472046b563601
 
 #############################################################################
 # gapOverlap (DONE - 2020-07-27 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/gapOverlap
     cd /hive/data/genomes/mm39/bed/gapOverlap
     time (doGapOverlap.pl \
         -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit mm39 ) \
         > do.log 2>&1 &
-XXX - running - Mon Jul 27 15:15:36 PDT 2020
-    # real    1m49.489s
+    # real    1m49.446s
 
-    # there only only nine:
+    # there is one only:
     wc -l bed.tab
-    # 9 bed.tab
+    # 1 bed.tab
     cut -f2- bed.tab
-chr1    41008264        41010364        chr1:41008265-41010364  1000    +      41008264 41010364        0       2       1000,1000       0,1100
-chr17   58049274        58051374        chr17:58049275-58051374 1000    +      58049274 58051374        0       2       1000,1000       0,1100
-... etc ...
-chrX    45160089        45162189        chrX:45160090-45162189  1000    +      45160089 45162189        0       2       1000,1000       0,1100
+chr6    47663669        47714277        chr6:47663670-47714277  304     +      47663669 47714277        0       2       304,304 0,50304
 
     cat fb.mm39.gapOverlap.txt
-    # 16158 bases of 2482000080 (0.001%) in intersection
+    # 608 bases of 2728222451 (0.000%) in intersection
 
 #############################################################################
 # tandemDups (DONE - 2020-07-27 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/tandemDups
     cd /hive/data/genomes/mm39/bed/tandemDups
     time (~/kent/src/hg/utils/automation/doTandemDup.pl \
   -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit mm39) \
         > do.log 2>&1 &
-XXX - running - Mon Jul 27 15:16:22 PDT 2020
-    # real    188m34.598s
+    # real    440m10.886s
+
+    # one job in pairedEnds needs more memory:
+    time ./runOne 29 20000 chrY tmp/chrY.bed.gz
+    # real    28m57.353s
+
+    # continuing
+    time (~/kent/src/hg/utils/automation/doTandemDup.pl \
+	-continue=collapsePairedEnds \
+	    -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit mm39) \
+        > collapsePairedEnds.log 2>&1 &
+XXX - running - Thu Jul 30 09:19:46 PDT 2020
 
     cat fb.mm39.tandemDups.txt
     # 155315479 bases of 3044872214 (5.101%) in intersection
 
     bigBedInfo mm39.tandemDups.bb | sed -e 's/^/#  /;'
 #  version: 4
 #  fieldCount: 13
 #  hasHeaderExtension: yes
 #  isCompressed: yes
 #  isSwapped: 0
 #  extraIndexCount: 0
 #  itemCount: 2,822,307
 #  primaryDataSize: 72,710,994
 #  primaryIndexSize: 292,560
 #  zoomLevels: 9
@@ -497,322 +518,427 @@
 #  basesCovered: 1,635,503,835
 #  meanDepth (of bases covered): 14.396921
 #  minDepth: 1.000000
 #  maxDepth: 381.000000
 #  std of depth: 29.341113
 
 #########################################################################
 # ucscToINSDC and ucscToRefSeq table/track (DONE - 2020-07-27 - Hiram)
     # construct idKeys for the genbank sequence
     mkdir /hive/data/genomes/mm39/genbank/idKeys
     cd /hive/data/genomes/mm39/genbank/idKeys
     faToTwoBit ../GCA_*m39_genomic.fna.gz mm39.genbank.2bit
 
     time (doIdKeys.pl -buildDir=`pwd` \
         -twoBit=`pwd`/mm39.genbank.2bit genbankMm39)  > do.log 2>&1 &
-    # real    3m30.599s
+    # real    0m45.317s
 
     cat genbankMm39.keySignature.txt
-    #  174191aae5515d1114a9d6320b152b1a
+    #  804f78d880a5a7f049c472046b563601
 
     mkdir /hive/data/genomes/mm39/bed/chromAlias
     cd /hive/data/genomes/mm39/bed/chromAlias
 
     join -t$'\t' ../idKeys/mm39.idKeys.txt \
         ../../genbank/idKeys/genbankMm39.idKeys.txt | cut -f2- \
           | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
             | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
                | sort -k1,1 -k2,2n > ucscToINSDC.bed
 
     # should be same line counts throughout:
     wc -l * ../../chrom.sizes
-    #   2198 ucscToINSDC.bed
-    #	2198 ../../chrom.sizes
+    #   61 ucscToINSDC.bed
+    #	61 ../../chrom.sizes
 
     export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1`
     echo $chrSize
-    # 23
+    # 22
     # use the $chrSize in this sed
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
          | hgLoadSqlTab mm39 ucscToINSDC stdin ucscToINSDC.bed
 
     # should be quiet for all OK
     checkTableCoords mm39
 
     # should cover %100 entirely:
     featureBits -countGaps mm39 ucscToINSDC
-    # 2482000080 bases of 2482000080 (100.000%) in intersection
+    # 2728222451 bases of 2728222451 (100.000%) in intersection
 
 #########################################################################
-# add chromAlias table (TBD - 2020-05-20 - Hiram)
+# add chromAlias table (DONE - 2020-07-27 - Hiram)
 
     mkdir /hive/data/genomes/mm39/bed/chromAlias
     cd /hive/data/genomes/mm39/bed/chromAlias
 
-    hgsql -N -e 'select chrom,name from ucscToRefSeq;' mm39 \
-        | sort -k1,1 > ucsc.refseq.tab
+    grep -v "^#" ../../genbank/GCA_000001635.9_GRCm39_assembly_report.txt \
+	| awk '{printf "%s\t%s\n", $5,$1}' | sort > ncbi.assembly.txt
+
     hgsql -N -e 'select chrom,name from ucscToINSDC;' mm39 \
         | sort -k1,1 > ucsc.genbank.tab
 
+    join -t$'\t' -1 2 <(sort -k2,2 ucsc.genbank.tab) ncbi.assembly.txt
+
+    # lookup the chrM sequence in the assembly to determine the RefSeq ID:
+    printf "chrM\tNC_005089.1\n" > ucsc.refseq.tab
+
     wc -l *.tab
-    #	2198 ucsc.genbank.tab
+    #  61 ucsc.assembly.tab
+    #  61 ucsc.genbank.tab
+    #   1 ucsc.refseq.tab
 
     ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
         > mm39.chromAlias.tab
+# working: assembly
+# working: genbank
 
-for t in genbank
+for t in assembly genbank
 do
   c0=`cat ucsc.$t.tab | wc -l`
   c1=`grep $t mm39.chromAlias.tab | wc -l`
   ok="OK"
   if [ "$c0" -ne "$c1" ]; then
      ok="ERROR"
   fi
   printf "# checking $t: $c0 =? $c1 $ok\n"
 done
-# checking genbank: 2198 =? 2198 OK
+# checking assembly: 61 =? 61 OK
+# checking genbank: 61 =? 61 OK
 
     # verify chrM is here properly:
     grep chrM mm39.chromAlias.tab 
-# CM022001.1      chrM    genbank
+# AY172335.1      chrM    genbank
+# MT      chrM    assembly
+# NC_005089.1     chrM    refseq
 
     hgLoadSqlTab mm39 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
         mm39.chromAlias.tab
 
 #########################################################################
-# fixup search rule for assembly track/gold table (TBD - 2020-07-17 - Hiram)
-    cd ~/kent/src/hg/makeDb/trackDb/dog/mm39
+# fixup search rule for assembly track/gold table (DONE - 2020-07-27 - Hiram)
+    cd ~/kent/src/hg/makeDb/trackDb/mouse/mm39
     # preview prefixes and suffixes:
     hgsql -N -e "select frag from gold;" mm39 \
-      | sed -e 's/[0-9_.]\+//;' | sort | uniq -c 
-   1037 CM
-    758 REHQ
-
-    # implies a rule: '[CR][ME][HQ0-9]+(\.[0-9_]+)?'
+      | sed -e 's/[0-9.]\+//;' | sort | uniq -c | sed -e 's/^/# /;'
+#   15228 AC
+#     816 AEKQ
+#       8 AEKR
+#       1 AF
+#    3876 AL
+#       1 AY
+#     844 BX
+#     191 CAAA
+#     135 CR
+#     684 CT
+#      63 CU
+#      37 FO
+#       3 FP
+#      29 FQ
+#      14 LO
+#     249 LXEJ
+#      30 MF
+#      44 MG
+#      18 MH
+#       2 MN
+
+    # implies a rule: '[ABCFLM][ACEFGHLNOPQRTUXY][AEKJQR0-9]+(\.[0-9_]+)?'
 
     # verify this rule will find them all and eliminate them all:
     hgsql -N -e "select frag from gold;" mm39 | wc -l
-    # 1795
+    # 22273
 
     hgsql -N -e "select frag from gold;" mm39 \
-       | egrep -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l
+       | egrep -e '[ABCFLM][ACEFGHLNOPQRTUXY][AEKJQR0-9]+(\.[0-9_]+)?' | wc -l
     # 1795
 
     hgsql -N -e "select frag from gold;" mm39 \
-       | egrep -v -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l
+      | egrep -v -e '[ABCFLM][ACEFGHLNOPQRTUXY][AEKJQR0-9]+(\.[0-9_]+)?' | wc -l
     # 0
 
     # hence, add to trackDb/rhesus/mm39/trackDb.ra
 searchTable gold
 shortCircuit 1
-termRegex [CR][ME][HQ0-9]+(\.[0-9_]+)?
+termRegex [ABCFLM][ACEFGHLNOPQRTUXY][AEKJQR0-9]+(\.[0-9_]+)?
 query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
 searchPriority 8
 
     # verify searches work in the position box
 
     git commit -m 'adding search rule for gold/assembly track refs #22271' \
        trackDb.ra
 
 ##########################################################################
-# running repeat masker (DONE - 2020-07-27 - Hiram)
+# running repeat masker (DONE - 2020-07-29 - Hiram)
+    # using new repeat masker version 4.1.0
     mkdir /hive/data/genomes/mm39/bed/repeatMasker
     cd /hive/data/genomes/mm39/bed/repeatMasker
     time  (doRepeatMasker.pl -buildDir=`pwd` \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         -smallClusterHub=ku mm39) > do.log 2>&1
-XXX - running - Mon Jul 27 14:48:56 PDT 2020
-    # real    293m51.353s
+    # real    1175m35.646s
 
     cat faSize.rmsk.txt
-# 2482000080 bases (58500 N's 2481941580 real 1403544550 upper
-#	1078397030 lower) in 2198 sequences in 1 files
-# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
-#	max 124992030 (chrX) median 43246
-# %43.45 masked total, %43.45 masked real
+# 2728222451 bases (73600668 N's 2654621783 real 1461962151 upper
+#	1192659632 lower) in 61 sequences in 1 files
+# Total size: mean 44724958.2 sd 64970951.3 min 1976 (chr4_JH584295v1_random)
+#	max 195154279 (chr1) median 182347
+# %43.72 masked total, %44.93 masked real
 
     egrep -i "versi|relea" do.log
-# RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $
-# grep version of RepeatMasker$ /hive/data/staging/data/RepeatMasker/RepeatMasker
-# February 01 2017 (open-4-0-8) 1.332 version of RepeatMasker
-# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
-# CC    Dfam_Consensus RELEASE 20181026;                            *
-# CC    RepBase RELEASE 20181026;                                   *
+# RepeatMasker version 4.1.0
+# CC    Artefacts RELEASE 20190301;
+# CC    Dfam RELEASE Dfam_3.1;
+
+    sed -e 's/^/# /;' versionInfo.txt 
+# The repeat files provided for this assembly were generated using RepeatMasker.
+#   Smit, AFA, Hubley, R & Green, P.,
+#   RepeatMasker Open-4.1.
+#   1996-2010 <http://www.repeatmasker.org>.
+# 
+# VERSION:
+# RepeatMasker version 4.1.0
+# Search Engine: Crossmatch [ 1.090518 ]
+# Master RepeatMasker Database: /hive/data/staging/data/RepeatMasker191030/Libraries/RepeatMaskerLib.embl ( Complete Database: CONS-Dfam_3.1 )
+# 
+# Building general libraries in: /hive/data/staging/data/RepeatMasker191030/Libraries/CONS-Dfam_3.1/general
+# Building species libraries in: /hive/data/staging/data/RepeatMasker191030/Libraries/CONS-Dfam_3.1/mus_musculus
+#    - 1259 ancestral and ubiquitous sequence(s) for mus musculus
+#    - 121 lineage specific sequence(s) for mus musculus
+# RepeatMasker version 4.1.0
+# CC    Artefacts RELEASE 20190301;                                 *
+# CC    Dfam RELEASE Dfam_3.1;                                      *
+# # RepeatMasker engine: -engine crossmatch -s
+# # RepeatMasker library options: -species 'Mus musculus'
+# 
+# PARAMETERS:
+# /hive/data/staging/data/RepeatMasker191030/RepeatMasker -engine crossmatch -s -align -species 'Mus musculus'
 
     time featureBits -countGaps mm39 rmsk
-    # 1078398935 bases of 2482000080 (43.449%) in intersection
-    # real    0m35.578s
+    # 1192661541 bases of 2728222451 (43.716%) in intersection
+    # real    0m24.596s
 
     # why is it different than the faSize above ?
     # because rmsk masks out some N's as well as bases, the faSize count above
     #   separates out the N's from the bases, it doesn't show lower case N's
 
     # faster way to get the same result on high contig count assemblies:
     time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' mm39 \
         | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
-    #  total 1078398935.000000
-    #  real    0m22.013s
+    # total 1192661541.000000
+    # real    0m22.917s
 
 ##########################################################################
 # running simple repeat (DONE - 2020-07-27 - Hiram)
 
     mkdir /hive/data/genomes/mm39/bed/simpleRepeat
     cd /hive/data/genomes/mm39/bed/simpleRepeat
     time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
         -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
         -trf409=6 mm39) > do.log 2>&1
-XXX - running - Mon Jul 27 14:49:35 PDT 2020
-    # real    7m53.400s
+    # real    78m39.043s
 
     cat fb.simpleRepeat
-    # 42156507 bases of 2337131234 (1.804%) in intersection
+    # 93129149 bases of 2654624157 (3.508%) in intersection
 
-XXX - ready for masking - 2020-07-17
     cd /hive/data/genomes/mm39
     # if using the Window Masker result:
     cd /hive/data/genomes/mm39
 #    twoBitMask bed/windowMasker/mm39.cleanWMSdust.2bit \
 #       -add bed/simpleRepeat/trfMask.bed  mm39.2bit
     #   you can safely ignore the warning about fields >= 13
 
     # add to rmsk after it is done:
     twoBitMask mm39.rmsk.2bit \
         -add bed/simpleRepeat/trfMask.bed mm39.2bit
     #   you can safely ignore the warning about fields >= 13
     twoBitToFa mm39.2bit stdout | faSize stdin > faSize.mm39.2bit.txt
     cat faSize.mm39.2bit.txt
-# 2482000080 bases (58500 N's 2481941580 real 1401386884 upper
-#	1080554696 lower) in 2198 sequences in 1 files
-# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
-#	max 124992030 (chrX) median 43246
-# %43.54 masked total, %43.54 masked real
+# 2728222451 bases (73600668 N's 2654621783 real 1460027726 upper
+#	1194594057 lower) in 61 sequences in 1 files
+# Total size: mean 44724958.2 sd 64970951.3 min 1976 (chr4_JH584295v1_random)
+#	max 195154279 (chr1) median 182347
+# %43.79 masked total, %45.00 masked real
+
 
     rm /gbdb/mm39/mm39.2bit
     ln -s `pwd`/mm39.2bit /gbdb/mm39/mm39.2bit
 
 #########################################################################
-# CREATE MICROSAT TRACK (TBD - 2020-03-31 - Hiram)
+# CREATE MICROSAT TRACK (DONE - 2020-07-27 - Hiram)
     ssh hgwdev
-    mkdir /cluster/data/mm39/bed/microsat
-    cd /cluster/data/mm39/bed/microsat
+    mkdir /hive/data/genomes/mm39/bed/microsat
+    cd /hive/data/genomes/mm39/bed/microsat
 
     awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
          ../simpleRepeat/simpleRepeat.bed > microsat.bed
 
     hgLoadBed mm39 microsat microsat.bed
-    # Read 65981 elements of size 4 from microsat.bed
+    # Read 197239 elements of size 4 from microsat.bed
 
 ##########################################################################
-## WINDOWMASKER (TBD - 2020-03-31 - Hiram)
+## WINDOWMASKER (DONE - 2020-07-28 - Hiram)
 
     mkdir /hive/data/genomes/mm39/bed/windowMasker
     cd /hive/data/genomes/mm39/bed/windowMasker
     time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
         -dbHost=hgwdev mm39) > do.log 2>&1
     # real    90m16.169s
 
     # Masking statistics
     cat faSize.mm39.cleanWMSdust.txt
 # 2482000080 bases (58500 N's 2481941580 real 1630728232 upper 851213348 lower)
 #	in 2198 sequences in 1 files
 # Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
 #	max 124992030 (chrX) median 43246
 # %34.30 masked total, %34.30 masked real
 
+    # completed before rmsk was done, to finish:
+    featureBits -countGaps mm39 rmsk windowmaskerSdust 2> fb.mm39.rmsk.windowmaskerSdust.txt
     cat fb.mm39.rmsk.windowmaskerSdust.txt
-    # 598271411 bases of 2482000080 (24.104%) in intersection
+    # 753903955 bases of 2728222451 (27.634%) in intersection
+
+    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
+        -continue=cleanup -dbHost=hgwdev mm39) > cleanup.log 2>&1
+    # real    1m7.841s
 
 ##########################################################################
-# cpgIslands - (TBD - 2020-04-02 - Hiram)
+# cpgIslands - (DONE - 2020-07-30 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/cpgIslands
     cd /hive/data/genomes/mm39/bed/cpgIslands
     time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
       -workhorse=hgwdev -smallClusterHub=ku mm39) > do.log 2>&1
+XXX - running - Thu Jul 30 09:20:26 PDT 2020
     # real    3m29.034s
 
     cat fb.mm39.cpgIslandExt.txt
     # 47618882 bases of 2481941580 (1.919%) in intersection
 
 ##############################################################################
-# genscan - (TBD - 2020-04-02 - Hiram)
+# genscan - (DONE - 2020-07-30 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/genscan
     cd /hive/data/genomes/mm39/bed/genscan
     time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
       -bigClusterHub=ku mm39) > do.log 2>&1
+XXX - running - Thu Jul 30 09:20:55 PDT 2020
     # real    8m19.775s
 
     # two jobs broken:
 ./runGsBig2M.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed &
 ./runGsBig2M.csh chr34 000 gtf/000/chr34.gtf pep/000/chr34.pep subopt/000/chr34.bed
 wait
     # real    14m27.845s
 
     time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
       -continue=makeBed -bigClusterHub=ku mm39) > makeBed.log 2>&1
     # real    0m45.365s
 
     cat fb.mm39.genscan.txt
     # 57650331 bases of 2481941580 (2.323%) in intersection
 
     cat fb.mm39.genscanSubopt.txt
     # 50129491 bases of 2481941580 (2.020%) in intersection
 
 #########################################################################
-# Create kluster run files (TBD - 2020-04-02 - Hiram)
+# Create kluster run files (DONE - 2020-07-30 - Hiram)
 
     # numerator is mm39 gapless bases "real" as reported by:
     featureBits -noRandom -noHap mm39 gap
-    # 36700 bases of 2353522726 (0.002%) in intersection
+    # 73490654 bases of 2649940489 (2.773%) in intersection
     #                      ^^^
 
     # denominator is hg19 gapless bases as reported by:
     #   featureBits -noRandom -noHap hg19 gap
     #     234344806 bases of 2861349177 (8.190%) in intersection
     # 1024 is threshold used for human -repMatch:
-    calc \( 2353522726 / 2861349177 \) \* 1024
-    #  ( 2353522726 / 2861349177 ) * 1024 = 842.262556
+    calc \( 2649940489 / 2861349177 \) \* 1024
+    #  ( 2649940489 / 2861349177 ) * 1024 = 948.342510
 
-    # ==> use -repMatch=800 according to size scaled down from 1024 for human.
+    # ==> use -repMatch=900 according to size scaled down from 1024 for human.
     #   and rounded down to nearest 50
     cd /hive/data/genomes/mm39
     time blat mm39.2bit \
          /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/mm39.11.ooc \
-        -repMatch=800
-    #	Wrote 34718 overused 11-mers to jkStuff/mm39.11.ooc
-    #	real    0m21.985s
-
-    # canFam3 at repMatch=900:
-    #   Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc
-    #	real    1m11.629s
-
-    #   there are no non-bridged gaps
-    hgsql -N \
-        -e 'select * from gap where bridge="no" order by size;' mm39 \
-
-    # HOWEVER, every gap in this assembly is the same 'within scaffold'
-    # at size 100:
-    hgsql -N -e 'select size from gap where bridge="yes" order by size;'
-     mm39  | sort | uniq -c
-    # 585 100
-
-    # using these gaps to make a lift file
-    # minimum gap size is 100 and produces a reasonable number of lifts
-    gapToLift -verbose=2 -minGap=100 mm39 jkStuff/mm39.nonBridged.lft \
-        -bedFile=jkStuff/mm39.nonBridged.bed
-    wc -l jkStuff/mm39.nonBri*
-    #	2198 jkStuff/mm39.nonBridged.bed
-    #	2198 jkStuff/mm39.nonBridged.lft
+        -repMatch=900
+    #	Wrote 31807 overused 11-mers to jkStuff/mm39.11.ooc
+    #	real    0m23.024s
+
+    # mm10 at repMatch=1000:
+    #	Wrote 27208 overused 11-mers to jkStuff/mm10.11.ooc
+    #	real    2m9.568s
+
+    # survey sizes of non-bridged gaps:
+    hgsql -N -e 'select size from gap where bridge="no" order by size;' \
+	mm39  | sort | uniq -c | sort -k2,2n | sed -e 's/^/# /;'
+#       1 8000
+#      21 10000
+#       2 30000
+#      43 50000
+#       3 60000
+#       1 61000
+#       2 63000
+#       1 66000
+#       1 71000
+#       1 81000
+#      42 100000
+#       1 140000
+#       1 174000
+#       1 300000
+#       1 350000
+#       1 500000
+#      20 2890000
+
+    # and survey the bridged gaps over 5,000 bases:
+    hgsql -N -e 'select size from gap where bridge="yes" and size > 4999;' \
+	mm39  | sort | uniq -c | sort -k2,2n | sed -e 's/^/# /;'
+#       2 5000
+#       1 7000
+#       1 15000
+#       1 15500
+#       1 16000
+#       1 18000
+#       1 18500
+#       1 19208
+#       1 20000
+#       1 25500
+#       1 30000
+#       1 49000
+#      44 50000
+#       1 79000
+#       2 100000
+#       1 135500
+#       1 145000
+#       1 166000
+#       1 200000
+#       1 222000
+#       1 225000
+#       1 285000
+#       1 295000
+#       3 300000
+#       1 360000
+#       1 425000
+#       1 430000
+#       1 522000
+
+    # use gap size of 5000 to construct a lift file:
+    gapToLift -allowBridged -verbose=2 -minGap=5000 mm39 \
+	jkStuff/mm39.5Kgaps.lft -bedFile=jkStuff/mm39.5Kgaps.bed
+    wc -l jkStuff/mm39.5Kgaps*
+    #	176 jkStuff/mm39.5Kgaps.bed
+    #	176 jkStuff/mm39.5Kgaps.lft
+
+    # to see the gaps used:
+    bedInvert.pl chrom.sizes jkStuff/mm39.5Kgaps.bed \
+	| cut -f4 | sort -n | uniq -c | less
 
 ########################################################################
 # lastz/chain/net swap human/hg38 (TBD - 2020-04-10 - Hiram)
 
     # original alignment
     cd /hive/data/genomes/hg38/bed/lastzMm39.2020-04-02
 
     cat fb.hg38.chainMm39Link.txt
     # 1549397508 bases of 3110768607 (49.808%) in intersection
     cat fb.hg38.chainSynMm39Link.txt
     # 1488468205 bases of 3110768607 (47.849%) in intersection
 
     time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
 	hg38 mm39) > rbest.log 2>&1 &
     # real    310m32.196s
@@ -869,104 +995,108 @@
     #	real    50m20.639s
 
     cat fb.mm39.chainMm10Link.txt
     #	772902855 bases of 2481941580 (31.141%) in intersection
     cat fb.mm39.chainSynMm10Link.txt
     #   737924732 bases of 2481941580 (29.732%) in intersection
 
     time (doRecipBest.pl -load -workhorse=hgwdev mm39 mm10 \
       -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 &
     # real    173m38.016s
 
     cat fb.mm39.chainRBest.Mm10.txt
     # 740357755 bases of 2481941580 (29.830%) in intersection
 
 ##############################################################################
-# GENBANK AUTO UPDATE (TBD - 2020-04-09 - Hiram)
+# GENBANK AUTO UPDATE (DONE - 2020-07-30 - Hiram)
     ssh hgwdev
     cd $HOME/kent/src/hg/makeDb/genbank
     git pull
     # /cluster/data/genbank/data/organism.lst shows:
     # organism       mrnaCnt estCnt  refSeqCnt
-    # Canis latrans   2       0       0
-    # Canis lupus     36      0       0
-    # Canis lupus familiaris  3351    382644  1718
-    # Canis lupus laniger     2       0       0
-    # Canis lupus lupus       2       0       0
-    # Canis mesomelas 1       0       0
-    # Canis sp.       45      0       0
-
-    # the latrans is the Coyota, the mesomelas
-    # is the Black-backed jackal from Africa and the langier is the Tibetan wolf
-    # lupus lupus is the Eurasian wolf
-
-    # edit etc/genbank.conf to add mm39 just after canFam3
-
-# mm39 (German shepard - GCA_011100685.1 - UU_Cfam_GSD_1.0)
+    # Mus musculus    581990  4871398 37663
+    # Mus musculus albula     4       0       0
+    # Mus musculus bactrianus 4       0       0
+    # Mus musculus brevirostris       2       0       0
+    # Mus musculus castaneus  28      2       0
+    # Mus musculus domesticus 1703    70      0
+    # Mus musculus kobuvirus  2       0       0
+    # Mus musculus molossinus 38      0       0
+    # Mus musculus musculus   71      4       0
+    # Mus musculus musculus x M. m. castaneus 1       0       0
+    # Mus musculus papillomavirus type 1      10      0       0
+    # Mus musculus picornavirus       3       0       0
+    # Mus musculus wagneri    2       0       0
+
+    # edit etc/genbank.conf to add mm39 just after mm10
+
+# mm39 - (house mouse - GCA_000001635.9 - GRCm39)
 mm39.serverGenome = /hive/data/genomes/mm39/mm39.2bit
 mm39.ooc = /hive/data/genomes/mm39/jkStuff/mm39.11.ooc
-mm39.lift = /hive/data/genomes/mm39/jkStuff/mm39.nonBridged.lft
-mm39.align.unplacedChroms = chrUn_*
+mm39.lift = /hive/data/genomes/mm39/jkStuff/mm39.5Kgaps.lft
 mm39.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
 mm39.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
 mm39.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
 mm39.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
 mm39.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
-mm39.refseq.mrna.native.load = yes
+mm39.downloadDir = mm39
 mm39.refseq.mrna.xeno.load  = yes
-# DO NOT NEED genbank.mrna.xeno except for human, mouse
+mm39.refseq.mrna.xeno.loadDesc = yes
 mm39.genbank.mrna.xeno.load  = yes
-mm39.downloadDir = mm39
+mm39.genbank.mrna.blatTargetDb = yes
 mm39.upstreamGeneTbl = refGene
-mm39.perChromTables = no
+# mm39.mgc = yes
+# mm39.orfeome = yes
+# mm39.ccds.buildId = 21
+# mm39.upstreamMaf = multiz60way /hive/data/genomes/mm39/bed/multiz60way/species.list
 
     # verify the files specified exist before checking in the file:
   grep ^mm39 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og
-# -rw-rw-r-- 1 651703337 Apr  2 08:57 /hive/data/genomes/mm39/mm39.2bit
-# -rw-rw-r-- 1    138880 Apr  2 09:51 /hive/data/genomes/mm39/jkStuff/mm39.11.ooc
-# -rw-rw-r-- 1    139818 Apr  2 09:56 /hive/data/genomes/mm39/jkStuff/mm39.nonBridged.lft
+# -rw-rw-r-- 1    127236 Jul 30 09:23 /hive/data/genomes/mm39/jkStuff/mm39.11.ooc
+# -rw-rw-r-- 1      7714 Jul 30 09:50 /hive/data/genomes/mm39/jkStuff/mm39.5Kgaps.lft
+# -rw-rw-r-- 1 714181470 Jul 30 09:03 /hive/data/genomes/mm39/mm39.2bit
 
-    git commit -m "Added mm39 dog; refs #22271" etc/genbank.conf
+    git commit -m "Added mm39 mouse; refs #22271" etc/genbank.conf
     git push
 
     # update /cluster/data/genbank/:
     make etc-update
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     git pull
     # add mm39 to:
     #   etc/hgwdev.dbs etc/align.dbs
-    git commit -m "Added mm39 - dog refs #22271" etc/hgwdev.dbs etc/align.dbs
+    git commit -m "Added mm39 - mouse refs #22271" etc/hgwdev.dbs etc/align.dbs
     git push
     make etc-update
 
     # wait a few days for genbank magic to take place, the tracks will
     # appear
 
 #############################################################################
-# augustus gene track (TBD - 2020-04-10 - Hiram)
+# augustus gene track (DONE - 2020-07-30 - Hiram)
 
     mkdir /hive/data/genomes/mm39/bed/augustus
     cd /hive/data/genomes/mm39/bed/augustus
     time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
         -species=human -dbHost=hgwdev \
            -workhorse=hgwdev mm39) > do.log 2>&1
-    # real    74m39.734s
+    # real    119m8.866s
 
     cat fb.mm39.augustusGene.txt
-    # 49999966 bases of 2481941580 (2.015%) in intersection
+    # 49120541 bases of 2654624157 (1.850%) in intersection
 
 #########################################################################
 # ncbiRefSeq (TBD - 2019-11-20 - Hiram)
     ### XXX ### Not available on GCA/genbank assemblies
 
     mkdir /hive/data/genomes/mm39/bed/ncbiRefSeq
     cd /hive/data/genomes/mm39/bed/ncbiRefSeq
     # running step wise just to be careful
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       -bigClusterHub=ku -dbHost=hgwdev \
       -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
       refseq vertebrate_mammalian Gorilla_gorilla \
       GCA_008122165.1_Kamilah_GGO_v0 mm39) > download.log 2>&1
     # real    1m37.523s
 
@@ -991,79 +1121,84 @@
     # to the gorilla/mm39/trackDb.ra to turn on the track in the browser
 
     # XXX 2019-11-20 - ready for this after genbank runs
 
     featureBits -enrichment mm39 refGene ncbiRefSeq 
  # refGene 0.402%, ncbiRefSeq 3.148%, both 0.402%, cover 99.90%, enrich 31.73x
     featureBits -enrichment mm39 ncbiRefSeq refGene
  # ncbiRefSeq 3.148%, refGene 0.402%, both 0.402%, cover 12.76%, enrich 31.73x
 
     featureBits -enrichment mm39 ncbiRefSeqCurated refGene
  # ncbiRefSeqCurated 0.401%, refGene 0.402%, both 0.400%, cover 99.66%, enrich 247.79x
 
     featureBits -enrichment mm39 refGene ncbiRefSeqCurated
  # refGene 0.402%, ncbiRefSeqCurated 0.401%, both 0.400%, cover 99.33%, enrich 247.79x
 
-#########################################################################
-# LIFTOVER TO canFam3 (TBD - 2020-04-02 - Hiram)
+##############################################################################
+# LIFTOVER TO mm10 (DONE - 2020-07-30 - Hiram)
     ssh hgwdev
-    mkdir /hive/data/genomes/mm39/bed/blat.canFam3.2020-04-02
-    cd /hive/data/genomes/mm39/bed/blat.canFam3.2020-04-02
+    mkdir /hive/data/genomes/mm39/bed/blat.mm10.2020-07-30
+    cd /hive/data/genomes/mm39/bed/blat.mm10.2020-07-30
     doSameSpeciesLiftOver.pl -verbose=2 \
         -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -query2Bit=/hive/data/genomes/mm10/mm10.2bit \
+        -querySizes=/hive/data/genomes/mm10/chrom.sizes \
         -ooc=/hive/data/genomes/mm39/jkStuff/mm39.11.ooc \
-         mm39 canFam3
+         mm39 mm10
     time (doSameSpeciesLiftOver.pl -verbose=2 \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -query2Bit=/hive/data/genomes/mm10/mm10.2bit \
+        -querySizes=/hive/data/genomes/mm10/chrom.sizes \
         -ooc=/hive/data/genomes/mm39/jkStuff/mm39.11.ooc \
-         mm39 canFam3) > doLiftOverToCanFam3.log 2>&1
-    # real    1100m17.743s
+         mm39 mm10) > doLiftOverToMm10.log 2>&1
+    # real    257m18.898s
 
-    # see if the liftOver menus function in the browser from mm39 to canFam3
+    # see if the liftOver menus function in the browser from mm39 to mm10
 
-#########################################################################
+##############################################################################
 #  BLATSERVERS ENTRY (TBD - 2020-04-02 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
     ssh hgwdev
 
     hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm39", "blat1b", "17904", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("mm39", "blat1b", "17905", "0", "1");' \
 	    hgcentraltest
     #	test it with some sequence
 
 ############################################################################
 ## reset default position to gene: CDH2 upon recommendation from Kerstin
 ##  (TBD - 2020-06-22 - Hiram)
 
     ssh hgwdev
     hgsql -e 'update dbDb set defaultPos="chr7:60683331-61003907"
 	where name="mm39";' hgcentraltest
 
 ##############################################################################
-# crispr whole genome (TBD - 2020-04-09 - Hiram)
+# crispr whole genome (DONE - 2020-07-30 - Hiram)
     mkdir /hive/data/genomes/mm39/bed/crisprAll
     cd /hive/data/genomes/mm39/bed/crisprAll
 
     # the large shoulder argument will cause the entire genome to be scanned
     # this takes a while for a new genome to get the bwa indexing done
     time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \
-    mm39 genscan -shoulder=250000000 -tableName=crisprAll \
+    mm39 augustusGene -shoulder=250000000 -tableName=crisprAll \
     -fileServer=hgwdev \
     -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
-      -workhorse=hgwdev) > ranges.log 2>&1
+      -workhorse=hgwdev) >> ranges.log 2>&1
+XXX - running - Thu Jul 30 10:12:03 PDT 2020
     # real    1m16.539s
 
     time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
        -continue=guides -stop=specScores mm39 genscan \
 	-shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \
     -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
       -workhorse=hgwdev) > specScores.log 2>&1
     # real    6558m26.295s
 
     cat guides/run.time | sed -e 's/^/# /;'
 # Completed: 100 of 100 jobs
 # CPU time in finished jobs:      11979s     199.66m     3.33h    0.14d  0.000 y
 # IO & Wait Time:                   251s       4.18m     0.07h    0.00d  0.000 y
 # Average job time:                 122s       2.04m     0.03h    0.00d
 # Longest finished job:             289s       4.82m     0.08h    0.00d
@@ -1142,31 +1277,30 @@
     # when clean, check in:
     git commit -m 'adding rules for mm39 refs #22271' all.joiner
     git push
     # run up a 'make alpha' in hg/hgTables to get this all.joiner file
     # into the hgwdev/genome-test system
 
     cd /hive/data/genomes/mm39
     time (makeDownloads.pl mm39) > downloads.log 2>&1
     #  real    16m11.233s
 
     #   now ready for pushQ entry
     mkdir /hive/data/genomes/mm39/pushQ
     cd /hive/data/genomes/mm39/pushQ
  time ($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList mm39) > mm39.pushQ.sql 2> stderr.out
     # real    15m2.385s
-XXXX
 
     # remove the tandemDups and gapOverlap from the file list:
     sed -i -e "/tandemDups/d" redmine.mm39.table.list
     sed -i -e "/Tandem Dups/d" redmine.mm39.releaseLog.txt
     sed -i -e "/gapOverlap/d" redmine.mm39.table.list
     sed -i -e "/Gap Overlaps/d" redmine.mm39.releaseLog.txt
 
     #   check for errors in stderr.out, some are OK, e.g.:
   # WARNING: mm39 does not have ucscToRefSeq
   # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqVersion.txt
   # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.bb
   # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.ix
   # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.ixx
   # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/seqNcbiRefSeq.rna.fa
   # WARNING: mm39 does not have seq