src/hg/makeDb/doc/canFam5/initialBuild.txt 571f0f6a9f2eedaa14524d60a93deecb0e3c763e

571f0f6a9f2eedaa14524d60a93deecb0e3c763e
hiram
  Fri Jul 17 17:09:07 2020 -0700
starting canFam5 build refs #25917

diff --git src/hg/makeDb/doc/canFam5/initialBuild.txt src/hg/makeDb/doc/canFam5/initialBuild.txt
new file mode 100644
index 0000000..f4384cf
--- /dev/null
+++ src/hg/makeDb/doc/canFam5/initialBuild.txt
@@ -0,0 +1,1135 @@
+# for emacs: -*- mode: sh; -*-
+
+# This file describes browser build for the canFam5
+#	GCA_005444595.1_UMICH_Zoey_3.1
+
+#  Can use existing photograph (otherwise find one before starting here)
+
+#########################################################################
+#  Initial steps, reuse existing photograph (DONE - 2020-07-17 - Hiram)
+
+# To start this initialBuild.txt document, from a previous assembly document:
+
+mkdir ~/kent/src/hg/makeDb/doc/canFam5
+cd ~/kent/src/hg/makeDb/doc/canFam5
+
+sed -e 's/Fam4/Fam5/g; s/DONE/TBD/g;' \
+   ../canFam4/initialBuild.txt > initialBuild.txt
+
+
+mkdir -p /hive/data/genomes/canFam5/genbank
+cd /hive/data/genomes/canFam5
+
+mkdir -p /hive/data/genomes/canFam5/photo
+cd /hive/data/genomes/canFam5/photo
+
+# Using the photo of Zoey from assembly hub:
+wget --timestamping 'https://raw.githubusercontent.com/KiddLab/zoey_genome_hub/master/zoey2.3/zoey-image-working-lowres-01.png'
+convert -quality 80 zoey-image-working-lowres-01.png canFam5.jpg
+
+cd /hive/data/genomes/canFam5
+printf "photoCreditURL\thttps://genome.med.umich.edu/kidd-lab/
+photoCreditName\tLinda Gates
+" > photoReference.txt
+
+## download from NCBI
+cd /hive/data/genomes/canFam5/genbank
+
+time rsync -L -a -P --stats \
+rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/005/444/595/GCA_005444595.1_UMICH_Zoey_3.1/ ./
+
+# sent 2,018 bytes  received 2,539,028,840 bytes  20,726,782.51 bytes/sec
+# total size is 2,538,401,806  speedup is 1.00
+
+# real    2m1.721s
+
+# this information is from the top of 
+#    canFam5/genbank/*_assembly_report.txt
+#    (aka: canFam5/genbank/GCA_005444595.1_UMICH_Zoey_3.1_assembly_report.txt)
+
+# Assembly name:  UMICH_Zoey_3.1
+# Organism name:  Canis lupus familiaris (dog)
+# Infraspecific name:  breed=Great Dane
+# Isolate:  Zoey
+# Sex:  female
+# Taxid:          9615
+# BioSample:      SAMN04851098
+# BioProject:     PRJNA318403
+# Submitter:      University of Michigan
+# Date:           2019-05-30
+# Assembly type:  haploid (principal pseudohaplotype of diploid)
+# Release type:   major
+# Assembly level: Chromosome
+# Genome representation: full
+# WGS project:    REHQ01
+# Assembly method: FALCON-Unzip v. 1.7.7
+# Expected final version: yes
+# Reference guided assembly: GCA_000002285.2
+# Genome coverage: 50.0x
+# Sequencing technology: PacBio RSII
+# GenBank assembly accession: GCA_005444595.1
+# Linked assembly: GCA_005446665.1 (alternate pseudohaplotype of diploid)
+#
+## Assembly-Units:
+## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
+## GCA_005444745.1              Primary Assembly
+## GCA_005444775.1              non-nuclear
+
+# check assembly size for later reference:
+
+faSize G*1_genomic.fna.gz
+
+# 2343218756 bases (6087522 N's 2337131234 real 1588083192 upper
+#	749048042 lower) in 794 sequences in 1 files
+# Total size: mean 2951157.1 sd 13874454.0 min 1091 (REHQ01000052.1)
+#	max 122894117 (CM016569.1) median 13386
+# %31.97 masked total, %32.05 masked real
+
+# Survey types of gaps:
+
+zcat *gaps.txt.gz | cut -f5 | sort | uniq -c
+      1 gap_type
+    999 within_scaffold
+
+# And total size in gaps:
+zcat *gaps.txt.gz | grep -v "^#" | awk '{print $3-$2+1}' | ave stdin \
+  | sed -e 's/^/# /;'
+# Q1 100.000000
+# median 5000.000000
+# Q3 5000.000000
+# average 6093.603604
+# min 19.000000
+# max 144464.000000
+# count 999
+# total 6087510.000000
+# standard deviation 11823.465922
+
+#############################################################################
+# establish config.ra file (DONE - 2020-07-17 - Hiram)
+    cd /hive/data/genomes/canFam5
+    ~/kent/src/hg/utils/automation/prepConfig.pl canFam5 mammal dog \
+       genbank/*_assembly_report.txt > canFam5.config.ra
+
+    # compare with previous version to see if it is sane:
+    diff canFam5.config.ra ../canFam4/canFam4.config.ra
+
+    # verify it really does look sane
+    cat canFam5.config.ra
+# config parameters for makeGenomeDb.pl:
+db canFam5
+clade mammal
+scientificName Canis lupus familiaris
+commonName Dog
+assemblyDate May 2019
+assemblyLabel University of Michigan
+assemblyShortLabel UMICH_Zoey_3.1
+orderKey 4661
+# mitochondrial sequence included in refseq release
+# mitoAcc CM016608.1
+mitoAcc none
+fastaFiles /hive/data/genomes/canFam5/ucsc/*.fa.gz
+agpFiles /hive/data/genomes/canFam5/ucsc/*.agp
+# qualFiles none
+dbDbSpeciesDir dog
+photoCreditURL  https://genome.med.umich.edu/kidd-lab/
+photoCreditName Linda Gates
+ncbiGenomeId 85
+ncbiAssemblyId 3218611
+ncbiAssemblyName UMICH_Zoey_3.1
+ncbiBioProject 318403
+ncbiBioSample SAMN04851098
+genBankAccessionID GCA_005444595.1
+taxId 9615
+
+#############################################################################
+# setup UCSC named files (DONE - 2020-07-17 - Hiram)
+
+    mkdir /hive/data/genomes/canFam5/ucsc
+    cd /hive/data/genomes/canFam5/ucsc
+
+    # check for duplicate sequences:
+    time faToTwoBit -noMask ../genbank/G*1_genomic.fna.gz genbank.2bit
+    #  real    0m33.050s
+
+    twoBitDup genbank.2bit
+    # no output is a good result, otherwise, would have to eliminate duplicates
+    # the scripts creating the fasta here will be using this genbank.2bit file
+    # remove it later
+
+    # compare gaps with what the gaps.gz file reported:
+    twoBitInfo -nBed genbank.2bit  genbank.gap.bed
+    awk '{print $3-$2}' *.gap.bed | ave stdin | sed -e 's/^/# /;'
+# Q1 100.000000
+# median 5000.000000
+# Q3 5000.000000
+# average 6081.440559
+# min 4.000000
+# max 144464.000000
+# count 1001
+# total 6087522.000000
+# standard deviation 11814.767347
+    # comparing with above, there are 12 bases here that are not
+    # counted in the NCBI gaps file.  See what the AGP says later on here.
+
+    time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \
+      ../genbank/G*1_genomic.fna.gz \
+	../genbank/*_assembly_structure/Primary_Assembly
+CM016569.1 chr1
+CM016570.1 chr2
+CM016571.1 chr3
+CM016572.1 chr4
+CM016573.1 chr5
+CM016574.1 chr6
+CM016575.1 chr7
+CM016576.1 chr8
+CM016577.1 chr9
+CM016578.1 chr10
+CM016579.1 chr11
+CM016580.1 chr12
+CM016581.1 chr13
+CM016582.1 chr14
+CM016583.1 chr15
+CM016584.1 chr16
+CM016585.1 chr17
+CM016586.1 chr18
+CM016587.1 chr19
+CM016588.1 chr20
+CM016589.1 chr21
+CM016590.1 chr22
+CM016591.1 chr23
+CM016592.1 chr24
+CM016593.1 chr25
+CM016594.1 chr26
+CM016595.1 chr27
+CM016596.1 chr28
+CM016597.1 chr29
+CM016598.1 chr30
+CM016599.1 chr31
+CM016600.1 chr32
+CM016601.1 chr33
+CM016602.1 chr34
+CM016603.1 chr35
+CM016604.1 chr36
+CM016605.1 chr37
+CM016606.1 chr38
+CM016607.1 chrX
+
+real    9m9.307s
+
+    time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \
+       ../genbank/*_assembly_structure/Primary_Assembly
+    # processed 754 sequences into chrUn.fa.gz
+    # real    0m7.572s
+
+    # there are no unlocalized in this assembly
+    time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \
+       ../genbank/*_assembly_structure/Primary_Assembly
+
+    # bash syntax here
+    mitoAcc=`grep "^# mitoAcc" ../canFam5.config.ra | awk '{print $NF}'`
+    printf "# mitoAcc %s\n" "$mitoAcc"
+# mitoAcc CM016608.1
+
+    zcat \
+  ../genbank/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \
+     | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp
+
+    cat chrM.agp
+# chrM    1       16756   1       O       REHQ01000040.1  1       16756   +
+    printf ">chrM\n" > chrM.fa
+    twoBitToFa -noMask genbank.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa
+    gzip chrM.fa
+
+    faSize chrM.fa.gz
+# 16756 bases (0 N's 16756 real 16756 upper 0 lower) in 1 sequences in 1 files
+
+    # verify fasta and AGPs agree
+    time faToTwoBit *.fa.gz test.2bit
+    # real    0m47.200s
+
+    cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4
+    # All AGP and FASTA entries agree - both files are valid
+
+    # and no sequence lost from original:
+    twoBitToFa test.2bit stdout | faSize stdin
+# 2343218756 bases (6087522 N's 2337131234 real 2337131234 upper 0 lower)
+#	in 794 sequences in 1 files
+# Total size: mean 2951157.1 sd 13874454.0 min 1091 (chrUn_REHQ01000052v1)
+#	max 122894117 (chr1) median 13386
+
+    # same numbers as above (except for upper/lower masking)
+# 2343218756 bases (6087522 N's 2337131234 real 1588083192 upper
+#	749048042 lower) in 794 sequences in 1 files
+
+    # Verify these AGP files define all the gaps:
+    zgrep -w scaffold *.agp | awk '{print $3-$2+1}' | ave stdin
+# No numerical data column 1 of stdin
+
+    # a chromosome to accession name correspondence can be extracted
+    # from these single line agp files:
+    zgrep -h -v "^#" chr*.agp | cut -f1,6 | sort > ucsc.ncbi.name.equivalence
+    # unfortunately, that is only one type of name correspondence.
+    # there are other names in the assembly report:
+    grep -v "^#" \
+     ../genbank/GCA_005444595.1_UMICH_Zoey_3.1_assembly_report.txt \
+      | awk '{printf "%s\t%s\n", $1,$5}' | sort > ncbi.assembly.name.equivalence
+    # some of those will match also.  Make up a sed command file with
+    # the two different types of names:
+    join -t$'\t' ucsc.ncbi.name.equivalence ncbi.assembly.name.equivalence \
+       | awk '{printf "s/%s/%s/;\n", $3,$1}' > ncbi.ucsc.sed
+    join -v1 -t$'\t' ucsc.ncbi.name.equivalence \
+        ncbi.assembly.name.equivalence \
+           | awk '{printf "s/%s/%s/;\n", $2, $1}' >> ncbi.ucsc.sed
+
+    # these AGP files define no gaps.  What types are there:
+    zgrep -v "^#" \
+       ../genbank/GCA_005444595.1_UMICH_Zoey_3.1_genomic_gaps.txt.gz \
+          | awk '{print $5}' | sort | uniq -c
+#    999 within_scaffold
+
+    # since they are all classified as within scaffold, we can make fake AGP
+    # with just 'contig' gaps.  Using the NCBI names from genbank.2bit,
+    # and translating the first column to the UCSC name:
+    twoBitToFa genbank.2bit stdout \
+       | hgFakeAgp -minContigGap=1 -minScaffoldGap=200000 -singleContigs \
+          stdin stdout | sed -f ncbi.ucsc.sed > canFam5.fake.agp
+
+    # verify this AGP file functions correctly:
+    checkAgpAndFa canFam5.fake.agp test.2bit 2>&1 | tail -4
+    
+    # no longer need these temporary 2bit files
+    rm test.2bit refseq.2bit genbank.2bit genbank.gap.bed
+
+    # Reset the AGP specification in canFam5.config.ra
+agpFiles /hive/data/genomes/canFam5/ucsc/canFam5.fake.agp
+
+#############################################################################
+#  Initial database build (DONE - 2020-07-17 - Hiram)
+
+    # verify sequence and AGP are OK:
+    cd /hive/data/genomes/canFam5
+    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
+         -stop=agp canFam5.config.ra) > agp.log 2>&1
+    # real    1m57.586s
+
+    # then finish it off:
+    time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
+       -fileServer=hgwdev -continue=db canFam5.config.ra) > db.log 2>&1
+    # real    12m45.920s
+
+    # check in the trackDb files created in TemporaryTrackDbCheckout/
+    #    and add canFam5 to trackDb/makefile   refs #25917
+    # fixing up the images reference to canFam5.jpg
+
+    # temporary symlink until masked sequence is available
+    cd /hive/data/genomes/canFam5
+    ln -s `pwd`/canFam5.unmasked.2bit /gbdb/canFam5/canFam5.2bit
+
+#############################################################################
+# verify gap table vs NCBI gap file (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/gap
+    cd /hive/data/genomes/canFam5/bed/gap
+
+    zgrep -v "^#" ../../genbank/G*_gaps.txt.gz \
+	| awk '{printf "%s\t%d\t%d\t%s_%s\n", $1,$2-1,$3,$5,$6}' \
+	| sort -k1,1 -k2,2n > genbank.gap.bed
+
+    # type survey:
+    cut -f4 *.bed | sort | uniq -c
+#    274 within_scaffold_align_genus
+#    725 within_scaffold_paired-ends
+
+    # how much defined by NCBI:
+    awk '{print $3-$2}' *.bed | ave stdin | grep -w total
+    # total 6087510.000000
+
+    # how much in the gap table:
+    hgsql -e 'select * from gap;' canFam5 | awk '{print $4-$3}' \
+	| ave stdin | grep -w total
+    # total 6087522.000000
+
+    # an extra 12 marked in the UCSC AGP file
+
+##############################################################################
+# cpgIslands on UNMASKED sequence (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/cpgIslandsUnmasked
+    cd /hive/data/genomes/canFam5/bed/cpgIslandsUnmasked
+
+    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
+       -tableName=cpgIslandExtUnmasked \
+          -maskedSeq=/hive/data/genomes/canFam5/canFam5.unmasked.2bit \
+             -workhorse=hgwdev -smallClusterHub=ku canFam5) > do.log 2>&1
+    # real    3m30.591s
+
+    cat fb.canFam5.cpgIslandExtUnmasked.txt
+    # 56535294 bases of 2481941580 (2.278%) in intersection
+
+#############################################################################
+# cytoBandIdeo - (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/cytoBand
+    cd /hive/data/genomes/canFam5/bed/cytoBand
+    makeCytoBandIdeo.csh canFam5
+
+#############################################################################
+# run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/idKeys
+    cd /hive/data/genomes/canFam5/bed/idKeys
+
+    time (doIdKeys.pl \
+        -twoBit=/hive/data/genomes/canFam5/canFam5.unmasked.2bit \
+        -buildDir=`pwd` canFam5) > do.log 2>&1 &
+XXX - running - Fri Jul 17 17:01:13 PDT 2020
+    # real    3m22.298s
+
+    cat canFam5.keySignature.txt
+    #  174191aae5515d1114a9d6320b152b1a
+
+#############################################################################
+# gapOverlap (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/gapOverlap
+    cd /hive/data/genomes/canFam5/bed/gapOverlap
+    time (doGapOverlap.pl \
+        -twoBit=/hive/data/genomes/canFam5/canFam5.unmasked.2bit canFam5 ) \
+        > do.log 2>&1 &
+XXX - running - Fri Jul 17 16:56:55 PDT 2020
+    # real    1m49.489s
+
+    # there are only nine:
+    wc -l bed.tab
+    # 9 bed.tab
+    cut -f2- bed.tab
+chr1    41008264        41010364        chr1:41008265-41010364  1000    +      41008264 41010364        0       2       1000,1000       0,1100
+chr17   58049274        58051374        chr17:58049275-58051374 1000    +      58049274 58051374        0       2       1000,1000       0,1100
+... etc ...
+chrX    45160089        45162189        chrX:45160090-45162189  1000    +      45160089 45162189        0       2       1000,1000       0,1100
+
+    cat fb.canFam5.gapOverlap.txt
+    # 16158 bases of 2482000080 (0.001%) in intersection
+
+#############################################################################
+# tandemDups (TBD - 2020-03-31 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/tandemDups
+    cd /hive/data/genomes/canFam5/bed/tandemDups
+    time (~/kent/src/hg/utils/automation/doTandemDup.pl \
+  -twoBit=/hive/data/genomes/canFam5/canFam5.unmasked.2bit canFam5) \
+        > do.log 2>&1 &
+XXX - running - Fri Jul 17 16:57:18 PDT 2020
+    # real    188m34.598s
+
+    cat fb.canFam5.tandemDups.txt
+    # 155315479 bases of 3044872214 (5.101%) in intersection
+
+    bigBedInfo canFam5.tandemDups.bb | sed -e 's/^/#  /;'
+#  version: 4
+#  fieldCount: 13
+#  hasHeaderExtension: yes
+#  isCompressed: yes
+#  isSwapped: 0
+#  extraIndexCount: 0
+#  itemCount: 2,822,307
+#  primaryDataSize: 72,710,994
+#  primaryIndexSize: 292,560
+#  zoomLevels: 9
+#  chromCount: 5335
+#  basesCovered: 1,635,503,835
+#  meanDepth (of bases covered): 14.396921
+#  minDepth: 1.000000
+#  maxDepth: 381.000000
+#  std of depth: 29.341113
+
+#########################################################################
+# ucscToINSDC and ucscToRefSeq table/track (DONE - 2020-07-17 - Hiram)
+    # construct idKeys for the genbank sequence
+    mkdir /hive/data/genomes/canFam5/genbank/idKeys
+    cd /hive/data/genomes/canFam5/genbank/idKeys
+    faToTwoBit ../GCA_*1_genomic.fna.gz canFam5.genbank.2bit
+
+    time (doIdKeys.pl -buildDir=`pwd` \
+        -twoBit=`pwd`/canFam5.genbank.2bit genbankCanFam5)  > do.log 2>&1 &
+    # real    3m30.599s
+
+    cat genbankCanFam5.keySignature.txt
+    #  174191aae5515d1114a9d6320b152b1a
+
+    mkdir /hive/data/genomes/canFam5/bed/chromAlias
+    cd /hive/data/genomes/canFam5/bed/chromAlias
+
+    join -t$'\t' ../idKeys/canFam5.idKeys.txt \
+        ../../genbank/idKeys/genbankCanFam5.idKeys.txt | cut -f2- \
+          | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
+            | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
+               | sort -k1,1 -k2,2n > ucscToINSDC.bed
+
+    # should be same line counts throughout:
+    wc -l * ../../chrom.sizes
+    #   2198 ucscToINSDC.bed
+    #	2198 ../../chrom.sizes
+
+    export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1`
+    echo $chrSize
+    # 23
+    # use the $chrSize in this sed
+    sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
+         | hgLoadSqlTab canFam5 ucscToINSDC stdin ucscToINSDC.bed
+
+    # should be quiet for all OK
+    checkTableCoords canFam5
+
+    # should cover %100 entirely:
+    featureBits -countGaps canFam5 ucscToINSDC
+    # 2482000080 bases of 2482000080 (100.000%) in intersection
+
+#########################################################################
+# add chromAlias table (TBD - 2020-05-20 - Hiram)
+
+    mkdir /hive/data/genomes/canFam5/bed/chromAlias
+    cd /hive/data/genomes/canFam5/bed/chromAlias
+
+    hgsql -N -e 'select chrom,name from ucscToRefSeq;' canFam5 \
+        | sort -k1,1 > ucsc.refseq.tab
+    hgsql -N -e 'select chrom,name from ucscToINSDC;' canFam5 \
+        | sort -k1,1 > ucsc.genbank.tab
+
+    wc -l *.tab
+    #	2198 ucsc.genbank.tab
+
+    ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \
+        > canFam5.chromAlias.tab
+
+for t in genbank
+do
+  c0=`cat ucsc.$t.tab | wc -l`
+  c1=`grep $t canFam5.chromAlias.tab | wc -l`
+  ok="OK"
+  if [ "$c0" -ne "$c1" ]; then
+     ok="ERROR"
+  fi
+  printf "# checking $t: $c0 =? $c1 $ok\n"
+done
+# checking genbank: 2198 =? 2198 OK
+
+    # verify chrM is here properly:
+    grep chrM canFam5.chromAlias.tab 
+# CM022001.1      chrM    genbank
+
+    hgLoadSqlTab canFam5 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
+        canFam5.chromAlias.tab
+
+#########################################################################
+# fixup search rule for assembly track/gold table (DONE - 2020-07-17 - Hiram)
+    cd ~/kent/src/hg/makeDb/trackDb/dog/canFam5
+    # preview prefixes and suffixes:
+    hgsql -N -e "select frag from gold;" canFam5 \
+      | sed -e 's/[0-9_.]\+//;' | sort | uniq -c 
+   1037 CM
+    758 REHQ
+
+    # implies a rule: '[CR][ME][HQ0-9]+(\.[0-9_]+)?'
+
+    # verify this rule will find them all and eliminate them all:
+    hgsql -N -e "select frag from gold;" canFam5 | wc -l
+    # 1795
+
+    hgsql -N -e "select frag from gold;" canFam5 \
+       | egrep -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l
+    # 1795
+
+    hgsql -N -e "select frag from gold;" canFam5 \
+       | egrep -v -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l
+    # 0
+
+    # hence, add to trackDb/dog/canFam5/trackDb.ra
+searchTable gold
+shortCircuit 1
+termRegex [CR][ME][HQ0-9]+(\.[0-9_]+)?
+query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
+searchPriority 8
+
+    # verify searches work in the position box
+
+    git commit -m 'adding search rule for gold/assembly track refs #25917' \
+       trackDb.ra
+
+##########################################################################
+# running repeat masker (DONE - 2020-07-17 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/repeatMasker
+    cd /hive/data/genomes/canFam5/bed/repeatMasker
+    time  (doRepeatMasker.pl -buildDir=`pwd` \
+        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -smallClusterHub=ku canFam5) > do.log 2>&1
+XXX - running - Fri Jul 17 16:57:56 PDT 2020
+    # real    293m51.353s
+
+    cat faSize.rmsk.txt
+# 2482000080 bases (58500 N's 2481941580 real 1403544550 upper
+#	1078397030 lower) in 2198 sequences in 1 files
+# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
+#	max 124992030 (chrX) median 43246
+# %43.45 masked total, %43.45 masked real
+
+    egrep -i "versi|relea" do.log
+# RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $
+# grep version of RepeatMasker$ /hive/data/staging/data/RepeatMasker/RepeatMasker
+# February 01 2017 (open-4-0-8) 1.332 version of RepeatMasker
+# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
+# CC    Dfam_Consensus RELEASE 20181026;                            *
+# CC    RepBase RELEASE 20181026;                                   *
+
+    time featureBits -countGaps canFam5 rmsk
+    # 1078398935 bases of 2482000080 (43.449%) in intersection
+    # real    0m35.578s
+
+    # why is it different than the faSize above ?
+    # because rmsk masks out some N's as well as bases, the faSize count above
+    #   separates out the N's from the bases, it doesn't show lower case N's
+
+    # faster way to get the same result on high contig count assemblies:
+    time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' canFam5 \
+        | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total"
+    #  total 1078398935.000000
+    #  real    0m22.013s
+
+##########################################################################
+# running simple repeat (DONE - 2020-07-17 - Hiram)
+
+    mkdir /hive/data/genomes/canFam5/bed/simpleRepeat
+    cd /hive/data/genomes/canFam5/bed/simpleRepeat
+    time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
+        -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
+        -trf409=6 canFam5) > do.log 2>&1
+    # real    7m53.400s
+
+    cat fb.simpleRepeat
+    # 42156507 bases of 2337131234 (1.804%) in intersection
+
+XXX - ready for masking - 2020-07-17
+    cd /hive/data/genomes/canFam5
+    # if using the Window Masker result:
+    cd /hive/data/genomes/canFam5
+#    twoBitMask bed/windowMasker/canFam5.cleanWMSdust.2bit \
+#       -add bed/simpleRepeat/trfMask.bed  canFam5.2bit
+    #   you can safely ignore the warning about fields >= 13
+
+    # add to rmsk after it is done:
+    twoBitMask canFam5.rmsk.2bit \
+        -add bed/simpleRepeat/trfMask.bed canFam5.2bit
+    #   you can safely ignore the warning about fields >= 13
+    twoBitToFa canFam5.2bit stdout | faSize stdin > faSize.canFam5.2bit.txt
+    cat faSize.canFam5.2bit.txt
+# 2482000080 bases (58500 N's 2481941580 real 1401386884 upper
+#	1080554696 lower) in 2198 sequences in 1 files
+# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
+#	max 124992030 (chrX) median 43246
+# %43.54 masked total, %43.54 masked real
+
+    rm /gbdb/canFam5/canFam5.2bit
+    ln -s `pwd`/canFam5.2bit /gbdb/canFam5/canFam5.2bit
+
+#########################################################################
+# CREATE MICROSAT TRACK (TBD - 2020-03-31 - Hiram)
+    ssh hgwdev
+    mkdir /cluster/data/canFam5/bed/microsat
+    cd /cluster/data/canFam5/bed/microsat
+
+    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
+         ../simpleRepeat/simpleRepeat.bed > microsat.bed
+
+    hgLoadBed canFam5 microsat microsat.bed
+    # Read 65981 elements of size 4 from microsat.bed
+
+##########################################################################
+## WINDOWMASKER (TBD - 2020-03-31 - Hiram)
+
+    mkdir /hive/data/genomes/canFam5/bed/windowMasker
+    cd /hive/data/genomes/canFam5/bed/windowMasker
+    time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
+        -dbHost=hgwdev canFam5) > do.log 2>&1
+    # real    90m16.169s
+
+    # Masking statistics
+    cat faSize.canFam5.cleanWMSdust.txt
+# 2482000080 bases (58500 N's 2481941580 real 1630728232 upper 851213348 lower)
+#	in 2198 sequences in 1 files
+# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1)
+#	max 124992030 (chrX) median 43246
+# %34.30 masked total, %34.30 masked real
+
+    cat fb.canFam5.rmsk.windowmaskerSdust.txt
+    # 598271411 bases of 2482000080 (24.104%) in intersection
+
+##########################################################################
+# cpgIslands - (TBD - 2020-04-02 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/cpgIslands
+    cd /hive/data/genomes/canFam5/bed/cpgIslands
+    time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
+      -workhorse=hgwdev -smallClusterHub=ku canFam5) > do.log 2>&1
+    # real    3m29.034s
+
+    cat fb.canFam5.cpgIslandExt.txt
+    # 47618882 bases of 2481941580 (1.919%) in intersection
+
+##############################################################################
+# genscan - (TBD - 2020-04-02 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/genscan
+    cd /hive/data/genomes/canFam5/bed/genscan
+    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
+      -bigClusterHub=ku canFam5) > do.log 2>&1
+    # real    8m19.775s
+
+    # two jobs broken:
+./runGsBig2M.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed &
+./runGsBig2M.csh chr34 000 gtf/000/chr34.gtf pep/000/chr34.pep subopt/000/chr34.bed
+wait
+    # real    14m27.845s
+
+    time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
+      -continue=makeBed -bigClusterHub=ku canFam5) > makeBed.log 2>&1
+    # real    0m45.365s
+
+    cat fb.canFam5.genscan.txt
+    # 57650331 bases of 2481941580 (2.323%) in intersection
+
+    cat fb.canFam5.genscanSubopt.txt
+    # 50129491 bases of 2481941580 (2.020%) in intersection
+
+#########################################################################
+# Create kluster run files (TBD - 2020-04-02 - Hiram)
+
+    # numerator is canFam5 gapless bases "real" as reported by:
+    featureBits -noRandom -noHap canFam5 gap
+    # 36700 bases of 2353522726 (0.002%) in intersection
+    #                      ^^^
+
+    # denominator is hg19 gapless bases as reported by:
+    #   featureBits -noRandom -noHap hg19 gap
+    #     234344806 bases of 2861349177 (8.190%) in intersection
+    # 1024 is threshold used for human -repMatch:
+    calc \( 2353522726 / 2861349177 \) \* 1024
+    #  ( 2353522726 / 2861349177 ) * 1024 = 842.262556
+
+    # ==> use -repMatch=800 according to size scaled down from 1024 for human.
+    #   and rounded down to nearest 50
+    cd /hive/data/genomes/canFam5
+    time blat canFam5.2bit \
+         /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/canFam5.11.ooc \
+        -repMatch=800
+    #	Wrote 34718 overused 11-mers to jkStuff/canFam5.11.ooc
+    #	real    0m21.985s
+
+    # canFam3 at repMatch=900:
+    #   Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc
+    #	real    1m11.629s
+
+    #   there are no non-bridged gaps
+    hgsql -N \
+        -e 'select * from gap where bridge="no" order by size;' canFam5 \
+
+    # HOWEVER, every gap in this assembly is the same 'within scaffold'
+    # at size 100:
+    hgsql -N -e 'select size from gap where bridge="yes" order by size;'
+     canFam5  | sort | uniq -c
+    # 585 100
+
+    # using these gaps to make a lift file
+    # minimum gap size is 100 and produces a reasonable number of lifts
+    gapToLift -verbose=2 -minGap=100 canFam5 jkStuff/canFam5.nonBridged.lft \
+        -bedFile=jkStuff/canFam5.nonBridged.bed
+    wc -l jkStuff/canFam5.nonBri*
+    #	2198 jkStuff/canFam5.nonBridged.bed
+    #	2198 jkStuff/canFam5.nonBridged.lft
+
+########################################################################
+# lastz/chain/net swap human/hg38 (TBD - 2020-04-10 - Hiram)
+
+    # original alignment
+    cd /hive/data/genomes/hg38/bed/lastzCanFam5.2020-04-02
+
+    cat fb.hg38.chainCanFam5Link.txt
+    # 1549397508 bases of 3110768607 (49.808%) in intersection
+    cat fb.hg38.chainSynCanFam5Link.txt
+    # 1488468205 bases of 3110768607 (47.849%) in intersection
+
+    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
+	hg38 canFam5) > rbest.log 2>&1 &
+    # real    310m32.196s
+
+    cat fb.hg38.chainRBest.CanFam5.txt
+    # 1425406620 bases of 3110768607 (45.822%) in intersection
+
+    # and for the swap:
+    mkdir /hive/data/genomes/canFam5/bed/blastz.hg38.swap
+    cd /hive/data/genomes/canFam5/bed/blastz.hg38.swap
+
+    time (doBlastzChainNet.pl -verbose=2 \
+      /hive/data/genomes/hg38/bed/lastzCanFam5.2020-04-02/DEF \
+        -swap -chainMinScore=3000 -chainLinearGap=medium \
+          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
+            -syntenicNet) > swap.log 2>&1
+    #  real    99m10.990s
+
+    cat fb.canFam5.chainHg38Link.txt
+    # 1493209286 bases of 2481941580 (60.163%) in intersection
+    cat fb.canFam5.chainSynHg38Link.txt
+    # 1448164376 bases of 2481941580 (58.348%) in intersection
+
+    time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \
+	canFam5 hg38) > rbest.log 2>&1 &
+    # real    257m59.713s
+
+    cat fb.canFam5.chainRBest.Hg38.txt
+    # 1425296830 bases of 2481941580 (57.427%) in intersection
+
+###########################################################################
+# lastz/chain/net swap mouse/mm10 (TBD - 2020-04-20 - Hiram)
+
+    # original alignment
+    cat fb.mm10.chainCanFam5Link.txt
+    #	777883731 bases of 2652783500 (29.323%) in intersection
+    cat fb.mm10.chainSynCanFam5Link.txt
+    #   736602602 bases of 2652783500 (27.767%) in intersection
+
+    time (doRecipBest.pl -load -workhorse=hgwdev mm10 canFam5 \
+      -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 &
+    #	real    219m16.168s
+
+    cat fb.mm10.chainRBest.CanFam5.txt
+    # 741307883 bases of 2652783500 (27.945%) in intersection
+
+    mkdir /hive/data/genomes/canFam5/bed/blastz.mm10.swap
+    cd /hive/data/genomes/canFam5/bed/blastz.mm10.swap
+    time (doBlastzChainNet.pl -verbose=2 \
+	/hive/data/genomes/mm10/bed/lastzCanFam5.2020-04-02/DEF \
+	-swap -syntenicNet \
+	-workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
+	-chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 &
+    #	real    50m20.639s
+
+    cat fb.canFam5.chainMm10Link.txt
+    #	772902855 bases of 2481941580 (31.141%) in intersection
+    cat fb.canFam5.chainSynMm10Link.txt
+    #   737924732 bases of 2481941580 (29.732%) in intersection
+
+    time (doRecipBest.pl -load -workhorse=hgwdev canFam5 mm10 \
+      -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 &
+    # real    173m38.016s
+
+    cat fb.canFam5.chainRBest.Mm10.txt
+    # 740357755 bases of 2481941580 (29.830%) in intersection
+
+##############################################################################
+# GENBANK AUTO UPDATE (TBD - 2020-04-09 - Hiram)
+    ssh hgwdev
+    cd $HOME/kent/src/hg/makeDb/genbank
+    git pull
+    # /cluster/data/genbank/data/organism.lst shows:
+    # organism       mrnaCnt estCnt  refSeqCnt
+    # Canis latrans   2       0       0
+    # Canis lupus     36      0       0
+    # Canis lupus familiaris  3351    382644  1718
+    # Canis lupus laniger     2       0       0
+    # Canis lupus lupus       2       0       0
+    # Canis mesomelas 1       0       0
+    # Canis sp.       45      0       0
+
+    # the latrans is the Coyote, the mesomelas
+    # is the Black-backed jackal from Africa and the laniger is the Tibetan wolf
+    # lupus lupus is the Eurasian wolf
+
+    # edit etc/genbank.conf to add canFam5 just after canFam3
+
+# canFam5 (Great Dane - GCA_005444595.1 - UMICH_Zoey_3.1)
+canFam5.serverGenome = /hive/data/genomes/canFam5/canFam5.2bit
+canFam5.ooc = /hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc
+canFam5.lift = /hive/data/genomes/canFam5/jkStuff/canFam5.nonBridged.lft
+canFam5.align.unplacedChroms = chrUn_*
+canFam5.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
+canFam5.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
+canFam5.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
+canFam5.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
+canFam5.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
+canFam5.refseq.mrna.native.load = yes
+canFam5.refseq.mrna.xeno.load = yes
+# DO NOT NEED genbank.mrna.xeno except for human, mouse
+canFam5.genbank.mrna.xeno.load = yes
+canFam5.downloadDir = canFam5
+canFam5.upstreamGeneTbl = refGene
+canFam5.perChromTables = no
+
+    # verify the files specified exist before checking in the file:
+  grep ^canFam5 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og
+# -rw-rw-r-- 1 651703337 Apr  2 08:57 /hive/data/genomes/canFam5/canFam5.2bit
+# -rw-rw-r-- 1    138880 Apr  2 09:51 /hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc
+# -rw-rw-r-- 1    139818 Apr  2 09:56 /hive/data/genomes/canFam5/jkStuff/canFam5.nonBridged.lft
+
+    git commit -m "Added canFam5 dog; refs #25917" etc/genbank.conf
+    git push
+
+    # update /cluster/data/genbank/:
+    make etc-update
+
+    # enable daily alignment and update of hgwdev
+    cd ~/kent/src/hg/makeDb/genbank
+    git pull
+    # add canFam5 to:
+    #   etc/hgwdev.dbs etc/align.dbs
+    git commit -m "Added canFam5 - dog refs #25917" etc/hgwdev.dbs etc/align.dbs
+    git push
+    make etc-update
+
+    # wait a few days for genbank magic to take place, the tracks will
+    # appear
+
+#############################################################################
+# augustus gene track (TBD - 2020-04-10 - Hiram)
+
+    mkdir /hive/data/genomes/canFam5/bed/augustus
+    cd /hive/data/genomes/canFam5/bed/augustus
+    time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
+        -species=human -dbHost=hgwdev \
+           -workhorse=hgwdev canFam5) > do.log 2>&1
+    # real    74m39.734s
+
+    cat fb.canFam5.augustusGene.txt
+    # 49999966 bases of 2481941580 (2.015%) in intersection
+
+#########################################################################
+# ncbiRefSeq (TBD - 2019-11-20 - Hiram)
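+    ### XXX ### Not available on GCA/genbank assemblies
+    ### NOTE: the commands below still carry Gorilla_gorilla / Kamilah_GGO_v0
+    ###   arguments (with mixed GCA_/GCF_ accessions) left over from an earlier
+    ###   build document; adapt them before attempting this step for canFam5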
+
+    mkdir /hive/data/genomes/canFam5/bed/ncbiRefSeq
+    cd /hive/data/genomes/canFam5/bed/ncbiRefSeq
+    # running step wise just to be careful
+    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
+      -bigClusterHub=ku -dbHost=hgwdev \
+      -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
+      refseq vertebrate_mammalian Gorilla_gorilla \
+      GCA_008122165.1_Kamilah_GGO_v0 canFam5) > download.log 2>&1
+    # real    1m37.523s
+
+    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
+      -continue=process -bigClusterHub=ku -dbHost=hgwdev \
+      -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
+      refseq vertebrate_mammalian Gorilla_gorilla \
+      GCF_008122165.1_Kamilah_GGO_v0 canFam5) > process.log 2>&1
+    # real    2m9.450s
+
+    time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
+      -continue=load -bigClusterHub=ku -dbHost=hgwdev \
+      -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \
+      refseq vertebrate_mammalian Gorilla_gorilla \
+      GCF_008122165.1_Kamilah_GGO_v0 canFam5) > load.log 2>&1
+    # real    0m21.982s
+
+    cat fb.ncbiRefSeq.canFam5.txt
+    #  74279781 bases of 2999027915 (2.477%) in intersection
+
+    # add: include ../../refSeqComposite.ra alpha
+    # to the dog/canFam5/trackDb.ra to turn on the track in the browser
+
+    # XXX 2019-11-20 - ready for this after genbank runs
+
+    featureBits -enrichment canFam5 refGene ncbiRefSeq 
+ # refGene 0.402%, ncbiRefSeq 3.148%, both 0.402%, cover 99.90%, enrich 31.73x
+    featureBits -enrichment canFam5 ncbiRefSeq refGene
+ # ncbiRefSeq 3.148%, refGene 0.402%, both 0.402%, cover 12.76%, enrich 31.73x
+
+    featureBits -enrichment canFam5 ncbiRefSeqCurated refGene
+ # ncbiRefSeqCurated 0.401%, refGene 0.402%, both 0.400%, cover 99.66%, enrich 247.79x
+
+    featureBits -enrichment canFam5 refGene ncbiRefSeqCurated
+ # refGene 0.402%, ncbiRefSeqCurated 0.401%, both 0.400%, cover 99.33%, enrich 247.79x
+
+#########################################################################
+# LIFTOVER TO canFam3 (TBD - 2020-04-02 - Hiram)
+    ssh hgwdev
+    mkdir /hive/data/genomes/canFam5/bed/blat.canFam3.2020-04-02
+    cd /hive/data/genomes/canFam5/bed/blat.canFam3.2020-04-02
+    doSameSpeciesLiftOver.pl -verbose=2 \
+        -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -ooc=/hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc \
+         canFam5 canFam3
+    time (doSameSpeciesLiftOver.pl -verbose=2 \
+        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -ooc=/hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc \
+         canFam5 canFam3) > doLiftOverToCanFam3.log 2>&1
+    # real    1100m17.743s
+
+    # see if the liftOver menus function in the browser from canFam5 to canFam3
+
+#########################################################################
+#  BLATSERVERS ENTRY (TBD - 2020-04-02 - Hiram)
+#	After getting a blat server assigned by the Blat Server Gods,
+    ssh hgwdev
+
+    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("canFam5", "blat1b", "17904", "1", "0"); \
+	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
+	VALUES ("canFam5", "blat1b", "17905", "0", "1");' \
+	    hgcentraltest
+    #	test it with some sequence
+
+############################################################################
+## reset default position to gene: CDH2 upon recommendation from Kerstin
+##  (TBD - 2020-06-22 - Hiram)
+
+    ssh hgwdev
+    hgsql -e 'update dbDb set defaultPos="chr7:60683331-61003907"
+	where name="canFam5";' hgcentraltest
+
+##############################################################################
+# crispr whole genome (TBD - 2020-04-09 - Hiram)
+    mkdir /hive/data/genomes/canFam5/bed/crisprAll
+    cd /hive/data/genomes/canFam5/bed/crisprAll
+
+    # the large shoulder argument will cause the entire genome to be scanned
+    # this takes a while for a new genome to get the bwa indexing done
+    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \
+    canFam5 genscan -shoulder=250000000 -tableName=crisprAll \
+    -fileServer=hgwdev \
+    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
+      -workhorse=hgwdev) > ranges.log 2>&1
+    # real    1m16.539s
+
+    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
+       -continue=guides -stop=specScores canFam5 genscan \
+	-shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \
+    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
+      -workhorse=hgwdev) > specScores.log 2>&1
+    # real    6558m26.295s
+
+    cat guides/run.time | sed -e 's/^/# /;'
+# Completed: 100 of 100 jobs
+# CPU time in finished jobs:      11979s     199.66m     3.33h    0.14d  0.000 y
+# IO & Wait Time:                   251s       4.18m     0.07h    0.00d  0.000 y
+# Average job time:                 122s       2.04m     0.03h    0.00d
+# Longest finished job:             289s       4.82m     0.08h    0.00d
+# Submission to last job:           303s       5.05m     0.08h    0.00d
+
+    cat specScores/run.time | sed -e 's/^/# /;'
+# Completed: 3096565 of 3096565 jobs
+# CPU time in finished jobs:  263946983s 4399116.38m 73318.61h 3054.94d  8.370 y
+# IO & Wait Time:              17766691s  296111.52m  4935.19h  205.63d  0.563 y
+# Average job time:                  91s       1.52m     0.03h    0.00d
+# Longest finished job:             851s      14.18m     0.24h    0.01d
+# Submission to last job:        324649s    5410.82m    90.18h    3.76d
+
+# # Number of specScores: 233102255
+
+    ### remember to get back to hgwdev to run this
+    time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \
+       -continue=effScores -stop=load canFam5 genscan \
+    -shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \
+    -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \
+      -workhorse=hgwdev) > load.log 2>&1
+    #  real    932m13.229s
+
+    cat effScores/run.time | sed -e 's/^/# /;'
+# Completed: 25662 of 25662 jobs
+# CPU time in finished jobs:   12763858s  212730.96m  3545.52h  147.73d  0.405 y
+# IO & Wait Time:                144123s    2402.05m    40.03h    1.67d  0.005 y
+# Average job time:                 503s       8.38m     0.14h    0.01d
+# Longest finished job:            4091s      68.18m     1.14h    0.05d
+# Submission to last job:         15067s     251.12m     4.19h    0.17d
+
+    cat offTargets/run.time | sed -e 's/^/# /;'
+# Completed: 154829 of 154829 jobs
+# CPU time in finished jobs:    1805712s   30095.20m   501.59h   20.90d  0.057 y
+# IO & Wait Time:               3128264s   52137.73m   868.96h   36.21d  0.099 y
+# Average job time:                  32s       0.53m     0.01h    0.00d
+# Longest finished job:             273s       4.55m     0.08h    0.00d
+# Submission to last job:          5337s      88.95m     1.48h    0.06d
+
+#########################################################################
+# all.joiner update, downloads and in pushQ - (WORKING - 2019-11-20 - Hiram)
+    cd $HOME/kent/src/hg/makeDb/schema
+    # verify all the business is done for release
+    ~/kent/src/hg/utils/automation/verifyBrowser.pl canFam5
+# 66 tables in database canFam5 - Dog, Canis lupus familiaris
+# verified 55 tables in database canFam5, 11 extra tables, 14 optional tables
+# chainNetRBestHg38     3 optional tables
+# chainNetRBestMm10     3 optional tables
+# chainNetSynHg38       3 optional tables
+# chainNetSynMm10       3 optional tables
+# gapOverlap    1 optional tables
+# tandemDups    1 optional tables
+# 1     chainCanFam3    - extra table
+# 2     chainCanFam3Link        - extra table
+# 3     chainRBestCanFam3       - extra table
+# 4     chainRBestCanFam3Link   - extra table
+# . . . etc . . .
+# 8     crisprAllTargets        - extra table
+# 9     netCanFam3      - extra table
+# 10    netRBestCanFam3 - extra table
+# 11    netSynCanFam3   - extra table
+# 13 genbank tables found
+# verified 28 required tables, 1 missing tables
+# 1     ucscToRefSeq    - missing table
+# hg38 chainNet to canFam5 found 3 required tables
+# mm10 chainNet to canFam5 found 3 required tables
+# hg38 chainNet RBest and syntenic to canFam5 found 6 optional tables
+# mm10 chainNet RBest and syntenic to canFam5 found 3 optional tables
+# liftOver to previous versions: 1, from previous versions: 1
+
+    # fixup all.joiner until this is a clean output
+    joinerCheck -database=canFam5 -tableCoverage all.joiner
+    joinerCheck -database=canFam5 -times all.joiner
+    joinerCheck -database=canFam5 -keys all.joiner
+
+    # when clean, check in:
+    git commit -m 'adding rules for canFam5 refs #25917' all.joiner
+    git push
+    # run up a 'make alpha' in hg/hgTables to get this all.joiner file
+    # into the hgwdev/genome-test system
+
+    cd /hive/data/genomes/canFam5
+    time (makeDownloads.pl canFam5) > downloads.log 2>&1
+    #  real    16m11.233s
+
+    #   now ready for pushQ entry
+    mkdir /hive/data/genomes/canFam5/pushQ
+    cd /hive/data/genomes/canFam5/pushQ
+ time ($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList canFam5) > canFam5.pushQ.sql 2> stderr.out
+    # real    15m2.385s
+XXXX
+
+    # remove the tandemDups and gapOverlap entries from the table list
+    # and from the release log:
+    sed -i -e "/tandemDups/d" redmine.canFam5.table.list
+    sed -i -e "/Tandem Dups/d" redmine.canFam5.releaseLog.txt
+    sed -i -e "/gapOverlap/d" redmine.canFam5.table.list
+    sed -i -e "/Gap Overlaps/d" redmine.canFam5.releaseLog.txt
+
+    #   check for errors in stderr.out, some are OK, e.g.:
+  # WARNING: canFam5 does not have ucscToRefSeq
+  # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqVersion.txt
+  # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.bb
+  # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.ix
+  # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.ixx
+  # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/seqNcbiRefSeq.rna.fa
+  # WARNING: canFam5 does not have seq
+  # WARNING: canFam5 does not have extFile
+
+    # verify that every entry in the file list matches existing files
+    cat redmine.canFam5.file.list | while read L
+do
+  eval ls $L > /dev/null
+done
+    # should be silent, missing files will show as errors
+
+    # verify database tables, how many to expect:
+    wc -l redmine.canFam5.table.list
+    # 52 redmine.canFam5.table.list
+
+    # how many actual:
+    awk -F'.' '{printf "hgsql -N %s -e '"'"'show table status like \"%s\";'"'"'\n", $1, $2}' redmine.canFam5.table.list | sh | wc -l
+    # 52
+
+    # the actual count would be smaller if some tables were missing
+
+    # add the path names of these listing files to the redmine issue,
+    # one in each of the three appropriate entry boxes:
+
+#	/hive/data/genomes/canFam5/pushQ/redmine.canFam5.file.list
+#	/hive/data/genomes/canFam5/pushQ/redmine.canFam5.releaseLog.txt
+#	/hive/data/genomes/canFam5/pushQ/redmine.canFam5.table.list
+
+#########################################################################