src/hg/makeDb/doc/danRer10/initialBuild.txt 56f91678d7671b26ab3d70711429e52c26585a03

56f91678d7671b26ab3d70711429e52c26585a03
hiram
  Mon Nov 6 13:21:34 2023 -0800
adding crispr tracks for danRer10 danRer11 refs #21863

diff --git src/hg/makeDb/doc/danRer10/initialBuild.txt src/hg/makeDb/doc/danRer10/initialBuild.txt
index 0c61414..27d140e 100644
--- src/hg/makeDb/doc/danRer10/initialBuild.txt
+++ src/hg/makeDb/doc/danRer10/initialBuild.txt
@@ -1,603 +1,682 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes browser build for the danRer10
 
 # Assembly Name:  GRCz10
 # Description:    Genome Reference Consortium Zebrafish Build 10
 # Organism name:  Danio rerio
 # Taxid:          7955
 # Submitter:      Genome Reference Consortium
 # Date:           2014-9-2
 # BioSample:      SAMN03020626
 # Assembly type:  haploid
 # Release type:   major
 # Assembly level: Chromosome
 # Genome representation: full
 # GenBank Assembly Accession: GCA_000002035.3 (latest)
 # RefSeq Assembly Accession: GCF_000002035.5 (species-representative latest)
 # RefSeq Assembly and GenBank Assemblies Identical: no
 #
 ## Assembly-Units:
 ## GenBank Unit Accession       RefSeq Unit Accession   Assembly-Unit name
 ## GCA_000000175.3      GCF_000000175.4 Primary Assembly
 ##      GCF_000002055.1 non-nuclear
 
 #############################################################################
 # fetch sequence from new style download directory (DONE - 2015-01-22 - Hiram)
     # now using the new NCBI naming scheme hierarchy
     mkdir -p /hive/data/genomes/danRer10/genbank
     cd /hive/data/genomes/danRer10/genbank
 
     time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.3_GRCz10/ ./
     # real    1m18.054s
 
     # measure what we have here:
     faSize GCA_000002035.3_GRCz10_genomic.fna.gz
 
     # 1371702787 bases (2087465 N's 1369615322 real 696150664 upper
     #   673464658 lower) in 1060 sequences in 1 files
     # Total size: mean 1294059.2 sd 8275856.7 min 650 (KN150525.1)
     #   max 76625712 (CM002888.1) median 18168
     # %49.10 masked total, %49.17 masked real
 
     # note that these totals do not include chrM since the GenBank ftp directory
     # did not include a non-nuclear directory
 
     # same measurement on the individual files:
 
     faSize GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz \
       GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz
     # 1371702787 bases (2087465 N's 1369615322 real 1369615322 upper
     #    0 lower) in 1060 sequences in 26 files
     # Total size: mean 1294059.2 sd 8275856.7 min 650 (KN150525.1)
     #    max 76625712 (CM002888.1) median 18168
     # %0.00 masked total, %0.00 masked real
 
 #############################################################################
 # fixup to UCSC naming scheme (DONE - 2015-01-22 - Hiram)
     mkdir /hive/data/genomes/danRer10/ucsc
     cd /hive/data/genomes/danRer10/ucsc
 
     time ~/kent/src/hg/makeDb/doc/danRer10/ucscCompositeAgp.pl \
         ../genbank/GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly
 
     # this unplaced.pl script is generic enough at this point, it should
     # go into utils/automation/
     time ~/kent/src/hg/makeDb/doc/danRer10/unplaced.pl \
         ../genbank/GCA_000002035.3_GRCz10_assembly_structure/Primary_Assembly
     zcat ../genbank/GCA_000164805.2_Tarsius_syrichta-2.0.1_assembly_structure/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \
 
     # verify we haven't lost anything:
     faSize chr*.fa
     # 1371702787 bases (2087465 N's 1369615322 real 1369615322 upper 0 lower)
     #    in 1060 sequences in 26 files
     # Total size: mean 1294059.2 sd 8275856.7 min 650 (chrUn_KN150525v1)
     #    max 76625712 (chr4) median 18168
     # %0.00 masked total, %0.00 masked real
 
     # same numbers as above.
 
     time gzip chr*.fa chr*.agp
     #  real    7m27.555s
 
 #############################################################################
 #  Initial database build (DONE - 2014-11-20 - Hiram)
 
     cd /hive/data/genomes/danRer10
     cat << '_EOF_' > danRer10.config.ra
 # Config parameters for makeGenomeDb.pl:
 db danRer10
 clade vertebrate
 scientificName Danio rerio
 commonName Zebrafish
 assemblyDate Sep. 2014
 assemblyLabel Genome Reference Consortium Zebrafish Build 10
 assemblyShortLabel Zv10
 orderKey 26173
 fastaFiles /hive/data/genomes/danRer10/ucsc/chr*.fa.gz
 agpFiles /hive/data/genomes/danRer10/ucsc/chr*.agp.gz
 # qualFiles none
 dbDbSpeciesDir zebrafish
 mitoAcc NC_002333.2
 photoCreditURL http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Zebrafish&id=79130
 photoCreditName NHGRI Press Photos
 ncbiGenomeId 50
 ncbiAssemblyId 210611
 ncbiAssemblyName GRCz10
 ncbiBioProject 13922
 genBankAccessionID GCA_000002035.3
 taxId 7955
 '_EOF_'
     # << happy emacs
 
     # verify sequence and AGP are OK:
     time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
          -stop=agp danRer10.config.ra > agp.log 2>&1
     # real    1m21.490s
 
     # then finish it off:
     time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \
        -fileServer=hgwdev -continue=db danRer10.config.ra > db.log 2>&1
     # real    11m8.084s
 
     # check in the trackDb files created here
 
 ##########################################################################
 # running repeat masker (DONE - 2015-01-22 - Hiram)
     mkdir /hive/data/genomes/danRer10/bed/repeatMasker
     cd /hive/data/genomes/danRer10/bed/repeatMasker
     time  (doRepeatMasker.pl -buildDir=`pwd` \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         -smallClusterHub=ku danRer10) > do.log 2>&1
     # real    904m9.491s
     # had to patch ProcessRepeats.pl to get this to finish in the RM step
     # then continuing
     time  (doRepeatMasker.pl -buildDir=`pwd` \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
         -continue=cat -smallClusterHub=ku danRer10) > cat.log 2>&1
     #  real    20m54.276s
 
 
     cat faSize.rmsk.txt
     # 1371719383 bases (2087465 N's 1369631918 real 654384100 upper
     #   715247818 lower) in 1061 sequences in 1 files
     # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
     #    max 76625712 (chr4) median 18160
     # %52.14 masked total, %52.22 masked real
 
     egrep -i "versi|relea" do.log
     #    January 31 2015 (open-4-0-5) version of RepeatMasker
     #  CC   RELEASE 20140131;
 
     time featureBits -countGaps danRer10 rmsk
     # 715370858 bases of 1371719383 (52.151%) in intersection
     # real    0m25.047s
 
     # why is it different than the faSize above ?
     # because rmsk masks out some N's as well as bases, the count above
     #   separates out the N's from the bases, it doesn't show lower case N's
 
 ##########################################################################
 # running simple repeat (DONE 2015-01-22 - Hiram)
 
     mkdir /hive/data/genomes/danRer10/bed/simpleRepeat
     cd /hive/data/genomes/danRer10/bed/simpleRepeat
     time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \
         -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \
         danRer10) > do.log 2>&1
     # real    48m11.853s
 
     cat fb.simpleRepeat
     # 96161783 bases of 1369683683 (7.021%) in intersection
 
     # add to rmsk after it is done:
     cd /hive/data/genomes/danRer10
     twoBitMask danRer10.rmsk.2bit \
         -add bed/simpleRepeat/trfMask.bed danRer10.2bit
     #   you can safely ignore the warning about fields >= 13
 
     twoBitToFa danRer10.2bit stdout | faSize stdin > faSize.danRer10.2bit.txt
     cat faSize.danRer10.2bit.txt
     # 1371719383 bases (2087465 N's 1369631918 real 653073010 upper
     #     716558908 lower) in 1061 sequences in 1 files
     # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
     #    max 76625712 (chr4) median 18160
     # %52.24 masked total, %52.32 masked real
 
     rm /gbdb/danRer10/danRer10.2bit
     ln -s `pwd`/danRer10.2bit /gbdb/danRer10/danRer10.2bit
 
 ##########################################################################
 ## WINDOWMASKER (DONE - 2014-11-21 - Hiram)
 
     mkdir /hive/data/genomes/danRer10/bed/windowMasker
     cd /hive/data/genomes/danRer10/bed/windowMasker
     time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
         -dbHost=hgwdev danRer10) > do.log 2>&1
     # real    81m14.166s
 
     # Masking statistics
     cat faSize.danRer10.cleanWMSdust.txt
     # 1371719383 bases (2087465 N's 1369631918 real 682741767 upper
     #    686890151 lower) in 1061 sequences in 1 files
     # Total size: mean 1292855.2 sd 8272045.0 min 650 (chrUn_KN150525v1)
     #    max 76625712 (chr4) median 18160
     # %50.08 masked total, %50.15 masked real
 
     # run this after rmsk was done
     featureBits -countGaps danRer10 rmsk windowmaskerSdust \
        > fb.danRer10.rmsk.windowmaskerSdust.txt 2>&1
     cat fb.danRer10.rmsk.windowmaskerSdust.txt
     # 530239617 bases of 1371719383 (38.655%) in intersection
 
     # then continuing
     time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \
         -continue=cleanup -dbHost=hgwdev danRer10) > cleanup.log 2>&1
 
 ##########################################################################
 # cpgIslands - (DONE - 2015-01-26 - Hiram)
     mkdir /hive/data/genomes/danRer10/bed/cpgIslands
     cd /hive/data/genomes/danRer10/bed/cpgIslands
     time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \
       -workhorse=hgwdev -smallClusterHub=ku danRer10 > do.log 2>&1
     #  real    24m18.369s
 
     cat fb.danRer10.cpgIslandExt.txt
     # 4485867 bases of 1369683683 (0.328%) in intersection
 
 ##############################################################################
 # cpgIslands on UNMASKED sequence (DONE - 2015-01-22 - Hiram)
     mkdir /hive/data/genomes/danRer10/bed/cpgIslandsUnmasked
     cd /hive/data/genomes/danRer10/bed/cpgIslandsUnmasked
 
     time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \
        -tableName=cpgIslandExtUnmasked \
           -maskedSeq=/hive/data/genomes/danRer10/danRer10.unmasked.2bit \
              -workhorse=hgwdev -smallClusterHub=ku danRer10) > do.log 2>&1
     # real    42m5.784s
 
     cat fb.danRer10.cpgIslandExtUnmasked.txt
     # 28557104 bases of 1369683683 (2.085%) in intersection
 
 #############################################################################
 # cytoBandIdeo - (DONE - 2015-01-22 - Hiram)
     mkdir /hive/data/genomes/danRer10/bed/cytoBand
     cd /hive/data/genomes/danRer10/bed/cytoBand
     makeCytoBandIdeo.csh danRer10
 
 #########################################################################
 # genscan - (DONE - 2015-01-26 - Hiram)
     mkdir /hive/data/genomes/danRer10/bed/genscan
     cd /hive/data/genomes/danRer10/bed/genscan
     time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \
       -bigClusterHub=ku danRer10) > do.log 2>&1
     # real    55m31.303s
 
     cat fb.danRer10.genscan.txt
     # 51655591 bases of 1369683683 (3.771%) in intersection
 
     cat fb.danRer10.genscanSubopt.txt
     # 33381870 bases of 1369683683 (2.437%) in intersection
 
 ########################################################################
 # Create kluster run files (DONE - 2015-01-26 - Hiram)
 
      head -1 faSize.danRer10.2bit.txt 
     # 1371719383 bases (2087465 N's 1369631918 real 653073010 upper
     #   716558908 lower) in 1061 sequences in 1 files
 
     # numerator is danRer10 gapless bases "real"
     # denominator is hg19 gapless bases as reported by:
     #   featureBits -noRandom -noHap hg19 gap
     #     234344806 bases of 2861349177 (8.190%) in intersection
     # 1024 is threshold used for human -repMatch:
     calc \( 1369631918 / 2861349177 \) \* 1024
     # ( 1369631918 / 2861349177 ) * 1024 = 490.154468
 
     # ==> use -repMatch=450 according to size scaled down from 1024 for human.
     #   and rounded down to nearest 50
     cd /hive/data/genomes/danRer10
     time blat danRer10.2bit \
          /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/danRer10.11.ooc \
         -repMatch=450
     # Wrote 47978 overused 11-mers to jkStuff/danRer10.11.ooc
     # real    0m24.503s
 
     #   all the non-bridged gaps here are size 100
     #   check non-bridged gaps to see what the typical size is:
     hgsql -N -e 'select * from gap where bridge="no" order by size;' \
        danRer10 | ave -col=7 stdin
 # Q1 100.000000
 # median 100.000000
 # Q3 100.000000
 # average 100.000000
 # min 100.000000
 # max 100.000000
 # count 2338
 # total 233800.000000
 # standard deviation 0.000000
 
     # for use with the genbank runs
     gapToLift -verbose=2 -minGap=100 danRer10 jkStuff/nonBridged.lft \
         -bedFile=jkStuff/nonBridged.bed
 
 ########################################################################
 # GENBANK AUTO UPDATE (DONE - 2015-01-29 - Hiram)
     ssh hgwdev
     cd $HOME/kent/src/hg/makeDb/genbank
     git pull
     # /cluster/data/genbank/data/organism.lst shows:
     # #organism       mrnaCnt estCnt  refSeqCnt
     # Danio rerio     30176   1488365 14746
 
     # edit etc/genbank.conf to add danRer10 just after danRer7
 
 # danRer10 (zebrafish)
 danRer10.serverGenome = /hive/data/genomes/danRer10/danRer10.2bit
 danRer10.clusterGenome = /hive/data/genomes/danRer10/danRer10.2bit
 danRer10.ooc = /hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc
 danRer10.lift = no
 danRer10.refseq.mrna.native.pslCDnaFilter  = ${finished.refseq.mrna.native.pslCDnaFilter}
 danRer10.refseq.mrna.xeno.pslCDnaFilter    = ${finished.refseq.mrna.xeno.pslCDnaFilter}
 danRer10.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter}
 danRer10.genbank.mrna.xeno.pslCDnaFilter   = ${finished.genbank.mrna.xeno.pslCDnaFilter}
 danRer10.genbank.est.native.pslCDnaFilter  = ${finished.genbank.est.native.pslCDnaFilter}
 danRer10.genbank.mrna.xeno.load  = yes
 danRer10.downloadDir = danRer10
 danRer10.perChromTables = no
 danRer10.refseq.mrna.xeno.load  = yes
 danRer10.mgc = yes
 danRer10.orfeome = yes
 # danRer10.upstreamGeneTbl = ensGene
 # danRer10.upstreamMaf = multiz8way
 # /hive/data/genomes/danRer10/bed/multiz8way/species.lst
 
     git commit -m "Added danRer10; refs #14017" etc/genbank.conf
     git push
     # update /cluster/data/genbank/:
     make etc-update
 
 # Edit src/lib/gbGenome.c to add new species.  Skipped
 
     screen      #  control this business with a screen since it takes a while
     cd /cluster/data/genbank
 
     time ./bin/gbAlignStep -initial danRer10
     # logFile: var/build/logs/2015.01.27-11:27:05.danRer10.initalign.log
     #   real    770m4.239s
 
     #   To re-do, rm the dir first:
     #     /cluster/data/genbank/work/initial.danRer10
 
     # load database when finished
     ssh hgwdev
     cd /cluster/data/genbank
     time ./bin/gbDbLoadStep -drop -initialLoad danRer10
     # logFile: var/dbload/hgwdev/logs/2015.01.28-11:07:50.danRer10.dbload.log
     # real    97m40.551s
 
     # enable daily alignment and update of hgwdev
     cd ~/kent/src/hg/makeDb/genbank
     git pull
     # add danRer10 to:
     #   vi etc/align.dbs etc/hgwdev.dbs
     git commit -m "Added danRer10 - Zebrafish refs #14017" etc/align.dbs etc/hgwdev.dbs
     git push
     make etc-update
 
 #########################################################################
 # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram)
     hgsql -N -e "select frag from gold;" danRer10 | sort -u \
         > /tmp/danRer10.frag.gold.txt
 
 
     export maxLen=`awk '{print length($0)}' /tmp/danRer10.frag.gold.txt | sort -rn | head -1`
     echo "scan to column: $maxLen"
 
 export C=1
 while [ $C -le $maxLen ];
 do
 echo -n " $C: "
 awk '{ print substr($0,'$C',1) }' /tmp/danRer10.frag.gold.txt | sort -u | xargs echo | sed -e 's/ //g'
 C=`echo $C | awk '{print $1+1}'`
 done
  1: ABCFLN
  2: ACKLOPQRTUX
  3: 0123456789B_
  4: 0123456789Z
  5: 0123456789
  6: 0123456789
  7: 0123456789
  8: 0123456789
  9: .0123456789
  10: .0123456789
  11: 0123456789
  12: 0123456789
  13: .
  14: 1
 
     # verify this rule will find them all or eliminate them all:
     hgsql -N -e "select frag from gold;" danRer10 | wc -l
     # 34028
 
     hgsql -N -e "select frag from gold;" danRer10 \
        | egrep -e '[ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?' | wc -l
     # 34028
 
     hgsql -N -e "select frag from gold;" danRer10 \
        | egrep -v -e '[ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?' | wc -l
     # 0
 
     # hence, add to trackDb/zebrafish/danRer10/trackDb.ra
 searchTable gold
 shortCircuit 1
 termRegex [ABCFLN][ACKLOPQRTUX][B0-9_][Z0-9][0-9]+(\.[0-9]+)?
 query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%'
 searchPriority 8
 
 #########################################################################
 # ucscToINSDC table/track (DONE - 2015-01-22 - Hiram)
 
     mkdir /hive/data/genomes/danRer10/bed/ucscToINSDC
     cd /hive/data/genomes/danRer10/bed/ucscToINSDC
     # check for chrM in assembly:
     grep chrM ../../danRer10.agp
 # chrM    1       16596   4       F       NC_002333.2     1       16596   +
 
     # use the accession name from there in this command (blank if none)
     ~/kent/src/hg/utils/automation/ucscToINSDC.sh \
         ../../genbank/GCA_*assembly_structure/Primary_Assembly NC_002333.2
 
     awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \
          | sort > name.coordinate.tab
     join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \
          > ucscToINSDC.bed
     # should all be the same line count:
     wc -l *
 #  1061 name.coordinate.tab
 #  1061 ucscToINSDC.bed
 #  1061 ucscToINSDC.txt
 
     cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1
     # 16
     # use the 16 in this sed
     sed -e "s/21/16/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
          | hgLoadSqlTab danRer10 ucscToINSDC stdin ucscToINSDC.bed
     checkTableCoords danRer10
     # should cover %100 entirely:
     featureBits -countGaps danRer10 ucscToINSDC
     # 1371719383 bases of 1371719383 (100.000%) in intersection
 
 ############################################################################
 #  BLATSERVERS ENTRY (DONE - 2015-03-20 - Hiram)
 #	After getting a blat server assigned by the Blat Server Gods,
      ssh hgwdev
 
      hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("danRer10", "blat4c", "17862", "1", "0"); \
 	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
 	VALUES ("danRer10", "blat4c", "17863", "0", "1");' \
 		hgcentraltest
      #	test it with some sequence
 
 ############################################################################
 # LIFTOVER TO danRer11 (DONE - 2017-10-20 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/danRer10/bed/blat.danRer11.2017-10-20
     cd /hive/data/genomes/danRer10/bed/blat.danRer11.2017-10-20
     time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
 	-ooc=/hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
          danRer10 danRer11) > do.log 2>&1
     # real    194m46.839s
 
     # verify the convert link on the test browser is now active from danRer10 to
     # danRer11
 
 ############################################################################
 # LIFTOVER TO danRer7 (DONE - 2015-01-29 - Hiram)
     ssh hgwdev
     mkdir /hive/data/genomes/danRer10/bed/blat.danRer7.2015-01-29
     cd /hive/data/genomes/danRer10/bed/blat.danRer7.2015-01-29
     time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \
 	-ooc=/hive/data/genomes/danRer10/jkStuff/danRer10.11.ooc \
         -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
          danRer10 danRer7) > do.log 2>&1
     # real    253m49.880s
 
     # verify the convert link on the test browser is now active from danRer10 to
     # danRer7
 
 #########################################################################
 # all.joiner update, downloads and in pushQ - (DONE 2015-02-04 - Hiram)
     cd $HOME/kent/src/hg/makeDb/schema
     # fixup all.joiner until this is a clean output
     joinerCheck -database=danRer10 -keys all.joiner
     # about 50 minutes
     joinerCheck -database=danRer10 -tableCoverage all.joiner
     joinerCheck -database=danRer10 -times all.joiner
 
     cd /hive/data/genomes/danRer10
     time makeDownloads.pl danRer10 > downloads.log 2>&1
     # real    10m5.488s
 
     #   now ready for pushQ entry
     mkdir /hive/data/genomes/danRer10/pushQ
     cd /hive/data/genomes/danRer10/pushQ
     time makePushQSql.pl danRer10 > danRer10.pushQ.sql 2> stderr.out
     # real    5m47.086s
 
     #   check for errors in stderr.out, some are OK, e.g.:
 # WARNING: hgwdev does not have /gbdb/danRer10/wib/gc5Base.wib
 # WARNING: hgwdev does not have /gbdb/danRer10/wib/quality.wib
 # WARNING: hgwdev does not have /gbdb/danRer10/bbi/qualityBw/quality.bw
 # WARNING: danRer10 does not have seq
 # WARNING: danRer10 does not have extFile
 
     #   copy it to hgwbeta
     scp -p danRer10.pushQ.sql qateam@hgwbeta:/tmp
     ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/danRer10.pushQ.sql"
 
     #   in that pushQ entry walk through each entry and see if the
     #   sizes will set properly
 
 #########################################################################
 # UCSC to RefSeq name correspondence (DONE - 2015-04-15 - Hiram)
 
     mkdir /hive/data/genomes/danRer10/bed/ucscToRefSeq
     cd /hive/data/genomes/danRer10/bed/ucscToRefSeq
 
     rsync -avPL \
   rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.3_GRCz10/GCA_000002035.3_GRCz10_assembly_report.txt ./
 
     # this assembly_report has "UCSC-style-name" in column 10
     # but it does not name everything
 
     # columns 5 and 7 are the INSDC and RefSeq names
     # chrMT fixup in the sed
     grep -v "^#" GCA_000002035.3_GRCz10_assembly_report.txt \
       | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' |
         sed -e 's/^na/NC_002333.2/;' | sort > insdc.refSeq.tab
 
     hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' \
       danRer10 | sort > insdc.ucsc.tab
 
     join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \
        | cut -f2- > ucsc.refSeq.tab
 
     # when working perfectly, all these tab files have the same line count:
     wc -l *.tab
 # 1061 insdc.refSeq.tab
 # 1061 insdc.ucsc.tab
 # 1061 ucsc.refSeq.tab
 
     export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
        | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql
     hgLoadSqlTab danRer10 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab
 
     checkTableCoords  danRer10 -table=ucscToRefSeq
     # should cover %100 all bases:
     featureBits -countGaps danRer10 ucscToRefSeq
     # 1371719383 bases of 1371719383 (100.000%) in intersection
 
 #########################################################################
 # add chromAlias table (DONE - 2017-06-22 - Hiram)
 
     mkdir /hive/data/genomes/danRer10/bed/chromAlias
     cd /hive/data/genomes/danRer10/bed/chromAlias
 
     mkdir genbank
     cd genbank
     cp -p /hive/data/inside/ncbi/genomes/genbank/vertebrate_other/Danio_rerio/latest_assembly_versions/GCA_000002035.3_GRCz10/GCA_000002035.3_GRCz10.ncbi.2bit .
     time (doIdKeys.pl GCA_000002035.3_GRCz10 \
       -twoBit=`pwd`/GCA_000002035.3_GRCz10.ncbi.2bit \
          -buildDir=`pwd`) > do.log 2>&1
     # real    0m52.374s
 
     cd /hive/data/genomes/danRer10/bed/chromAlias
     join ../idKeys/danRer10.idKeys.txt \
  genbank/GCA_000002035.3_GRCz10.idKeys.txt \
     | awk '{printf "%s\t%s\n", $2,$3}' | sort > ucsc.genbank.tab
 
     cd /hive/data/genomes/danRer10/bed/chromAlias
     mkdir refseq
     cd refseq
     cp -p /hive/data/inside/ncbi/genomes/refseq/vertebrate_other/Danio_rerio/latest_assembly_versions/GCF_000002035.5_GRCz10/GCF_000002035.5_GRCz10.ncbi.2bit .
     time (doIdKeys.pl GCF_000002035.5_GRCz10 \
       -twoBit=`pwd`/GCF_000002035.5_GRCz10.ncbi.2bit \
          -buildDir=`pwd`) > do.log 2>&1
     # real    0m47.365s
 
     cd /hive/data/genomes/danRer10/bed/chromAlias
 
     join ../idKeys/danRer10.idKeys.txt \
   /hive/users/hiram/idKeys/ensembl/release-85/danio_rerio/Danio_rerio.GRCz10.idKeys.txt \
     | awk '{printf "%s\t%s\n", $2,$3}' | sort > ucsc.ensembl1.tab
 
     ~/kent/src/hg/utils/automation/chromAlias.pl
 
     hgLoadSqlTab danRer10 chromAlias ~/kent/src/hg/lib/chromAlias.sql \
         danRer10.chromAlias.tab
 
-#########################################################################
+##############################################################################
+# crispr whole genome (DONE - 2023-11-04 - Hiram)
+    # redmine issue 21863: https://redmine.soe.ucsc.edu/issues/21863
+
+    mkdir /hive/data/genomes/danRer10/bed/crisprAll
+    cd /hive/data/genomes/danRer10/bed/crisprAll
+
+    # make sure it can get started
+    time (~/kent/src/hg/utils/automation/doCrispr.pl \
+      -stop=guides -buildDir=`pwd` -smallClusterHub=hgwdev danRer10) \
+           > guides.log 2>&1
+    # real    30m39.710s
+    sed -e 's/^/# /;' guides/run.time
+# Completed: 99 of 99 jobs
+# CPU time in finished jobs:       5510s      91.83m     1.53h    0.06d  0.000 y
+# IO & Wait Time:                   273s       4.55m     0.08h    0.00d  0.000 y
+# Average job time:                  58s       0.97m     0.02h    0.00d
+# Longest finished job:              96s       1.60m     0.03h    0.00d
+# Submission to last job:           105s       1.75m     0.03h    0.00d
+
+    # looks good, let it run through the load:
+    time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScoreJobList \
+        -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev danRer10) \
+           > load.log 2>&1
+    # real    2523m24.976s
+
+    sed -e 's/^/# /;' specScores/run.time
+# Completed: 840309 of 840309 jobs
+# CPU time in finished jobs:   46880766s  781346.10m 13022.43h  542.60d  1.487 y
+# IO & Wait Time:               1164411s   19406.85m   323.45h   13.48d  0.037 y
+# Average job time:                  57s       0.95m     0.02h    0.00d
+# Longest finished job:             145s       2.42m     0.04h    0.00d
+# Submission to last job:        116299s    1938.32m    32.31h    1.35d
+
+    sed -e 's/^/# /;'  effScores/run.time
+# Completed: 9536 of 9536 jobs
+# CPU time in finished jobs:    4820886s   80348.09m  1339.13h   55.80d  0.153 y
+# IO & Wait Time:                 41241s     687.36m    11.46h    0.48d  0.001 y
+# Average job time:                 510s       8.50m     0.14h    0.01d
+# Longest finished job:            2057s      34.28m     0.57h    0.02d
+# Submission to last job:          9265s     154.42m     2.57h    0.11d
+
+    sed -e 's/^/# /;'  offTargets/run.time
+# Completed: 42016 of 42016 jobs
+# CPU time in finished jobs:     558233s    9303.88m   155.06h    6.46d  0.018 y
+# IO & Wait Time:                309956s    5165.94m    86.10h    3.59d  0.010 y
+# Average job time:                  21s       0.34m     0.01h    0.00d
+# Longest finished job:              34s       0.57m     0.01h    0.00d
+# Submission to last job:          1102s      18.37m     0.31h    0.01d
+
+
+    # that made the table crispr10K and symlinks in /gbdb/danRer10/crispr10K/
+    # when it should have been instead crisprAll, reset the links and reload
+    # the correct table:
+mkdir -p /gbdb/danRer10/crisprAll/
+rm -f /gbdb/danRer10/crisprAll/crispr.bb
+rm -f /gbdb/danRer10/crisprAll/crisprDetails.tab
+ln -sf `pwd`/crispr.bb /gbdb/danRer10/crisprAll/crispr.bb
+ln -sf `pwd`/crisprDetails.tab /gbdb/danRer10/crisprAll/crisprDetails.tab
+hgBbiDbLink danRer10 crisprAllTargets /gbdb/danRer10/crisprAll/crispr.bb
+
+    hgsql -e 'drop table crispr10K;' danRer10
+
+    grep -c . effScores.tab
+    # 95378380
+    grep -c . specScores.tab
+    # 61805075
+
+    time (cut -f1-3 crispr.bed | bedSingleCover.pl stdin \
+       | awk '{sum+=$3-$2}END{printf "%d bases\n", sum}') \
+            > coverage.crispr.bed.txt 2>&1
+    # 936176533 bases
+    # real    4m42.959s
+    ave -col=2 ../../*.sizes | grep total
+    # total 1371719383.000000
+    # 'featureBits' result:
+    echo "scale+=3; 100.0 * 936176533 / 1371719383" | bc
+    # 68.248
+
+##############################################################################