src/hg/makeDb/doc/mm10.patchUpdate.6.txt dac1d9e2d193f587f633dcfb619103d8dd86d47c

dac1d9e2d193f587f633dcfb619103d8dd86d47c
galt
  Fri Jul 2 13:05:28 2021 -0700
adding text to mm10 description.html about the inclusion of alternate scaffolds on other mouse strains, part of the NCBI mm38 release.

diff --git src/hg/makeDb/doc/mm10.patchUpdate.6.txt src/hg/makeDb/doc/mm10.patchUpdate.6.txt
index 7a9beb9..d4201ac 100644
--- src/hg/makeDb/doc/mm10.patchUpdate.6.txt
+++ src/hg/makeDb/doc/mm10.patchUpdate.6.txt
@@ -1,684 +1,702 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how mm10 was extended with patch sequences and annotations from grcM38P6
 
 ALTS FROM OTHER MOUSE STRAINS IN NCBI RELEASE CONSIDERATIONS
 
 The original NCBI release grcM38 (which we used for the initial mm10 release)
-had dozens alt-scaffolds on 14 mouse strains. Whoever did that assembly manually removed
-those sequences from other strains. When we ran the patch6, we went back to the NCBI source
-and ran our standard build tools. We did not realize that 99 out of 108 scaffolds were
+has dozens of alt-scaffolds on 14 mouse strains. Whoever did that assembly manually removed
+those sequences from other strains from the initial UCSC mm10 release.
+When we ran the patch6, we went back to the NCBI source
+and ran our standard assembly build pipeline. We did not realize that 99 out of 108 scaffolds were
 from the other 14 strains. It did have 9 alt-scaffolds for the native strain C57BL/6J too.
 We did not catch the issue until too late, when QA had already pushed and we received a message from a researcher.
 
 Since it would be a lot of work to go back and redo all of patch6 without the extra mouse strain alts,
 we have decided to proceed. We have updated the README and the mm10 main html page
 to reflect these changes and note the additional non-native strain sequences that appear in the patch 6 release.
 This can be justified since we are having to deal with alt scaffolds anyway
-in our increasingly complex world, and this makes our release more similar to NCBIs.
+in our increasingly complex world, and this makes our release more similar to NCBI's release.
 Those alt-scaffolds were chosen because they can be useful, e.g. genes from other strains
 that are important for medical research.
 
 Currently we have no table to map the alts to their respective strains,
 but it is easy to tell the native from non-native alts since
 all the IDs for the native C57BL/6J alts have the letter K in them.
 Our convention is that the ID follows the chrom they are located on,
 so the native alts look like chrN_KK* or chrN_KZ*.
 
+ALTs that are native to C57BL/6J, the strain used for mm10:
+[mm10]> select * from chromInfo where chrom like "%_K%_alt";
++--------------------+--------+----------------------+
+| chrom              | size   | fileName             |
++--------------------+--------+----------------------+
+| chr1_KK082441_alt  | 456798 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289080_alt | 394982 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289074_alt | 394026 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289078_alt | 390920 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289081_alt | 369973 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289079_alt | 368967 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289075_alt | 322221 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289073_alt | 215264 | /gbdb/mm10/mm10.2bit |
+| chr11_KZ289077_alt | 186144 | /gbdb/mm10/mm10.2bit |
++--------------------+--------+----------------------+
+
+
 ##############################################################################
 # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2021-04-08 - Galt)
 
 
     cd /hive/data/genomes/mm10
     # main 2bit
     time faToTwoBit <(twoBitToFa mm10.2bit stdout) \
            <(twoBitToFa /hive/data/genomes/grcM38P6/grcM38P6.2bit stdout) \
            mm10.p6.2bit
 #real    1m52.859s
 
     # unmasked 2bit
     time twoBitMask -type=.bed mm10.p6.2bit /dev/null mm10.p6.unmasked.2bit
 #real    0m3.104s
 
     # chrom.sizes
     sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcM38P6/chrom.sizes > chrom.sizes.p6
     # chromInfo
     cd /hive/data/genomes/mm10/bed/chromInfo
     awk '{print $1 "\t" $2 "\t/gbdb/mm10/mm10.2bit";}' ../../chrom.sizes.p6 \
       > chromInfo.p6.tab
     wc -l chromInfo*.tab
 #  239 chromInfo.p6.tab
 #   66 chromInfo.tab
 
 
     # Install
     cd /hive/data/genomes/mm10
 
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv mm10.2bit mm10.initial.2bit
     mv mm10.unmasked.2bit mm10.initial.unmasked.2bit
     mv chrom.sizes chrom.sizes.initial
     # End of first-update-only stuff
 
     ln -sf mm10.p6.2bit mm10.2bit
     ln -sf mm10.p6.unmasked.2bit mm10.unmasked.2bit
     ln -sf chrom.sizes.p6 chrom.sizes
 
     cd /hive/data/genomes/mm10/bed/chromInfo
     hgLoadSqlTab mm10 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql chromInfo.p6.tab
 
 
 ##############################################################################
 # Extend main database tables for fileless tracks (DONE - 2021-04-08 - Galt)
     # Just add the patch table rows to the main database tables
     for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do
       echo $table
       hgsql mm10 -e "insert into mm10.$table select * from grcM38P6.$table"
     done
 
 
 ##############################################################################
 # Extend main database gc5BaseBw.bw (DONE - 2021-04-10 - Galt)
 
     cd /hive/data/genomes/mm10/bed/gc5Base/
     # Concatenate original assembly results with grcM38P6 results
     time (zcat mm10.gc5Base.wigVarStep.gz \
         /hive/data/genomes/grcM38P6/bed/gc5Base/grcM38P6.gc5Base.wigVarStep.gz \
       | gzip -c \
       > mm10.p6.gc5Base.wigVarStep.gz)
 #real    5m33.429s
 
     # Make a new gc5BaseBw.bw
     time wigToBigWig mm10.p6.gc5Base.wigVarStep.gz ../../chrom.sizes.p6 \
       mm10.p6.gc5Base.bw
 #real    11m51.723s
 
     # Install
     cd /hive/data/genomes/mm10/bed/gc5Base/
 
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv mm10.gc5Base.wigVarStep.gz mm10.initial.gc5Base.wigVarStep.gz
     mv mm10.gc5Base.bw mm10.initial.gc5Base.bw
     # Not used since bigWig makes wiggle obsolete, but set aside.
     mv mm10.gc5Base.wib mm10.initial.gc5Base.wib
     mv mm10.gc5Base.wig.gz mm10.initial.gc5Base.wig.gz
     # The .wib and .wig.gz are obsolete, remove them from the downloads.
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wib
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wig.gz
     # End of first-update-only stuff
 
     ln -sf mm10.p6.gc5Base.wigVarStep.gz mm10.gc5Base.wigVarStep.gz
     ln -sf mm10.p6.gc5Base.bw mm10.gc5Base.bw
 
     # Because of this symlink, browser track gc5BaseBw has been automatically updated:
     # /gbdb/mm10/bbi/gc5Base.bw -> /cluster/data/mm10/bed/gc5Base/mm10.gc5Base.bw
 
 
 
 
 ##############################################################################
 # Extend main database download files (DONE - 2021-04-19 - Galt)
 
     cd /hive/data/genomes/mm10/goldenPath/bigZips
 
 
     # FIRST TIME ONLY SECTION
     # mm10 was made so long ago that several things are missing from downloads.
 
     # mm10.agp.gz was missing, so here it is.
     cat /hive/data/genomes/mm10/mm10.agp | gzip -c > mm10.agp.gz
     # link the agp to downloads
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.agp.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.agp.gz
 
     # mm10.fa.gz was missing, so here it is.
     twoBitToFa ../../mm10.initial.2bit stdout \
     | gzip -c > mm10.fa.gz
 
     # mm10.fa.masked.gz was missing, so here it is.
     twoBitToFa ../../mm10.initial.2bit stdout \
     | maskOutFa stdin hard stdout \
     | gzip -c > mm10.fa.masked.gz
     # link the fa.masked.gz to downloads
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.masked.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.masked.gz
 
     # mm10.fa.out.gz RepeatMasker .out was missing:
     rm -rf out && mkdir out && cd out
     tar xvzf ../chromOut.tar.gz
     head -3 1/chr1.fa.out > ../mm10.fa.out
     for f in */*.fa.out; do
       tail -n +4 $f >> ../mm10.fa.out
     done
     gzip ../mm10.fa.out 
     cd ..
     rm -r out
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.out.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.out.gz
 
     # mm10.trf.bed.gz TRF output was missing:
     rm -rf trfMaskChrom
     rm -f m10.trf.bed
     tar xvzf chromTrf.tar.gz
     cd trfMaskChrom
     for f in *.bed; do
       cat $f >> ../mm10.trf.bed
     done
     gzip ../mm10.trf.bed
     cd ..
     rm -r trfMaskChrom
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.trf.bed.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.trf.bed.gz
 
     # RepeatMasker .align file was missing:
     ln -s /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz .
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.align.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.align.gz
 
     # END FIRST TIME ONLY SECTION
 
     mkdir p6
     # mm10.2bit and chrom.sizes were already extended above.
     ln -sf /hive/data/genomes/mm10/mm10.p6.2bit p6/
     ln -sf /hive/data/genomes/mm10/chrom.sizes.p6 p6/mm10.p6.chrom.sizes
 
     # AGP:
     zcat mm10.agp.gz \
          /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz \
     | grep -v ^# \
     | gzip -c > p6/mm10.p6.agp.gz
 
     # FASTA
     twoBitToFa ../../mm10.p6.2bit stdout \
     | gzip -c > p6/mm10.p6.fa.gz
 
     twoBitToFa ../../mm10.p6.2bit stdout \
     | maskOutFa stdin hard stdout \
     | gzip -c > p6/mm10.p6.fa.masked.gz
 
 
     # RepeatMasker (don't include header of patch file):
     cat <(zcat mm10.fa.out.gz) \
         <(zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz | tail -n +4) \
     | gzip -c > p6/mm10.p6.fa.out.gz
 
     # SimpleRepeats/TRF:
     zcat mm10.trf.bed.gz \
          /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz \
     | gzip -c > p6/mm10.p6.trf.bed.gz
     # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase:
     zcat mm10.trf.bed.gz | cut -f 1 | uniq | wc -l
 #62
     zcat p6/mm10.p6.trf.bed.gz | cut -f 1 | uniq | wc -l
 #235
 
     # mm10 also has download files with the old tar-bundle structure -- update those too.
     # Per-chrom AGP:
     rm -rf agp && mkdir agp && cd agp
     tar xvzf ../chromAgp.tar.gz
 
     splitFileByColumn -chromDirs -ending=.agp \
       /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz .
     tar cvzf ../p6/mm10.p6.chromAgp.tar.gz *
     cd ..
     rm -r agp
 
     # Per-chrom soft-masked FASTA:
     rm -rf chroms && mkdir chroms && cd chroms
     tar xvzf ../chromFa.tar.gz
     cd ..
     faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.gz chroms/
     ls -1 chroms | wc -l
 #239
     tar cvzf p6/mm10.p6.chromFa.tar.gz ./chroms
     rm -r chroms
 
     # Per-chrom hard-masked FASTA:
     rm -rf maskedChroms && mkdir maskedChroms && cd maskedChroms
     tar xvzf ../chromFaMasked.tar.gz
     cd ..
     faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.masked.gz maskedChroms/
     cd maskedChroms
     for f in *.fa; do
       mv $f $f.masked
     done
     cd ..
     ls -1 maskedChroms | wc -l
 #239
     tar cvzf p6/mm10.p6.chromFaMasked.tar.gz ./maskedChroms
     rm -r maskedChroms
 
     # Per-chrom RepeatMasker .out:
     rm -rf out && mkdir out && cd out
     tar xvzf ../chromOut.tar.gz
     zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \
     | head -3 > RepeatMaskerHeader.txt
     zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \
     | tail -n +4 \
     | splitFileByColumn -col=5 -chromDirs -head=RepeatMaskerHeader.txt -ending=.out \
       stdin .
     rm RepeatMaskerHeader.txt
     tar cvzf ../p6/mm10.p6.chromOut.tar.gz *
     cd ..
     rm -r out
 
     # Per-chrom TRF output:
     rm -rf trfMaskChrom
     tar xvzf chromTrf.tar.gz
     cd trfMaskChrom
     splitFileByColumn -ending=.bed \
       /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz .
     cd ..
     tar cvzf p6/mm10.p6.chromTrf.tar.gz ./trfMaskChrom
     rm -rf trfMaskChrom
 
     # RepeatMasker .align files:
     zcat mm10.fa.align.gz /hive/data/genomes/grcM38P6/bed/repeatMasker/grcM38P6.fa.align.gz \
     | gzip -c > p6/mm10.p6.fa.align.gz
 
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.bw p6/mm10.p6.gc5Base.bw
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.wigVarStep.gz p6/mm10.p6.gc5Base.wigVarStep.gz
 
     # TODO: regenerate upstream* files for p6
           # note skipping since these were never updated for hg19 and hg38.
 
     # Make new md5sum.txt
     cd p6
     md5sum mm10.* > md5sum.txt
 
     # p6 is now the latest
     # Update latest subdir
     cd /hive/data/genomes/mm10/goldenPath/bigZips
     mv latest latest.bak
     mkdir latest
     cd latest
     for f in ../p6/*; do
       noV=$(basename $(echo $f | sed -re 's/\.p6//;'))
       ln -s $f $noV
     done
     rm md5sum.txt
     cat ../p6/md5sum.txt | sed -e 's/\.p6//;' > md5sum.txt
     echo "GRCm38.p6" > LATEST_VERSION
     cd ..
     rm -rf latest.bak
 
 
     # Install
     cd /hive/data/genomes/mm10/goldenPath/bigZips
 
     # For the first update only, move initial release files to initial/.  Don't do this next update!
     mkdir initial
     mv chrom* mm10.* up* md5sum.txt initial/
     ln -sf /hive/data/genomes/mm10/mm10.initial.2bit initial/mm10.2bit
     ln -sf /hive/data/genomes/mm10/chrom.sizes.initial initial/mm10.chrom.sizes
     ln -sf /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz initial/mm10.fa.align.gz
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.bw initial/mm10.gc5Base.bw
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.wigVarStep.gz initial/mm10.gc5Base.wigVarStep.gz
     # Make new md5sum.txt  # since we created many new files
     cd initial
     md5sum chrom* mm10.* up* > md5sum.txt
     cd ..
     # Replace top-level files with links to files
     ln -sf initial/* .
     # End of first-update-only stuff
 
     # Edit README.txt
     cp README.txt README.txt.1
     vi README.txt
 
     # Update /htdocs-hgdownload files with links
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/
 
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/README.txt.*
 
     # TODO: /hive/data/genomes/mm10/goldenPath/chromosomes/
           # note skipping since these were never updated for hg19 and hg38.
 
 
 
 #############################################################################
 # NO analysisSet
 #   Because NCBI defines the analysis sets for full and no_alts for hg19 and hg38,
 #   yet no such sets exist for mouse mm10.p6 or mm39, there is no need for the analysis set.
 #############################################################################
 
 
 
 #############################################################################
 # Build perSeqMax file for gfServer (hgBlat) (DONE 2021-04-20 Galt)
     # When the blat server is restarted with the updated mm10.2bit file,
     # mm10.altsAndFixes needs to be copied over along with the new mm10.2bit file,
     # and gfServer needs to be restarted with -perSeqMax=mm10.altsAndFixes.
     cd /hive/data/genomes/mm10
     cut -f 1 chrom.sizes.p6 \
     | grep -E '_(alt|fix|hap.*)$' \
     | sed -re 's/^/mm10.2bit:/;' \
       > mm10.altsAndFixes.p6
     # Link for blat server installation convenience:
     ln -sf mm10.altsAndFixes.p6 altsAndFixes
 
 
 #############################################################################
 # Extend cytoBandIdeo (DONE 2021-04-20 Galt)
     cd /hive/data/genomes/mm10/bed/cytoband
     tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcM38P6/chrom.sizes \
       > cytoBand.p6.tab
     hgLoadSqlTab -oldTable mm10 cytoBandIdeo - cytoBand.p6.tab
 
 
 #########################################################################
 # Regenerate idKeys with extended mm10 (DONE - 2021-04-20 - Galt)
     mkdir /hive/data/genomes/mm10/bed/idKeys.p6
     cd /hive/data/genomes/mm10/bed/idKeys.p6
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \
       -twoBit=/hive/data/genomes/mm10/mm10.p6.unmasked.2bit \
       -bigClusterHub=ku -smallClusterHub=ku \
         -buildDir=`pwd` mm10) > do.log 2>&1 &
     tail -f do.log
 #real    0m53.546s
     cat mm10.keySignature.txt
 #b0ae7eaccca6031259f2c64be217338f
 
     # Install
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv /hive/data/genomes/mm10/bed/idKeys{,.initial}
 
     cd /hive/data/genomes/mm10/bed/
     rm -f idKeys
     ln -s idKeys.p6 idKeys
 
 
 ##############################################################################
 # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 2021-04-21 Galt)
 
     # need to have idKeys for the genbank and refseq assemblies:
     mkdir -p /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6
     cd /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6
 
     # NOTE genbank subversion is .8 but refseq subversion is .26
 
     ln -s /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_genomic.fna.gz .
     faToTwoBit GCA_000001635.8_GRCm38.p6_genomic.fna.gz genbankP6.2bit
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP6.2bit \
         -bigClusterHub=ku -smallClusterHub=ku \
         genbankP6) > do.log 2>&1
 #real    0m58.734s
 
     mkdir /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6
     cd /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6
     ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz ./
     faToTwoBit GCF_000001635.26_GRCm38.p6_genomic.fna.gz refseqP6.2bit
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP6.2bit \
         -bigClusterHub=ku -smallClusterHub=ku \
         refseqP6) > do.log 2>&1
 #real    0m55.019s
 
     # with the three idKeys available, join them to make the table bed files:
     cd /hive/data/genomes/mm10/bed/ucscToINSDC
     sed -re 's/gi\|[0-9]+\|gb\|([A-Z0-9.]+)\|/\1/' genbankP6/genbankP6.idKeys.txt \
     | join -t$'\t' ../idKeys/mm10.idKeys.txt - \
     | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
     | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
     | sort -k1,1 -k2,2n > ucscToINSDC.p6.bed
 
     join -t$'\t' ../idKeys/mm10.idKeys.txt refseqP6/refseqP6.idKeys.txt \
     | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
     | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
     | sort -k1,1 -k2,2n > ucscToRefSeq.p6.bed
 
     # loading tables:
     export db=mm10
 
     export chrSize=`cut -f1 ucscToINSDC.p6.bed | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
     | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p6.bed
 
     export chrSize=`cut -f1 ucscToRefSeq.p6.bed | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
     | sed -e 's/INSDC/RefSeq/g;' \
     | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p6.bed
 
     # must be exactly 100% coverage
     featureBits -countGaps ${db} ucscToINSDC
 #2818974548 bases of 2818974548 (100.000%) in intersection
 
     # except for chrM (no refSeq):
     featureBits -countGaps ${db} ucscToRefSeq
 #2818974548 bases of 2818974548 (100.000%) in intersection
 
     # construct chromAlias:
     cd /hive/data/genomes/mm10/bed/chromAlias
     hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \
     | sort -k1,1 > ucsc.refseq.p6.tab
     hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \
     | sort -k1,1 > ucsc.genbank.p6.tab
     # add NCBI sequence names from assembly report
     grep -v ^# \
       /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_report.txt \
     | tawk '{print $5, $1;}' | sort \
       > genbankToAssembly.txt
     tawk '{print $2, $1;}' ucsc.genbank.p6.tab | sort \
     | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \
     | sort -k1,1 > ucsc.assembly.p6.tab
 
     ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.p6.tab \
     | sed -re 's/\.p6//;' \
         > ${db}.chromAlias.p6.tab
 
     # verify all there:
     for t in refseq genbank assembly
 do
   c0=`cat ucsc.$t.p6.tab | wc -l`
   c1=`grep $t mm10.chromAlias.p6.tab | wc -l`
   ok="OK"
   if [ "$c0" -ne "$c1" ]; then
      ok="ERROR"
   fi
   printf "# checking $t: $c0 =? $c1 $ok\n"
 done
 # checking refseq: 239 =? 239 OK
 # checking genbank: 239 =? 239 OK
 # checking assembly: 239 =? 239 OK
 
     hgLoadSqlTab mm10 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.p6.tab
 
 
 ############################################################################
 # altLocations and fixLocations (DONE - 2021-04-21 - Galt)
 
     # indicate corresponding locations between haplotypes and reference
 
     mkdir /hive/data/genomes/mm10/bed/altLocations.p6
     cd /hive/data/genomes/mm10/bed/altLocations.p6
 
     # NOTE below the ALT_* directories ONLY appear if the initial genome release had Alt haps.  mm10 did NOT.
     #  {ALT_*,PATCHES}
 
     # NOTE I modified this one to treat mm10 like hg19, i.e. haplotype IDs have no subversion e.g. v1, v2, v3 ...
     # I also added some code to handle mouse scaffold MMCHR pattern. Also, hg19 uses lower case ids, but not mm10.
     ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl 
     # I committed and pushed my changes.
 
     ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \
       -db=mm10 \
       /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
     | sort -k1,1 -k2n,2n \
       > altAndFixLocations.bed
     wc -l altAndFixLocations.bed
 #148 altAndFixLocations.bed
     grep _alt altAndFixLocations.bed > altLocations.bed
     grep _fix altAndFixLocations.bed > fixLocations.bed
     hgLoadBed mm10 altLocations{,.bed}
 #Read 20 elements of size 4 from altLocations.bed
     hgLoadBed mm10 fixLocations{,.bed}
 #Read 130 elements of size 4 from fixLocations.bed
     featureBits -countGaps mm10 altLocations
 #6171331 bases of 2818974548 (0.219%) in intersection
     featureBits -countGaps mm10 fixLocations
 #45238554 bases of 2818974548 (1.605%) in intersection
 
 #############################################################################
 # Check for new chrX alts/patches to add to par 
 # The mouse PAR is not as easy to characterize as the human PARs, skipping.
 
 
 ##############################################################################
 # altSeqLiftOver (DONE 2021-04-23 Galt)
     mkdir /hive/data/genomes/mm10/bed/altSeqLiftOver.p6
     cd /hive/data/genomes/mm10/bed/altSeqLiftOver.p6
     # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names
     hgsql mm10 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \
     | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed
     cp /dev/null altToChrom.noScore.psl
     for f in /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alignments/*.gff; do
       e=$(basename $f .gff | sed -e 's/_/|/g;')
       s=$(grep -E $e gbToUcsc.sed)
       sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \
         | pslPosTarget stdin stdout \
         >> altToChrom.noScore.psl
     done
     pslCheck altToChrom.noScore.psl
 #checked: 90 failed: 0 errors: 0
     time pslRecalcMatch altToChrom.noScore.psl ../../mm10.2bit{,} altToChrom.psl
 #real    0m35.138s
     pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl
     sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \
       > altAndPatches.psl
     grep _alt altAndPatches.psl > altSeqLiftOver.psl
     grep _fix altAndPatches.psl > fixSeqLiftOver.psl
 
 
     # Load tables
     hgLoadPsl mm10 -table=altSeqLiftOverPsl altSeqLiftOver.psl
     hgLoadPsl mm10 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl
 
     # Make chrom-to-alt PSL file for genbank process.
     ln -f -s `pwd`/chromToAlt.psl \
       /hive/data/genomes/mm10/jkStuff/mm10.p6.alt.psl
 
     # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences
     # Exclude alts that were already in mm10 before p6.
     cut -f 1 ../../chrom.sizes.initial | grep _ \
     | grep -vwf - chromToAlt.psl \
     | pslToChain stdin stdout \
     | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToPatch.p6.over.chain
     grep chain ../../jkStuff/mm10.mainToPatch.p6.over.chain | wc -l
 #90
 
     # also make a liftOver that includes the original alts, for tracks that have
     # annotations only on main chromosomes.  Exclude alt-to-fix alignments.
     # This is necessary only for the first time we add a patch update.
     awk '($14 !~ /_/)' chromToAlt.psl \
     | pslToChain stdin stdout \
     | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain
     grep chain ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain | wc -l
 #88
 
 
 #########################################################################
 # ncbiRefSeq.p6 Genes (DONE - 2021-04-22 - Galt)
 
     mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22
     cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       GCF_000001635.26_GRCm38.p6 mm10) > do.log 2>&1 &
     tail -f do.log
 # *** All done !  Elapsed time: 5m14s
 #real    11m4.338s
 
     cat fb.ncbiRefSeq.mm10.txt
 #123082179 bases of 2739603606 (4.493%) in intersection
 
 ##############################################################################
 # Extend ENCODE Registry of Candidate cis-Regulatory Elements (DONE - 2021-05-26 - Galt)
 #
 #
 # From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab)
 # Data contacts:  Henry Pratt, Jill Moore, Zhiping Weng PI
 #
 # RM #24668
 #
 #   http://gcp.wenglab.org/hubs/integrative1/data/mm10/cta/mm10-ccres.bigbed
 # for Scores
 #   https://users.wenglab.org/moorej3/mouse-maxz-dnase.txt.gz  
 
 cd /hive/data/outside/encode3/ccre/mouse
 
 f=encodeCcreCombined
 lib=~/kent/src/hg/lib
 
 # input is $f.bed
 liftOver -tab -multiple -bedPlus=9 -noSerial $f.bed \
   /hive/data/genomes/mm10/jkStuff/mm10.mainToAllAltPatch.p6.over.chain \
   $f.p6.bed /dev/null
 sort -k1,1 -k2n,2n $f.bed $f.p6.bed > $f.plusP6.bed
 
 bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.plusP6.bed /hive/data/genomes/mm10/chrom.sizes $f.bb
 
 ln -fs `pwd`/$f.bb /gbdb/mm10/encode3/ccre
 
 
 
 ##############################################################################
 # Extend Single Cell RNA-Seq Gene Expression from Tabula Muris  (DONE - 2021-06-15 - Galt)
 #
 # REJECTED: this is Max's track and he did not think it was worth the effort.
 
 
 ##############################################################################
 # GRC Incident Database (DONE - 2021-06-15 - Galt)
 
     # Wait until the updated mm10 files have been pushed to RR because GRC Incident update is
     # automated.  Then update the file used to map GRC's RefSeq accessions to our names:
     hgsql mm10 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \
       > /hive/data/outside/grc/incidentDb/GRCm38/refSeq.chromNames.tab
 
 
 ##############################################################################
 We do not have to liftOver wgEncodeGencode* tables.
 They are automatically updated by some process,
 yet neither hg19 nor hg38 have Gencode data on their non-initial alts,
 so we do not have to do it either.
 
 
 ##############################################################################
 # PUSH TABLES and FILES
 
 
 Use this RM for the push.
  https://redmine.soe.ucsc.edu/issues/25045
 We do not use my qaPushQ tool anymore.
 
 Note the downloads should be getting synced by QA.
 Include these download files for qa pushing:
 rsync
 hgwdev:/data/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/
 to
 hgdownload:/home/qateam/htdocs/goldenPath/mm10/bigZips/
 
 push to hgnfs1 or beta (and rr)?
 /gbdb/mm10/mm10.2bit
 
 mm10 tables to push:
 
 gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene
 
 gc5Base   # may have some dependent files or .bw that should get pushed with it?
    /gbdb/mm10/bbi/gc5Base.bw 
     push to hgnfs1 or beta (and rr)?
 
 chromInfo
 chromAlias
 cytoBandIdeo
 altLocations
 fixLocations
 altSeqLiftOverPsl
 fixSeqLiftOverPsl
 
 other files, not download?
 
 combined cCREs track lifted
 /gbdb/mm10/encode3/ccre/encodeCcreCombined.bb
 
 mm10.2bit # have on download, # TODO blat new version to update
 mm10.altsAndFixes.p6
  add to startBlat.pl:  -perSeqMax=mm10.altsAndFixes
 
 
 ##############################################################################