ab2397a17e14b1ba1c9deb7481eadfc08d58b966
galt
  Fri Jul 2 11:32:20 2021 -0700
mm10.2bit and tables on RR, update GRCIncident db

diff --git src/hg/makeDb/doc/mm10.patchUpdate.6.txt src/hg/makeDb/doc/mm10.patchUpdate.6.txt
index 8018b6b..3e3f35e 100644
--- src/hg/makeDb/doc/mm10.patchUpdate.6.txt
+++ src/hg/makeDb/doc/mm10.patchUpdate.6.txt
@@ -1,661 +1,661 @@
 # for emacs: -*- mode: sh; -*-
 
 # This file describes how mm10 was extended with patch sequences and annotations from grcM38P6
 
 ##############################################################################
 # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2021-04-08 - Galt)
 
 
     cd /hive/data/genomes/mm10
     # main 2bit
     time faToTwoBit <(twoBitToFa mm10.2bit stdout) \
            <(twoBitToFa /hive/data/genomes/grcM38P6/grcM38P6.2bit stdout) \
            mm10.p6.2bit
 #real    1m52.859s
 
     # unmasked 2bit
     time twoBitMask -type=.bed mm10.p6.2bit /dev/null mm10.p6.unmasked.2bit
 #real    0m3.104s
 
     # chrom.sizes
     sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcM38P6/chrom.sizes > chrom.sizes.p6
     # chromInfo
     cd /hive/data/genomes/mm10/bed/chromInfo
     awk '{print $1 "\t" $2 "\t/gbdb/mm10/mm10.2bit";}' ../../chrom.sizes.p6 \
       > chromInfo.p6.tab
     wc -l chromInfo*.tab
 #  239 chromInfo.p6.tab
 #   66 chromInfo.tab
 
 
     # Install
     cd /hive/data/genomes/mm10
 
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv mm10.2bit mm10.initial.2bit
     mv mm10.unmasked.2bit mm10.initial.unmasked.2bit
     mv chrom.sizes chrom.sizes.initial
     # End of first-update-only stuff
 
     ln -sf mm10.p6.2bit mm10.2bit
     ln -sf mm10.p6.unmasked.2bit mm10.unmasked.2bit
     ln -sf chrom.sizes.p6 chrom.sizes
 
     cd /hive/data/genomes/mm10/bed/chromInfo
     hgLoadSqlTab mm10 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql chromInfo.p6.tab
 
 
 ##############################################################################
 # Extend main database tables for fileless tracks (DONE - 2021-04-08 - Galt)
     # Just add the patch table rows to the main database tables
     for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do
       echo $table
       hgsql mm10 -e "insert into mm10.$table select * from grcM38P6.$table"
     done
 
 
 ##############################################################################
 # Extend main database gc5BaseBw.bw (DONE - 2021-04-10 - Galt)
 
     cd /hive/data/genomes/mm10/bed/gc5Base/
     # Concatenate original assembly results with grcM38P6 results
     time (zcat mm10.gc5Base.wigVarStep.gz \
         /hive/data/genomes/grcM38P6/bed/gc5Base/grcM38P6.gc5Base.wigVarStep.gz \
       | gzip -c \
       > mm10.p6.gc5Base.wigVarStep.gz)
 #real    5m33.429s
 
     # Make a new gc5BaseBw.bw
     time wigToBigWig mm10.p6.gc5Base.wigVarStep.gz ../../chrom.sizes.p6 \
       mm10.p6.gc5Base.bw
 #real    11m51.723s
 
     # Install
     cd /hive/data/genomes/mm10/bed/gc5Base/
 
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv mm10.gc5Base.wigVarStep.gz mm10.initial.gc5Base.wigVarStep.gz
     mv mm10.gc5Base.bw mm10.initial.gc5Base.bw
     # Not used since bigWig makes wiggle obsolete, but set aside.
     mv mm10.gc5Base.wib mm10.initial.gc5Base.wib
     mv mm10.gc5Base.wig.gz mm10.initial.gc5Base.wig.gz
     # The .wib and .wig.gz are obsolete, remove them from the downloads.
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wib
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wig.gz
     # End of first-update-only stuff
 
     ln -sf mm10.p6.gc5Base.wigVarStep.gz mm10.gc5Base.wigVarStep.gz
     ln -sf mm10.p6.gc5Base.bw mm10.gc5Base.bw
 
     # Because of this symlink, browser track gc5BaseBw has been automatically updated:
     # /gbdb/mm10/bbi/gc5Base.bw -> /cluster/data/mm10/bed/gc5Base/mm10.gc5Base.bw
 
 
 
 
 ##############################################################################
 # Extend main database download files (DONE - 2021-04-19 - Galt)
 
     cd /hive/data/genomes/mm10/goldenPath/bigZips
 
 
     # FIRST TIME ONLY SECTION
     # mm10 was made so long ago that several things are missing from downloads.
 
     # mm10.agp.gz was missing, so here it is.
     cat /hive/data/genomes/mm10/mm10.agp | gzip -c > mm10.agp.gz
     # link the agp to downloads
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.agp.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.agp.gz
 
     # mm10.fa.gz was missing, so here it is.
     twoBitToFa ../../mm10.initial.2bit stdout \
     | gzip -c > mm10.fa.gz
 
     # mm10.fa.masked.gz was missing, so here it is.
     twoBitToFa ../../mm10.initial.2bit stdout \
     | maskOutFa stdin hard stdout \
     | gzip -c > mm10.fa.masked.gz
     # link the fa.masked.gz to downloads
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.masked.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.masked.gz
 
     # mm10.fa.out.gz RepeatMasker .out was missing:
     rm -rf out && mkdir out && cd out
     tar xvzf ../chromOut.tar.gz
     head -3 1/chr1.fa.out > ../mm10.fa.out
     for f in */*.fa.out; do
       tail -n +4 $f >> ../mm10.fa.out
     done
     gzip ../mm10.fa.out 
     cd ..
     rm -r out
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.out.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.out.gz
 
     # mm10.trf.bed.gz TRF output was missing:
     # Clear leftovers from any earlier attempt first: the loop below appends (>>)
     # to mm10.trf.bed, so a stale copy would end up with duplicated rows.
     rm -rf trfMaskChrom
     rm -f mm10.trf.bed
     tar xvzf chromTrf.tar.gz
     cd trfMaskChrom
     # Concatenate every extracted .bed file into a single file one level up.
     for f in *.bed; do
       cat $f >> ../mm10.trf.bed
     done
     gzip ../mm10.trf.bed
     cd ..
     rm -r trfMaskChrom
     # link the trf.bed.gz to downloads
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.trf.bed.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.trf.bed.gz
 
     # RepeatMasker .align file was missing:
     ln -s /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz .
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.align.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.align.gz
 
     # END FIRST TIME ONLY SECTION
 
     mkdir p6
     # mm10.2bit and chrom.sizes were already extended above.
     ln -sf /hive/data/genomes/mm10/mm10.p6.2bit p6/
     ln -sf /hive/data/genomes/mm10/chrom.sizes.p6 p6/mm10.p6.chrom.sizes
 
     # AGP:
     zcat mm10.agp.gz \
          /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz \
     | grep -v ^# \
     | gzip -c > p6/mm10.p6.agp.gz
 
     # FASTA
     twoBitToFa ../../mm10.p6.2bit stdout \
     | gzip -c > p6/mm10.p6.fa.gz
 
     twoBitToFa ../../mm10.p6.2bit stdout \
     | maskOutFa stdin hard stdout \
     | gzip -c > p6/mm10.p6.fa.masked.gz
 
 
     # RepeatMasker (don't include header of patch file):
     cat <(zcat mm10.fa.out.gz) \
         <(zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz | tail -n +4) \
     | gzip -c > p6/mm10.p6.fa.out.gz
 
     # SimpleRepeats/TRF:
     zcat mm10.trf.bed.gz \
          /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz \
     | gzip -c > p6/mm10.p6.trf.bed.gz
     # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase:
     zcat mm10.trf.bed.gz | cut -f 1 | uniq | wc -l
 #62
     zcat p6/mm10.p6.trf.bed.gz | cut -f 1 | uniq | wc -l
 #235
 
     # mm10 also has download files with the old tar-bundle structure -- update those too.
     # Per-chrom AGP:
     rm -rf agp && mkdir agp && cd agp
     tar xvzf ../chromAgp.tar.gz
 
     splitFileByColumn -chromDirs -ending=.agp \
       /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz .
     tar cvzf ../p6/mm10.p6.chromAgp.tar.gz *
     cd ..
     rm -r agp
 
     # Per-chrom soft-masked FASTA:
     rm -rf chroms && mkdir chroms && cd chroms
     tar xvzf ../chromFa.tar.gz
     cd ..
     faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.gz chroms/
     ls -1 chroms | wc -l
 #239
     tar cvzf p6/mm10.p6.chromFa.tar.gz ./chroms
     rm -r chroms
 
     # Per-chrom hard-masked FASTA:
     rm -rf maskedChroms && mkdir maskedChroms && cd maskedChroms
     tar xvzf ../chromFaMasked.tar.gz
     cd ..
     faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.masked.gz maskedChroms/
     cd maskedChroms
     for f in *.fa; do
       mv $f $f.masked
     done
     cd ..
     ls -1 maskedChroms | wc -l
 #239
     tar cvzf p6/mm10.p6.chromFaMasked.tar.gz ./maskedChroms
     rm -r maskedChroms
 
     # Per-chrom RepeatMasker .out:
     rm -rf out && mkdir out && cd out
     tar xvzf ../chromOut.tar.gz
     zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \
     | head -3 > RepeatMaskerHeader.txt
     zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \
     | tail -n +4 \
     | splitFileByColumn -col=5 -chromDirs -head=RepeatMaskerHeader.txt -ending=.out \
       stdin .
     rm RepeatMaskerHeader.txt
     tar cvzf ../p6/mm10.p6.chromOut.tar.gz *
     cd ..
     rm -r out
 
     # Per-chrom TRF output:
     rm -rf trfMaskChrom
     tar xvzf chromTrf.tar.gz
     cd trfMaskChrom
     splitFileByColumn -ending=.bed \
       /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz .
     cd ..
     tar cvzf p6/mm10.p6.chromTrf.tar.gz ./trfMaskChrom
     rm -rf trfMaskChrom
 
     # RepeatMasker .align files:
     zcat mm10.fa.align.gz /hive/data/genomes/grcM38P6/bed/repeatMasker/grcM38P6.fa.align.gz \
     | gzip -c > p6/mm10.p6.fa.align.gz
 
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.bw p6/mm10.p6.gc5Base.bw
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.wigVarStep.gz p6/mm10.p6.gc5Base.wigVarStep.gz
 
     # TODO: regenerate upstream* files for p6
           # note skipping since these were never updated for hg19 and hg38.
 
     # Make new md5sum.txt
     cd p6
     md5sum mm10.* > md5sum.txt
 
     # p6 is now the latest
     # Update latest subdir
     cd /hive/data/genomes/mm10/goldenPath/bigZips
     mv latest latest.bak
     mkdir latest
     cd latest
     for f in ../p6/*; do
       noV=$(basename $(echo $f | sed -re 's/\.p6//;'))
       ln -s $f $noV
     done
     rm md5sum.txt
     cat ../p6/md5sum.txt | sed -e 's/\.p6//;' > md5sum.txt
     echo "GRCm38.p6" > LATEST_VERSION
     cd ..
     rm -rf latest.bak
 
 
     # Install
     cd /hive/data/genomes/mm10/goldenPath/bigZips
 
     # For the first update only, move initial release files to initial/.  Don't do this next update!
     mkdir initial
     mv chrom* mm10.* up* md5sum.txt initial/
     ln -sf /hive/data/genomes/mm10/mm10.initial.2bit initial/mm10.2bit
     ln -sf /hive/data/genomes/mm10/chrom.sizes.initial initial/mm10.chrom.sizes
     ln -sf /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz initial/mm10.fa.align.gz
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.bw initial/mm10.gc5Base.bw
     ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.wigVarStep.gz initial/mm10.gc5Base.wigVarStep.gz
     # Make new md5sum.txt  # since we created many new files
     cd initial
     md5sum chrom* mm10.* up* > md5sum.txt
     cd ..
     # Replace top-level files with links to files
     ln -sf initial/* .
     # End of first-update-only stuff
 
     # Edit README.txt
     cp README.txt README.txt.1
     vi README.txt
 
     # Update /htdocs-hgdownload files with links
     ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/
 
     rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/README.txt.*
 
     # TODO: /hive/data/genomes/mm10/goldenPath/chromosomes/
           # note skipping since these were never updated for hg19 and hg38.
 
 
 
 #############################################################################
 # NO analysisSet
 #   Because NCBI defines the analysis sets for full and no_alts for hg19 and hg38,
 #   yet no such sets exist for mouse mm10.p6 or mm39, there is no need for the analysis set.
 #############################################################################
 
 
 
 #############################################################################
 # Build perSeqMax file for gfServer (hgBlat) (DONE 2021-04-20 Galt)
     # When the blat server is restarted with the updated mm10.2bit file,
     # mm10.altsAndFixes needs to be copied over along with the new mm10.2bit file,
     # and gfServer needs to be restarted with -perSeqMax=mm10.altsAndFixes.
     cd /hive/data/genomes/mm10
     cut -f 1 chrom.sizes.p6 \
     | grep -E '_(alt|fix|hap.*)$' \
     | sed -re 's/^/mm10.2bit:/;' \
       > mm10.altsAndFixes.p6
     # Link for blat server installation convenience:
     ln -sf mm10.altsAndFixes.p6 altsAndFixes
 
 
 #############################################################################
 # Extend cytoBandIdeo (DONE 2021-04-20 Galt)
     cd /hive/data/genomes/mm10/bed/cytoband
     tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcM38P6/chrom.sizes \
       > cytoBand.p6.tab
     hgLoadSqlTab -oldTable mm10 cytoBandIdeo - cytoBand.p6.tab
 
 
 #########################################################################
 # Regenerate idKeys with extended mm10 (DONE - 2021-04-20 - Galt)
     mkdir /hive/data/genomes/mm10/bed/idKeys.p6
     cd /hive/data/genomes/mm10/bed/idKeys.p6
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \
       -twoBit=/hive/data/genomes/mm10/mm10.p6.unmasked.2bit \
       -bigClusterHub=ku -smallClusterHub=ku \
         -buildDir=`pwd` mm10) > do.log 2>&1 &
     tail -f do.log
 #real    0m53.546s
     cat mm10.keySignature.txt
 #b0ae7eaccca6031259f2c64be217338f
 
     # Install
     # For the first update only, move initial release files to .initial.  Don't do this next update!
     mv /hive/data/genomes/mm10/bed/idKeys{,.initial}
 
     cd /hive/data/genomes/mm10/bed/
     rm -f idKeys
     ln -s idKeys.p6 idKeys
 
 
 ##############################################################################
 # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 2021-04-21 Galt)
 
     # need to have idKeys for the genbank and refseq assemblies:
     mkdir -p /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6
     cd /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6
 
     # NOTE genbank subversion is .8 but refseq subversion is .26
 
     ln -s /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_genomic.fna.gz .
     faToTwoBit GCA_000001635.8_GRCm38.p6_genomic.fna.gz genbankP6.2bit
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP6.2bit \
         -bigClusterHub=ku -smallClusterHub=ku \
         genbankP6) > do.log 2>&1
 #real    0m58.734s
 
     mkdir /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6
     cd /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6
     ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz ./
     faToTwoBit GCF_000001635.26_GRCm38.p6_genomic.fna.gz refseqP6.2bit
     time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP6.2bit \
         -bigClusterHub=ku -smallClusterHub=ku \
         refseqP6) > do.log 2>&1
 #real    0m55.019s
 
     # with the three idKeys available, join them to make the table bed files:
     cd /hive/data/genomes/mm10/bed/ucscToINSDC
     sed -re 's/gi\|[0-9]+\|gb\|([A-Z0-9.]+)\|/\1/' genbankP6/genbankP6.idKeys.txt \
     | join -t$'\t' ../idKeys/mm10.idKeys.txt - \
     | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
     | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
     | sort -k1,1 -k2,2n > ucscToINSDC.p6.bed
 
     join -t$'\t' ../idKeys/mm10.idKeys.txt refseqP6/refseqP6.idKeys.txt \
     | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \
     | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \
     | sort -k1,1 -k2,2n > ucscToRefSeq.p6.bed
 
     # loading tables:
     export db=mm10
 
     export chrSize=`cut -f1 ucscToINSDC.p6.bed | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
     | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p6.bed
 
     export chrSize=`cut -f1 ucscToRefSeq.p6.bed | awk '{print length($0)}' | sort -n | tail -1`
     sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \
     | sed -e 's/INSDC/RefSeq/g;' \
     | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p6.bed
 
     # must be exactly 100% coverage
     featureBits -countGaps ${db} ucscToINSDC
 #2818974548 bases of 2818974548 (100.000%) in intersection
 
     # except for chrM (no refSeq):
     featureBits -countGaps ${db} ucscToRefSeq
 #2818974548 bases of 2818974548 (100.000%) in intersection
 
     # construct chromAlias:
     cd /hive/data/genomes/mm10/bed/chromAlias
     hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \
     | sort -k1,1 > ucsc.refseq.p6.tab
     hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \
     | sort -k1,1 > ucsc.genbank.p6.tab
     # add NCBI sequence names from assembly report
     grep -v ^# \
       /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_report.txt \
     | tawk '{print $5, $1;}' | sort \
       > genbankToAssembly.txt
     tawk '{print $2, $1;}' ucsc.genbank.p6.tab | sort \
     | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \
     | sort -k1,1 > ucsc.assembly.p6.tab
 
     ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.p6.tab \
     | sed -re 's/\.p6//;' \
         > ${db}.chromAlias.p6.tab
 
     # verify all there:
     for t in refseq genbank assembly
 do
   c0=`cat ucsc.$t.p6.tab | wc -l`
   c1=`grep $t mm10.chromAlias.p6.tab | wc -l`
   ok="OK"
   if [ "$c0" -ne "$c1" ]; then
      ok="ERROR"
   fi
   printf "# checking $t: $c0 =? $c1 $ok\n"
 done
 # checking refseq: 239 =? 239 OK
 # checking genbank: 239 =? 239 OK
 # checking assembly: 239 =? 239 OK
 
     hgLoadSqlTab mm10 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.p6.tab
 
 
 ############################################################################
 # altLocations and fixLocations (DONE - 2021-04-21 - Galt)
 
     # indicate corresponding locations between haplotypes and reference
 
     mkdir /hive/data/genomes/mm10/bed/altLocations.p6
     cd /hive/data/genomes/mm10/bed/altLocations.p6
 
     # NOTE below the ALT_* directories ONLY appear if the initial genome release had Alt haps.  mm10 did NOT.
     #  {ALT_*,PATCHES}
 
     # NOTE I modified this one to treat mm10 like hg19, i.e. haplotype IDs have no subversion e.g. v1, v2, v3 ...
     # I also added some code to handle the mouse scaffold MMCHR pattern. Also, hg19 uses lower case ids, but mm10 does not.
     ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl 
     # I committed and pushed my changes.
 
     ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \
       -db=mm10 \
       /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alt_scaffold_placement.txt \
     | sort -k1,1 -k2n,2n \
       > altAndFixLocations.bed
     wc -l altAndFixLocations.bed
 #148 altAndFixLocations.bed
     grep _alt altAndFixLocations.bed > altLocations.bed
     grep _fix altAndFixLocations.bed > fixLocations.bed
     hgLoadBed mm10 altLocations{,.bed}
 #Read 20 elements of size 4 from altLocations.bed
     hgLoadBed mm10 fixLocations{,.bed}
 #Read 130 elements of size 4 from fixLocations.bed
     featureBits -countGaps mm10 altLocations
 #6171331 bases of 2818974548 (0.219%) in intersection
     featureBits -countGaps mm10 fixLocations
 #45238554 bases of 2818974548 (1.605%) in intersection
 
 #############################################################################
 # Check for new chrX alts/patches to add to par 
 # The mouse PAR is not as easy to characterize as the human PARs, skipping.
 
 
 ##############################################################################
 # altSeqLiftOver (DONE 2021-04-23 Galt)
     mkdir /hive/data/genomes/mm10/bed/altSeqLiftOver.p6
     cd /hive/data/genomes/mm10/bed/altSeqLiftOver.p6
     # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names
     # Each line of gbToUcsc.sed is one sed rule of the form s@<alias>@<chrom>@;
     hgsql mm10 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \
     | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed
     # Start from an empty output file; the loop below appends (>>) to it.
     cp /dev/null altToChrom.noScore.psl
     # Convert each alignment .gff under PATCHES/alt_scaffolds/alignments to PSL.
     for f in /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alignments/*.gff; do
       # Turn the .gff basename into an alternation regex by replacing each '_' with '|'
       # (presumably the basename is accessions joined by '_' -- confirm against the file names).
       e=$(basename $f .gff | sed -e 's/_/|/g;')
       # Select only the substitution rules whose lines match that regex.
       s=$(grep -E $e gbToUcsc.sed)
       # Apply the selected rules to the .gff, convert to PSL, and append the result.
       sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \
         | pslPosTarget stdin stdout \
         >> altToChrom.noScore.psl
     done
     pslCheck altToChrom.noScore.psl
 #checked: 90 failed: 0 errors: 0
     time pslRecalcMatch altToChrom.noScore.psl ../../mm10.2bit{,} altToChrom.psl
 #real    0m35.138s
     pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl
     sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \
       > altAndPatches.psl
     grep _alt altAndPatches.psl > altSeqLiftOver.psl
     grep _fix altAndPatches.psl > fixSeqLiftOver.psl
 
 
     # Load tables
     hgLoadPsl mm10 -table=altSeqLiftOverPsl altSeqLiftOver.psl
     hgLoadPsl mm10 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl
 
     # Make chrom-to-alt PSL file for genbank process.
     ln -f -s `pwd`/chromToAlt.psl \
       /hive/data/genomes/mm10/jkStuff/mm10.p6.alt.psl
 
     # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences
     # Exclude alts that were already in mm10 before p6.
     cut -f 1 ../../chrom.sizes.initial | grep _ \
     | grep -vwf - chromToAlt.psl \
     | pslToChain stdin stdout \
     | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToPatch.p6.over.chain
     grep chain ../../jkStuff/mm10.mainToPatch.p6.over.chain | wc -l
 #90
 
     # also make a liftOver that includes the original alts, for tracks that have
     # annotations only on main chromosomes.  Exclude alt-to-fix alignments.
     # This is necessary only for the first time we add a patch update.
     awk '($14 !~ /_/)' chromToAlt.psl \
     | pslToChain stdin stdout \
     | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain
     grep chain ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain | wc -l
 #88
 
 
 #########################################################################
 # ncbiRefSeq.p6 Genes (DONE - 2021-04-22 - Galt)
 
     mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22
     cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22
 
     time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \
       GCF_000001635.26_GRCm38.p6 mm10) > do.log 2>&1 &
     tail -f do.log
 # *** All done !  Elapsed time: 5m14s
 #real    11m4.338s
 
     cat fb.ncbiRefSeq.mm10.txt
 #123082179 bases of 2739603606 (4.493%) in intersection
 
 ##############################################################################
 # Extend ENCODE Registry of Candidate cis-Regulatory Elements (DONE - 2021-05-26 - Galt)
 #
 #
 # From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab)
 # Data contacts:  Henry Pratt, Jill Moore, Zhiping Weng PI
 #
 # RM #24668
 #
 #   http://gcp.wenglab.org/hubs/integrative1/data/mm10/cta/mm10-ccres.bigbed
 # for Scores
 #   https://users.wenglab.org/moorej3/mouse-maxz-dnase.txt.gz  
 
 cd /hive/data/outside/encode3/ccre/mouse
 
 f=encodeCcreCombined
 lib=~/kent/src/hg/lib
 
 # input is $f.bed
 liftOver -tab -multiple -bedPlus=9 -noSerial $f.bed \
   /hive/data/genomes/mm10/jkStuff/mm10.mainToAllAltPatch.p6.over.chain \
   $f.p6.bed /dev/null
 sort -k1,1 -k2n,2n $f.bed $f.p6.bed > $f.plusP6.bed
 
 bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.plusP6.bed /hive/data/genomes/mm10/chrom.sizes $f.bb
 
 ln -fs `pwd`/$f.bb /gbdb/mm10/encode3/ccre
 
 
 
 ##############################################################################
 # Extend Single Cell RNA-Seq Gene Expression from Tabula Muris  (DONE - 2021-06-15 - Galt)
 #
 # REJECTED this is Max's track and he did not think it is worth the effort.
 
 
 ##############################################################################
-# GRC Incident Database (TODO - 2021-06-15 - Galt)
+# GRC Incident Database (DONE - 2021-06-15 - Galt)
 
     # Wait until the updated mm10 files have been pushed to RR because GRC Incident update is
     # automated.  Then update the file used to map GRC's RefSeq accessions to our names:
     hgsql mm10 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \
       > /hive/data/outside/grc/incidentDb/GRCm38/refSeq.chromNames.tab
 
 
 ##############################################################################
 We do not have to liftOver wgEncodeGencode* tables.
 They are automatically updated by some process,
 yet neither hg19 nor hg38 have Gencode data on their non-initial alts,
 so we do not have to do it either.
 
 
 ##############################################################################
 # PUSH TABLES and FILES
 
 
 Use this RM for the push.
  https://redmine.soe.ucsc.edu/issues/25045
 We do not use my qaPushQ tool anymore.
 
 Note the downloads should be getting synced by QA.
 Include these download files for qa pushing:
 rsync
 hgwdev:/data/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/
 to
 hgdownload:/home/qateam/htdocs/goldenPath/mm10/bigZips/
 
 push to hgnfs1 or beta (and rr)?
 /gbdb/mm10/mm10.2bit
 
 mm10 tables to push:
 
 gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene
 
 gc5Base   # may have some dependent files or .bw that should get pushed with it?
    /gbdb/mm10/bbi/gc5Base.bw 
     push to hgnfs1 or beta (and rr)?
 
 chromInfo
 chromAlias
 cytoBandIdeo
 altLocations
 fixLocations
 altSeqLiftOverPsl
 fixSeqLiftOverPsl
 
 other files, not download?
 
 combined cCREs track lifted
 /gbdb/mm10/encode3/ccre/encodeCcreCombined.bb
 
 mm10.2bit # have on download, # TODO blat new version to update
 mm10.altsAndFixes.p6
  add to startBlat.pl:  -perSeqMax=mm10.altsAndFixes
 
 
 ##############################################################################