dac1d9e2d193f587f633dcfb619103d8dd86d47c galt Fri Jul 2 13:05:28 2021 -0700 adding text to mm10 description.html about the inclusion of alternate scaffolds on other mouse strains, part of the NCBI mm38 release. diff --git src/hg/makeDb/doc/mm10.patchUpdate.6.txt src/hg/makeDb/doc/mm10.patchUpdate.6.txt index 7a9beb9..d4201ac 100644 --- src/hg/makeDb/doc/mm10.patchUpdate.6.txt +++ src/hg/makeDb/doc/mm10.patchUpdate.6.txt @@ -1,684 +1,702 @@ # for emacs: -*- mode: sh; -*- # This file describes how mm10 was extended with patch sequences and annotations from grcM38P6 ALTS FROM OTHER MOUSE STRAINS IN NCBI RELEASE CONSIDERATIONS The original NBCI release grcM38 (which we used for the initial mm10 release) -had dozens alt-scaffolds on 14 mouse strains. Whoever did that assembly manually removed -those sequences from other strains. When we ran the patch6, we went back to the NCBI source -and ran our standard build tools. We did not realize that 99 out of 108 scaffolds were +has dozens of alt-scaffolds on 14 mouse strains. Whoever did that assembly manually removed +those sequences from other strains from the mm10 ucsc initial release. +When we ran the patch6, we went back to the NCBI source +and ran our standard assembly build pipeline. We did not realize that 99 out of 108 scaffolds were from the other 14 strains. It did have 9 alt-scaffolds for the native strain C57BL/6J too. We did not catch the issue until too late when QA had pushed already and we received message from a researcher. Since it would be a lot of work to go back and re-do all the patch6 without the extra mouse strain alts, we have decided to proceed. We have updated README and the mm10 main html page to reflect these changes and note the additional non-native strain sequences that appear in patch 6 release. This can be justified since we are having to deal with alt scaffolds anyway -in our increasingly complex world, and this makes our release more similar to NCBIs. 
+in our increasingly complex world, and this makes our release more similar to NCBI's release. Those alt-scaffolds were chosen because they can be useful, e.g. genes from other strains that have important medical research. Currently we have no table to map the alts to their respective strains, but it is easy to tell the native from non-native alts since all the IDS for the native C57BL/6J alts have the letter K in them. Our convention is that the ID follows the chrom they are located on, so the native alts look like chrN_KK* or chrN_KZ*. +ALTs that are native to C57BL/6J the strain used for mm10. +[mm10]> select * from chromInfo where chrom like "%_K%_alt"; ++--------------------+--------+----------------------+ +| chrom | size | fileName | ++--------------------+--------+----------------------+ +| chr1_KK082441_alt | 456798 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289080_alt | 394982 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289074_alt | 394026 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289078_alt | 390920 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289081_alt | 369973 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289079_alt | 368967 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289075_alt | 322221 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289073_alt | 215264 | /gbdb/mm10/mm10.2bit | +| chr11_KZ289077_alt | 186144 | /gbdb/mm10/mm10.2bit | ++--------------------+--------+----------------------+ + + ############################################################################## # Extend main database 2bit, chrom.sizes, chromInfo (DONE - 2021-04-08 - Galt) cd /hive/data/genomes/mm10 # main 2bit time faToTwoBit <(twoBitToFa mm10.2bit stdout) \ <(twoBitToFa /hive/data/genomes/grcM38P6/grcM38P6.2bit stdout) \ mm10.p6.2bit #real 1m52.859s # unmasked 2bit time twoBitMask -type=.bed mm10.p6.2bit /dev/null mm10.p6.unmasked.2bit #real 0m3.104s # chrom.sizes sort -k2nr,2nr chrom.sizes /hive/data/genomes/grcM38P6/chrom.sizes > chrom.sizes.p6 # chromInfo cd /hive/data/genomes/mm10/bed/chromInfo awk '{print $1 "\t" $2 
"\t/gbdb/mm10/mm10.2bit";}' ../../chrom.sizes.p6 \ > chromInfo.p6.tab wc -l chromInfo*.tab # 239 chromInfo.p6.tab # 66 chromInfo.tab # Install cd /hive/data/genomes/mm10 # For the first update only, move initial release files to .initial. Don't do this next update! mv mm10.2bit mm10.initial.2bit mv mm10.unmasked.2bit mm10.initial.unmasked.2bit mv chrom.sizes chrom.sizes.initial # End of first-update-only stuff ln -sf mm10.p6.2bit mm10.2bit ln -sf mm10.p6.unmasked.2bit mm10.unmasked.2bit ln -sf chrom.sizes.p6 chrom.sizes cd /hive/data/genomes/mm10/bed/chromInfo hgLoadSqlTab mm10 chromInfo $HOME/kent/src/hg/lib/chromInfo.sql chromInfo.p6.tab ############################################################################## # Extend main database tables for fileless tracks (DONE - 2021-04-08 - Galt) # Just add the patch table rows to the main database tables for table in gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene; do echo $table hgsql mm10 -e "insert into mm10.$table select * from grcM38P6.$table" done ############################################################################## # Extend main database gc5BaseBw.bw (DONE - 2021-04-10 - Galt) cd /hive/data/genomes/mm10/bed/gc5Base/ # Concatenate original assembly results with grcM38P6 results time (zcat mm10.gc5Base.wigVarStep.gz \ /hive/data/genomes/grcM38P6/bed/gc5Base/grcM38P6.gc5Base.wigVarStep.gz \ | gzip -c \ > mm10.p6.gc5Base.wigVarStep.gz) #real 5m33.429s # Make a new gc5BaseBw.bw time wigToBigWig mm10.p6.gc5Base.wigVarStep.gz ../../chrom.sizes.p6 \ mm10.p6.gc5Base.bw #real 11m51.723s # Install cd /hive/data/genomes/mm10/bed/gc5Base/ # For the first update only, move initial release files to .initial. Don't do this next update! mv mm10.gc5Base.wigVarStep.gz mm10.initial.gc5Base.wigVarStep.gz mv mm10.gc5Base.bw mm10.initial.gc5Base.bw # Not used since bigWig makes wiggle obsolete, but set aside. 
mv mm10.gc5Base.wib mm10.initial.gc5Base.wib mv mm10.gc5Base.wig.gz mm10.initial.gc5Base.wig.gz # The .wib and .wig.gz are obsolete, remove them from the downloads. rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wib rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.gc5Base.wig.gz # End of first-update-only stuff ln -sf mm10.p6.gc5Base.wigVarStep.gz mm10.gc5Base.wigVarStep.gz ln -sf mm10.p6.gc5Base.bw mm10.gc5Base.bw # Because of this symlink, browser track gc5BaseBw has been automatically updated: # /gbdb/mm10/bbi/gc5Base.bw -> /cluster/data/mm10/bed/gc5Base/mm10.gc5Base.bw ############################################################################## # Extend main database download files (DONE - 2021-04-19 - Galt) cd /hive/data/genomes/mm10/goldenPath/bigZips # FIRST TIME ONLY SECTION # mm10 was made so long ago that several things are missing from downloads. # mm10.agp.gz was missing, so here it is. cat /hive/data/genomes/mm10/mm10.agp | gzip -c > mm10.agp.gz # link the agp to downloads ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.agp.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.agp.gz # mm10.fa.gz was missing, so here it is. twoBitToFa ../../mm10.initial.2bit stdout \ | gzip -c > mm10.fa.gz # mm10.fa.masked.gz was missing, so here it is. twoBitToFa ../../mm10.initial.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > mm10.fa.masked.gz # link the fa.masked.gz to downloads ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.masked.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.masked.gz # mm10.fa.out.gz RepeatMasker .out was missing: rm -rf out && mkdir out && cd out tar xvzf ../chromOut.tar.gz head -3 1/chr1.fa.out > ../mm10.fa.out for f in */*.fa.out; do tail -n +4 $f >> ../mm10.fa.out done gzip ../mm10.fa.out cd .. 
rm -r out ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.out.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.out.gz # mm10.trf.bed.gz TRF output was missing: rm -rf trfMaskChrom rm -f mm10.trf.bed tar xvzf chromTrf.tar.gz cd trfMaskChrom for f in *.bed; do cat $f >> ../mm10.trf.bed done gzip ../mm10.trf.bed cd .. rm -r trfMaskChrom ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.trf.bed.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.trf.bed.gz # RepeatMasker .align file was missing: ln -s /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz . ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/mm10.fa.align.gz /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/mm10.fa.align.gz # END FIRST TIME ONLY SECTION mkdir p6 # mm10.2bit and chrom.sizes were already extended above. ln -sf /hive/data/genomes/mm10/mm10.p6.2bit p6/ ln -sf /hive/data/genomes/mm10/chrom.sizes.p6 p6/mm10.p6.chrom.sizes # AGP: zcat mm10.agp.gz \ /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz \ | grep -v ^# \ | gzip -c > p6/mm10.p6.agp.gz # FASTA twoBitToFa ../../mm10.p6.2bit stdout \ | gzip -c > p6/mm10.p6.fa.gz twoBitToFa ../../mm10.p6.2bit stdout \ | maskOutFa stdin hard stdout \ | gzip -c > p6/mm10.p6.fa.masked.gz # RepeatMasker (don't include header of patch file): cat <(zcat mm10.fa.out.gz) \ <(zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz | tail -n +4) \ | gzip -c > p6/mm10.p6.fa.out.gz # SimpleRepeats/TRF: zcat mm10.trf.bed.gz \ /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz \ | gzip -c > p6/mm10.p6.trf.bed.gz # We don't expect a complete set of chroms to have simpleRepeats, but at least an increase: zcat mm10.trf.bed.gz | cut -f 1 | uniq | wc -l #62 zcat p6/mm10.p6.trf.bed.gz | cut -f 1 | uniq | wc -l #235 # mm10 also has download files with the old tar-bundle structure -- update those too. 
# Per-chrom AGP: rm -rf agp && mkdir agp && cd agp tar xvzf ../chromAgp.tar.gz splitFileByColumn -chromDirs -ending=.agp \ /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.agp.gz . tar cvzf ../p6/mm10.p6.chromAgp.tar.gz * cd .. rm -r agp # Per-chrom soft-masked FASTA: rm -rf chroms && mkdir chroms && cd chroms tar xvzf ../chromFa.tar.gz cd .. faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.gz chroms/ ls -1 chroms | wc -l #239 tar cvzf p6/mm10.p6.chromFa.tar.gz ./chroms rm -r chroms # Per-chrom hard-masked FASTA: rm -rf maskedChroms && mkdir maskedChroms && cd maskedChroms tar xvzf ../chromFaMasked.tar.gz cd .. faSplit byname /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.masked.gz maskedChroms/ cd maskedChroms for f in *.fa; do mv $f $f.masked done cd .. ls -1 maskedChroms | wc -l #239 tar cvzf p6/mm10.p6.chromFaMasked.tar.gz ./maskedChroms rm -r maskedChroms # Per-chrom RepeatMasker .out: rm -rf out && mkdir out && cd out tar xvzf ../chromOut.tar.gz zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \ | head -3 > RepeatMaskerHeader.txt zcat /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.fa.out.gz \ | tail -n +4 \ | splitFileByColumn -col=5 -chromDirs -head=RepeatMaskerHeader.txt -ending=.out \ stdin . rm RepeatMaskerHeader.txt tar cvzf ../p6/mm10.p6.chromOut.tar.gz * cd .. rm -r out # Per-chrom TRF output: rm -rf trfMaskChrom tar xvzf chromTrf.tar.gz cd trfMaskChrom splitFileByColumn -ending=.bed \ /hive/data/genomes/grcM38P6/goldenPath/bigZips/grcM38P6.trf.bed.gz . cd .. 
tar cvzf p6/mm10.p6.chromTrf.tar.gz ./trfMaskChrom rm -rf trfMaskChrom # RepeatMasker .align files: zcat mm10.fa.align.gz /hive/data/genomes/grcM38P6/bed/repeatMasker/grcM38P6.fa.align.gz \ | gzip -c > p6/mm10.p6.fa.align.gz ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.bw p6/mm10.p6.gc5Base.bw ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.p6.gc5Base.wigVarStep.gz p6/mm10.p6.gc5Base.wigVarStep.gz # TODO: regenerate upstream* files for p6 # note skipping since these were never updated for hg19 and hg38. # Make new md5sum.txt cd p6 md5sum mm10.* > md5sum.txt # p6 is now the latest # Update latest subdir cd /hive/data/genomes/mm10/goldenPath/bigZips mv latest latest.bak mkdir latest cd latest for f in ../p6/*; do noV=$(basename $(echo $f | sed -re 's/\.p6//;')) ln -s $f $noV done rm md5sum.txt cat ../p6/md5sum.txt | sed -e 's/\.p6//;' > md5sum.txt echo "GRCm38.p6" > LATEST_VERSION cd .. rm -rf latest.bak # Install cd /hive/data/genomes/mm10/goldenPath/bigZips # For the first update only, move initial release files to initial/. Don't do this next update! mkdir initial mv chrom* mm10.* up* md5sum.txt initial/ ln -sf /hive/data/genomes/mm10/mm10.initial.2bit initial/mm10.2bit ln -sf /hive/data/genomes/mm10/chrom.sizes.initial initial/mm10.chrom.sizes ln -sf /hive/data/genomes/mm10/bed/repeatMasker/mm10.fa.align.gz initial/mm10.fa.align.gz ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.bw initial/mm10.gc5Base.bw ln -sf /hive/data/genomes/mm10/bed/gc5Base/mm10.initial.gc5Base.wigVarStep.gz initial/mm10.gc5Base.wigVarStep.gz # Make new md5sum.txt # since we created many new files cd initial md5sum chrom* mm10.* up* > md5sum.txt cd .. # Replace top-level files with links to files ln -sf initial/* . 
# End of first-update-only stuff # Edit README.txt cp README.txt README.txt.1 vi README.txt # Update /htdocs-hgdownload files with links ln -sf /hive/data/genomes/mm10/goldenPath/bigZips/* /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/ rm /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/README.txt.* # TODO: /hive/data/genomes/mm10/goldenPath/chromosomes/ # note skipping since these were never updated for hg19 and hg38. ############################################################################# # NO analysisSet # Because NCBI defines the analysis sets for full and no_alts for hg19 and hg38, # yet no such sets exist for mouse mm10.p6 or mm39, there is no need for the analysis set. ############################################################################# ############################################################################# # Build perSeqMax file for gfServer (hgBlat) (DONE 2021-04-20 Galt) # When the blat server is restarted with the updated mm10.2bit file, # mm10.altsAndFixes needs to be copied over along with the new mm10.2bit file, # and gfServer needs to be restarted with -perSeqMax=mm10.altsAndFixes. 
cd /hive/data/genomes/mm10 cut -f 1 chrom.sizes.p6 \ | grep -E '_(alt|fix|hap.*)$' \ | sed -re 's/^/mm10.2bit:/;' \ > mm10.altsAndFixes.p6 # Link for blat server installation convenience: ln -sf mm10.altsAndFixes.p6 altsAndFixes ############################################################################# # Extend cytoBandIdeo (DONE 2021-04-20 Galt) cd /hive/data/genomes/mm10/bed/cytoband tawk '{print $1, 0, $2, "", "gneg";}' /hive/data/genomes/grcM38P6/chrom.sizes \ > cytoBand.p6.tab hgLoadSqlTab -oldTable mm10 cytoBandIdeo - cytoBand.p6.tab ######################################################################### # Regenerate idKeys with extended mm10 (DONE - 2021-04-20 - Galt) mkdir /hive/data/genomes/mm10/bed/idKeys.p6 cd /hive/data/genomes/mm10/bed/idKeys.p6 time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl \ -twoBit=/hive/data/genomes/mm10/mm10.p6.unmasked.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ -buildDir=`pwd` mm10) > do.log 2>&1 & tail -f do.log #real 0m53.546s cat mm10.keySignature.txt #b0ae7eaccca6031259f2c64be217338f # Install # For the first update only, move initial release files to .initial. Don't do this next update! mv /hive/data/genomes/mm10/bed/idKeys{,.initial} cd /hive/data/genomes/mm10/bed/ rm -f idKeys ln -s idKeys.p6 idKeys ############################################################################## # UCSC to RefSeq, INSDC, Assembly; chromAlias (DONE 2021-04-21 Galt) # need to have idKeys for the genbank and refseq assemblies: mkdir -p /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6 cd /hive/data/genomes/mm10/bed/ucscToINSDC/genbankP6 # NOTE genbank subversion is .8 but refseq subversion is .26 ln -s /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_genomic.fna.gz . 
faToTwoBit GCA_000001635.8_GRCm38.p6_genomic.fna.gz genbankP6.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=genbankP6.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ genbankP6) > do.log 2>&1 #real 0m58.734s mkdir /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6 cd /hive/data/genomes/mm10/bed/ucscToINSDC/refseqP6 ln -s /hive/data/outside/ncbi/genomes/refseq/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCF_000001635.26_GRCm38.p6/GCF_000001635.26_GRCm38.p6_genomic.fna.gz ./ faToTwoBit GCF_000001635.26_GRCm38.p6_genomic.fna.gz refseqP6.2bit time ($HOME/kent/src/hg/utils/automation/doIdKeys.pl -buildDir=`pwd` -twoBit=refseqP6.2bit \ -bigClusterHub=ku -smallClusterHub=ku \ refseqP6) > do.log 2>&1 #real 0m55.019s # with the three idKeys available, join them to make the table bed files: cd /hive/data/genomes/mm10/bed/ucscToINSDC sed -re 's/gi\|[0-9]+\|gb\|([A-Z0-9.]+)\|/\1/' genbankP6/genbankP6.idKeys.txt \ | join -t$'\t' ../idKeys/mm10.idKeys.txt - \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.p6.bed join -t$'\t' ../idKeys/mm10.idKeys.txt refseqP6/refseqP6.idKeys.txt \ | cut -f2- | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.p6.bed # loading tables: export db=mm10 export chrSize=`cut -f1 ucscToINSDC.p6.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab ${db} ucscToINSDC stdin ucscToINSDC.p6.bed export chrSize=`cut -f1 ucscToRefSeq.p6.bed | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ${db} ucscToRefSeq stdin ucscToRefSeq.p6.bed # must be exactly 100% coverage featureBits -countGaps ${db} ucscToINSDC #2818974548 bases of 
2818974548 (100.000%) in intersection # except for chrM (no refSeq): featureBits -countGaps ${db} ucscToRefSeq #2818974548 bases of 2818974548 (100.000%) in intersection # construct chromAlias: cd /hive/data/genomes/mm10/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ${db} \ | sort -k1,1 > ucsc.refseq.p6.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ${db} \ | sort -k1,1 > ucsc.genbank.p6.tab # add NCBI sequence names from assembly report grep -v ^# \ /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_report.txt \ | tawk '{print $5, $1;}' | sort \ > genbankToAssembly.txt tawk '{print $2, $1;}' ucsc.genbank.p6.tab | sort \ | join -t$'\t' -o 1.2,2.2 - genbankToAssembly.txt \ | sort -k1,1 > ucsc.assembly.p6.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.p6.tab \ | sed -re 's/\.p6//;' \ > ${db}.chromAlias.p6.tab # verify all there: for t in refseq genbank assembly do c0=`cat ucsc.$t.p6.tab | wc -l` c1=`grep $t mm10.chromAlias.p6.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 239 =? 239 OK # checking genbank: 239 =? 239 OK # checking assembly: 239 =? 239 OK hgLoadSqlTab mm10 chromAlias $HOME/kent/src/hg/lib/chromAlias.sql ${db}.chromAlias.p6.tab ############################################################################ # altLocations and fixLocations (DONE - 2021-04-21 - Galt) # indicate corresponding locations between haplotypes and reference mkdir /hive/data/genomes/mm10/bed/altLocations.p6 cd /hive/data/genomes/mm10/bed/altLocations.p6 # NOTE below the ALT_* directories ONLY appear if the initial genome release had Alt haps. mm10 did NOT. # {ALT_*,PATCHES} # NOTE I modified this one to treat mm10 like hg19, i.e. haplotype IDs have no subversion e.g. v1, v2, v3 ... # I also added some code to handle mouse scaffold MMCHR pattern. Also, hg19 uses lowercase IDs, but not mm10. 
~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl # I committed and pushed my changes. ~/kent/src/hg/utils/automation/altScaffoldPlacementToBed.pl \ -db=mm10 \ /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alt_scaffold_placement.txt \ | sort -k1,1 -k2n,2n \ > altAndFixLocations.bed wc -l altAndFixLocations.bed #148 altAndFixLocations.bed grep _alt altAndFixLocations.bed > altLocations.bed grep _fix altAndFixLocations.bed > fixLocations.bed hgLoadBed mm10 altLocations{,.bed} #Read 20 elements of size 4 from altLocations.bed hgLoadBed mm10 fixLocations{,.bed} #Read 130 elements of size 4 from fixLocations.bed featureBits -countGaps mm10 altLocations #6171331 bases of 2818974548 (0.219%) in intersection featureBits -countGaps mm10 fixLocations #45238554 bases of 2818974548 (1.605%) in intersection ############################################################################# # Check for new chrX alts/patches to add to par # The mouse PAR is not as easy to characterize as the human PARs, skipping. 
############################################################################## # altSeqLiftOver (DONE 2021-04-23 Galt) mkdir /hive/data/genomes/mm10/bed/altSeqLiftOver.p6 cd /hive/data/genomes/mm10/bed/altSeqLiftOver.p6 # Use chromAlias to make a .sed file to substitute Genbank accessions to UCSC names hgsql mm10 -NBe 'select alias,chrom from chromAlias where find_in_set("genbank", source);' \ | awk '{print "s@" $1 "@" $2 "@;";}' > gbToUcsc.sed cp /dev/null altToChrom.noScore.psl for f in /hive/data/genomes/grcM38P6/genbank/GCA_000001635.8_GRCm38.p6_assembly_structure/PATCHES/alt_scaffolds/alignments/*.gff; do e=$(basename $f .gff | sed -e 's/_/|/g;') s=$(grep -E $e gbToUcsc.sed) sed -re "$s" $f | gff3ToPsl ../../chrom.sizes{,} stdin stdout \ | pslPosTarget stdin stdout \ >> altToChrom.noScore.psl done pslCheck altToChrom.noScore.psl #checked: 90 failed: 0 errors: 0 time pslRecalcMatch altToChrom.noScore.psl ../../mm10.2bit{,} altToChrom.psl #real 0m35.138s pslSwap altToChrom.psl stdout | pslPosTarget stdin chromToAlt.psl sort -k14,14 -k16n,16n -k10,10 -k12n,12n altToChrom.psl chromToAlt.psl \ > altAndPatches.psl grep _alt altAndPatches.psl > altSeqLiftOver.psl grep _fix altAndPatches.psl > fixSeqLiftOver.psl # Load tables hgLoadPsl mm10 -table=altSeqLiftOverPsl altSeqLiftOver.psl hgLoadPsl mm10 -table=fixSeqLiftOverPsl fixSeqLiftOver.psl # Make chrom-to-alt PSL file for genbank process. ln -f -s `pwd`/chromToAlt.psl \ /hive/data/genomes/mm10/jkStuff/mm10.p6.alt.psl # Make a liftOver chain file for mapping annotations on main chroms to new patch sequences # Exclude alts that were already in mm10 before p6. 
cut -f 1 ../../chrom.sizes.initial | grep _ \ | grep -vwf - chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToPatch.p6.over.chain grep chain ../../jkStuff/mm10.mainToPatch.p6.over.chain | wc -l #90 # also make a liftOver that includes the original alts, for tracks that have # annotations only on main chromosomes. Exclude alt-to-fix alignments. # This is necessary only for the first time we add a patch update. awk '($14 !~ /_/)' chromToAlt.psl \ | pslToChain stdin stdout \ | chainScore stdin ../../mm10.2bit{,} ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain grep chain ../../jkStuff/mm10.mainToAllAltPatch.p6.over.chain | wc -l #88 ######################################################################### # ncbiRefSeq.p6 Genes (DONE - 2021-04-22 - Galt) mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22 cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p6.2021-04-22 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ GCF_000001635.26_GRCm38.p6 mm10) > do.log 2>&1 & tail -f do.log # *** All done ! 
Elapsed time: 5m14s #real 11m4.338s cat fb.ncbiRefSeq.mm10.txt #123082179 bases of 2739603606 (4.493%) in intersection ############################################################################## # Extend ENCODE Registry of Candidate cis-Regulatory Elements (DONE - 2021-05-26 - Galt) # # # From ENCODE 3 Data Analysis Center at U Mass Med Center (Zlab) # Data contacts: Henry Pratt, Jill Moore, Zhiping Weng PI # # RM #24668 # # http://gcp.wenglab.org/hubs/integrative1/data/mm10/cta/mm10-ccres.bigbed # for Scores # https://users.wenglab.org/moorej3/mouse-maxz-dnase.txt.gz cd /hive/data/outside/encode3/ccre/mouse f=encodeCcreCombined lib=~/kent/src/hg/lib # input is $f.bed liftOver -tab -multiple -bedPlus=9 -noSerial $f.bed \ /hive/data/genomes/mm10/jkStuff/mm10.mainToAllAltPatch.p6.over.chain \ $f.p6.bed /dev/null sort -k1,1 -k2n,2n $f.bed $f.p6.bed > $f.plusP6.bed bedToBigBed -tab -type=bed9+6 -as=$lib/$f.as $f.plusP6.bed /hive/data/genomes/mm10/chrom.sizes $f.bb ln -fs `pwd`/$f.bb /gbdb/mm10/encode3/ccre ############################################################################## # Extend Single Cell RNA-Seq Gene Expression from Tabula Muris (DONE - 2021-06-15 - Galt) # # REJECTED this is Max's track and he did not think it is worth the effort. ############################################################################## # GRC Incident Database (DONE - 2021-06-15 - Galt) # Wait until the updated mm10 files have been pushed to RR because GRC Incident update is # automated. Then update the file used to map GRC's RefSeq accessions to our names: hgsql mm10 -NBe 'select alias,chrom from chromAlias where source = "refseq" order by alias;' \ > /hive/data/outside/grc/incidentDb/GRCm38/refSeq.chromNames.tab ############################################################################## We do not have to liftOver wgEncodeGencode* tables. 
They are automatically updated by some process, yet neither hg19 nor hg38 has Gencode data on their non-initial alts, so we do not have to do it either. ############################################################################## # PUSH TABLES and FILES Use this RM for the push. https://redmine.soe.ucsc.edu/issues/25045 We do not use my qaPushQ tool anymore. Note the downloads should be getting synced by QA. Include these download files for qa pushing: rsync hgwdev:/data/apache/htdocs-hgdownload/goldenPath/mm10/bigZips/ to hgdownload:/home/qateam/htdocs/goldenPath/mm10/bigZips/ push to hgnfs1 or beta (and rr)? /gbdb/mm10/mm10.2bit mm10 tables to push: gap gold rmsk simpleRepeat windowmaskerSdust cpgIslandExt genscan augustusGene gc5Base # may have some dependent files or .bw that should get pushed with it? /gbdb/mm10/bbi/gc5Base.bw push to hgnfs1 or beta (and rr)? chromInfo chromAlias cytoBandIdeo altLocations fixLocations altSeqLiftOverPsl fixSeqLiftOverPsl other files, not download? combined cCREs track lifted /gbdb/mm10/encode3/ccre/encodeCcreCombined.bb mm10.2bit # have on download, # TODO blat new version to update mm10.altsAndFixes.p6 add to startBlat.pl: -perSeqMax=mm10.altsAndFixes ##############################################################################