c6142e28bc43936027fcfa20acdea1cbf6b99209 hiram Mon Jul 27 15:27:42 2020 -0700 begin build of mm39 refs #22271 diff --git src/hg/makeDb/doc/mm39/initialBuild.txt src/hg/makeDb/doc/mm39/initialBuild.txt new file mode 100644 index 0000000..7659dfc --- /dev/null +++ src/hg/makeDb/doc/mm39/initialBuild.txt @@ -0,0 +1,1199 @@ +# for emacs: -*- mode: sh; -*- + +# This file describes browser build for the mm39 +# GCA_000001635.9_GRCm39 + +# Can use existing photograph (otherwise find one before starting here) + +######################################################################### +# Initial steps, reuse existing photograph (DONE - 2020-07-21 - Hiram) + +# To start this initialBuild.txt document, from a previous assembly document: + +mkdir ~/kent/src/hg/makeDb/doc/mm39 +cd ~/kent/src/hg/makeDb/doc/mm39 + +sed -e 's/canFam5/mm39/g; s/CanFam5/Mm39/g; s/DONE/TBD/g;' \ + ../canFam5/initialBuild.txt > initialBuild.txt + +mkdir -p /hive/data/genomes/mm39/genbank +cd /hive/data/genomes/mm39 + +# reuse existing photo from mm10: +cp -p ../mm10/photoReference.txt . 
+ +cat photoReference.txt +photoCreditURL http://www.jax.org/ +photoCreditName Photo courtesy of The Jackson Laboratory + +## download from NCBI +cd /hive/data/genomes/mm39/genbank + +time rsync -L -a -P --stats \ +rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/000/001/635/GCA_000001635.9_GRCm39/ ./ +sent 3,157 bytes received 14,658,551,486 bytes 57,372,033.83 bytes/sec +total size is 14,654,961,664 speedup is 1.00 + +real 4m15.891s + +# this information is from the top of +# mm39/genbank/*_assembly_report.txt +# (aka: mm39/genbank/GCA_000001635.9_GRCm39_assembly_report.txt) + +# Assembly name: GRCm39 +# Description: Genome Reference Consortium Mouse Build 39 +# Organism name: Mus musculus (house mouse) +# Infraspecific name: strain=C57BL/6J +# Taxid: 10090 +# BioProject: PRJNA20689 +# Submitter: Genome Reference Consortium +# Date: 2020-06-24 +# Assembly type: haploid +# Release type: major +# Assembly level: Chromosome +# Genome representation: full +# RefSeq category: Reference Genome +# GenBank assembly accession: GCA_000001635.9 +# +## Assembly-Units: +## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name +## GCA_000000055.3 Primary Assembly (C57BL/6J) +## GCA_000004175.1 non-nuclear + +# check assembly size for later reference: + +faSize G*m39_genomic.fna.gz + +# 2728222451 bases (73600668 N's 2654621783 real 1687364940 upper 967256843 lower) in 61 sequences in 1 files +# Total size: mean 44724958.2 sd 64970951.3 min 1976 (JH584295.1) max 195154279 (CM000994.3) median 182347 +# %35.45 masked total, %36.44 masked real + +# Survey types of gaps: + +zcat *gaps.txt.gz | cut -f5 | sort | uniq -c + 60 between_scaffolds + 20 centromere + 1 gap_type + 21 short_arm + 42 telomere + 23 unknown + 181 within_scaffold + +# And total size in gaps: +zgrep -v "^#" *gaps.txt.gz | awk '{print $3-$2+1}' | ave stdin \ + | sed -e 's/^/# /;' +# Q1 943.000000 +# median 50000.000000 +# Q3 68500.000000 +# average 212105.515850 +# min 10.000000 +# max 2890000.000000 +# count 
347 +# total 73600614.000000 +# standard deviation 667296.516291 + +############################################################################# +# establish config.ra file (DONE - 2020-07-27 - Hiram) + cd /hive/data/genomes/mm39 + ~/kent/src/hg/utils/automation/prepConfig.pl mm39 mammal mouse \ + genbank/*_assembly_report.txt > mm39.config.ra + + # fix commonName: +commonName House mouse +to: +commonName Mouse + # fix orderKey: +orderKey 8694 +to +orderKey 268 + # fix assemblyLabel: +assemblyLabel Genome Reference Consortium +to +assemblyLabel Genome Reference Consortium Mouse Build 39 (GCA_000001635.9) + + # XXX THERE IS NO BIOSAMPLE !!! + + # compare with previous version to see if it is sane: + diff mm39.config.ra ../mm10/mm10.config.ra + + # verify it really does look sane + cat mm39.config.ra +# Config parameters for makeGenomeDb.pl: +db mm39 +clade mammal +scientificName Mus musculus +commonName Mouse +assemblyDate Jun. 2020 +assemblyLabel Genome Reference Consortium Mouse Build 39 (GCA_000001635.9) +assemblyShortLabel GRCm39 +orderKey 269 +# mitochondrial sequence included in refseq release +# mitoAcc AY172335.1 +mitoAcc none +fastaFiles /hive/data/genomes/mm39/ucsc/*.fa.gz +agpFiles /hive/data/genomes/mm39/ucsc/*.agp +# qualFiles none +dbDbSpeciesDir mouse +photoCreditURL http://www.jax.org/ +photoCreditName Photo courtesy of The Jackson Laboratory +ncbiGenomeId 52 +ncbiAssemblyId 7358741 +ncbiAssemblyName GRCm39 +ncbiBioProject 20689 +ncbiBioSample n/a +genBankAccessionID GCA_000001635.9 +taxId 10090 + +############################################################################# +# setup UCSC named files (DONE - 2020-07-25 - Hiram) + + mkdir /hive/data/genomes/mm39/ucsc + cd /hive/data/genomes/mm39/ucsc + + # check for duplicate sequences: + time faToTwoBit -noMask ../genbank/G*m39_genomic.fna.gz genbank.2bit + # real 0m36.427s + + twoBitDup genbank.2bit + # no output is a good result, otherwise, would have to eliminate duplicates + # the scripts 
creating the fasta here will be creating a refseq.2bit file + # to be removed later + + # compare gaps with what the gaps.gz file reported: + twoBitInfo -nBed genbank.2bit genbank.gap.bed + awk '{print $3-$2}' *.gap.bed | ave stdin | sed -e 's/^/# /;' +# Q1 100.000000 +# median 2151.000000 +# Q3 50000.000000 +# average 220361.281437 +# min 1.000000 +# max 3050000.000000 +# count 334 +# total 73600668.000000 +# standard deviation 717517.501122 + + # comparing with above, there are 54 bases here that are not + # counted in the NCBI gaps file. See what the AGP says later on here. + + time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ + ../genbank/G*m39_genomic.fna.gz \ + ../genbank/*_assembly_structure/Primary_Assembly +CM000994.3 chr1 +CM000995.3 chr2 +CM000996.3 chr3 +CM000997.3 chr4 +CM000998.3 chr5 +CM000999.3 chr6 +CM001000.3 chr7 +CM001001.3 chr8 +CM001002.3 chr9 +CM001003.3 chr10 +CM001004.3 chr11 +CM001005.3 chr12 +CM001006.3 chr13 +CM001007.3 chr14 +CM001008.3 chr15 +CM001009.3 chr16 +CM001010.3 chr17 +CM001011.3 chr18 +CM001012.3 chr19 +CM001013.3 chrX +CM001014.3 chrY + +real 11m14.469s + + time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ + ../genbank/*_assembly_structure/Primary_Assembly + # processed 21 sequences into chrUn.fa.gz + real 0m0.276s + + time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ + ../genbank/*_assembly_structure/Primary_Assembly +# 4 +# 1 +# X +# 7 +# Y +# 5 +# processed 18 sequences into chr*_random.gz 6 files + +# real 0m1.466s + + # bash syntax here + mitoAcc=`grep "^# mitoAcc" ../mm39.config.ra | awk '{print $NF}'` + printf "# mitoAcc %s\n" "$mitoAcc" +# mitoAcc AY172335.1 + + zcat \ + ../genbank/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \ + | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp + + cat chrM.agp +# chrM 1 16299 1 O AY172335.1 1 16299 + + + printf ">chrM\n" > chrM.fa + twoBitToFa -noMask genbank.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa + gzip chrM.fa + + 
faSize chrM.fa.gz +# 16299 bases (0 N's 16299 real 16299 upper 0 lower) in 1 sequences in 1 files + + # verify fasta and AGPs agree + time faToTwoBit *.fa.gz test.2bit + # real 0m47.200s + + cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 + # All AGP and FASTA entries agree - both files are valid + + # and no sequence lost from original: + twoBitToFa test.2bit stdout | faSize stdin +# 2728222451 bases (73600668 N's 2654621783 real 2654621783 upper 0 lower) +# in 61 sequences in 1 files +# Total size: mean 44724958.2 sd 64970951.3 min 1976 (chr4_JH584295v1_random) +# max 195154279 (chr1) median 182347 + + # same numbers as above (except for upper/lower masking) +# 2728222451 bases (73600668 N's 2654621783 real 1687364940 upper 967256843 lower) in 61 sequences in 1 files + + # See if the AGP files define all the gaps: + # categories of gaps: + awk '$5 == "N"' *.agp | cut -f7 | sort | uniq -c | sed -e 's/^/# /;' +# 20 centromere +# 60 contig +# 181 scaffold +# 21 short_arm +# 42 telomere + + awk '$5 == "N"' *.agp | awk '{print $3-$2+1}' | ave stdin \ + | sed -e 's/^/# /;' +# Q1 1373.000000 +# median 50000.000000 +# Q3 100000.000000 +# average 227155.228395 +# min 27.000000 +# max 2890000.000000 +# count 324 +# total 73598294.000000 +# standard deviation 688160.252488 + + # From the 2bit sequence, there are 10 more gaps and 2,374 more bases in gap: +# count 334 +# total 73600668.000000 + + # the gaps file defined: +# count 347 +# total 73600614.000000 + + # survey gap types from gap file + # the gaps file defines 23 more gaps than the AGP files, + # the gaps file defines 13 more gaps but 54 less bases than the sequence + # note the 'unknown' types (== 23 gaps) + zgrep -v "^#" ../genbank/*gaps* | cut -f5,6 | sort | uniq -c \ + | sed -e 's/^/# /;' +# 60 between_scaffolds na +# 20 centromere na +# 21 short_arm na +# 42 telomere na +# 4 unknown inferred_from_sequence +# 19 unknown unspecified +# 5 within_scaffold align_genus +# 36 within_scaffold map +# 96 
within_scaffold paired-ends +# 44 within_scaffold unspecified + + # survey of AGP types of gaps: + # beware, can also be type U in col 5, doesn't happen here: + awk '$5 == "N"' *.agp | awk '{print $7,$NF}' | sort | uniq -c \ + | sed -e 's/^/# /;' +# 20 centromere na +# 60 contig na +# 5 scaffold align_genus +# 36 scaffold map +# 96 scaffold paired-ends +# 44 scaffold unspecified +# 21 short_arm na +# 42 telomere na + + # a chromosome to accession name correspondence can be extracted + # from these single line agp files: + zgrep -h -v "^#" chr*.agp | cut -f1,6 | sort > ucsc.ncbi.name.equivalence + # unfortunately, that is only one type of name correspondence. + # there are other names in the assembly report: + grep -v "^#" \ + ../genbank/GCA_000001635.9_GRCm39_assembly_report.txt \ + | awk '{printf "%s\t%s\n", $1,$5}' | sort > ncbi.assembly.name.equivalence + # some of those will match also. Make up a sed command file with + # the two different types of names: + join -t$'\t' ucsc.ncbi.name.equivalence ncbi.assembly.name.equivalence \ + | awk '{printf "s/%s/%s/;\n", $3,$1}' > ncbi.ucsc.sed + join -v1 -t$'\t' ucsc.ncbi.name.equivalence \ + ncbi.assembly.name.equivalence \ + | awk '{printf "s/%s/%s/;\n", $2, $1}' >> ncbi.ucsc.sed + + # no longer need these temporary 2bit files + rm test.2bit refseq.2bit genbank.2bit genbank.gap.bed + +############################################################################# +# Initial database build (DONE - 2020-07-27 - Hiram) + + # verify sequence and AGP are OK: + cd /hive/data/genomes/mm39 + time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -stop=agp mm39.config.ra) > agp.log 2>&1 + # real 2m18.928s + + # then finish it off: + time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ + -fileServer=hgwdev -continue=db mm39.config.ra) > db.log 2>&1 + # real 14m40.115s + + # check in the trackDb files created in TemporaryTrackDbCheckout/ + # and add mm39 to trackDb/makefile refs #22271 + # fixing 
up the images reference to mm39.jpg + + # temporary symlink until masked sequence is available + cd /hive/data/genomes/mm39 + ln -s `pwd`/mm39.unmasked.2bit /gbdb/mm39/mm39.2bit + +############################################################################# +# verify gap table vs NCBI gap file (TBD - 2020-07-17 - Hiram) + mkdir /hive/data/genomes/mm39/bed/gap + cd /hive/data/genomes/mm39/bed/gap + + zgrep -v "^#" ../../genbank/G*_gaps.txt.gz \ + | awk '{printf "%s\t%d\t%d\t%s_%s\n", $1,$2-1,$3,$5,$6}' \ + | sort -k1,1 -k2,2n > genbank.gap.bed + + # type survey: + cut -f4 *.bed | sort | uniq -c | sed -e 's/^/# /;' +# 60 between_scaffolds_na +# 20 centromere_na +# 21 short_arm_na +# 42 telomere_na +# 4 unknown_inferred_from_sequence +# 19 unknown_unspecified +# 5 within_scaffold_align_genus +# 36 within_scaffold_map +# 96 within_scaffold_paired-ends +# 44 within_scaffold_unspecified + + # how much defined by NCBI: + awk '{print $3-$2}' *.bed | ave stdin | grep -w total + # total 73600614.000000 + + # how much in the gap table: + hgsql -e 'select * from gap;' mm39 | awk '{print $4-$3}' \ + | ave stdin | grep -w total + # total 73598294.000000 + + # an extra 2320 bases marked in the gap file + # Compare to mm10: + hgsql -e 'select * from gap;' mm10 | awk '{print $4-$3}' \ + | ave stdin | sed -e 's/^/# /;' +# Q1 100.000000 +# median 838.000000 +# Q3 50000.000000 +# average 113665.609898 +# min 0.000000 +# max 2890000.000000 +# count 687 +# total 78088274.000000 +# standard deviation 485103.795880 + + hgsql -e 'select * from gap;' mm39 | awk '{print $4-$3}' \ + | ave stdin | sed -e 's/^/# /;' +# Q1 1357.000000 +# median 50000.000000 +# Q3 100000.000000 +# average 226456.289231 +# min 0.000000 +# max 2890000.000000 +# count 325 +# total 73598294.000000 +# standard deviation 687212.981441 + + +############################################################################## +# cpgIslands on UNMASKED sequence (DONE - 2020-07-27 - Hiram) + mkdir 
/hive/data/genomes/mm39/bed/cpgIslandsUnmasked + cd /hive/data/genomes/mm39/bed/cpgIslandsUnmasked + + time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -tableName=cpgIslandExtUnmasked \ + -maskedSeq=/hive/data/genomes/mm39/mm39.unmasked.2bit \ + -workhorse=hgwdev -smallClusterHub=ku mm39) > do.log 2>&1 + # real 3m30.591s + + cat fb.mm39.cpgIslandExtUnmasked.txt + # 56535294 bases of 2481941580 (2.278%) in intersection + +############################################################################# +# cytoBandIdeo - (DONE - 2020-07-27 - Hiram) + mkdir /hive/data/genomes/mm39/bed/cytoBand + cd /hive/data/genomes/mm39/bed/cytoBand + makeCytoBandIdeo.csh mm39 + +############################################################################# +# run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2020-07-27 - Hiram) + mkdir /hive/data/genomes/mm39/bed/idKeys + cd /hive/data/genomes/mm39/bed/idKeys + + time (doIdKeys.pl \ + -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit \ + -buildDir=`pwd` mm39) > do.log 2>&1 & +XXX - running - Mon Jul 27 15:15:44 PDT 2020 + # real 3m22.298s + + cat mm39.keySignature.txt + # 174191aae5515d1114a9d6320b152b1a + +############################################################################# +# gapOverlap (DONE - 2020-07-27 - Hiram) + mkdir /hive/data/genomes/mm39/bed/gapOverlap + cd /hive/data/genomes/mm39/bed/gapOverlap + time (doGapOverlap.pl \ + -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit mm39 ) \ + > do.log 2>&1 & +XXX - running - Mon Jul 27 15:15:36 PDT 2020 + # real 1m49.489s + + # there are only nine: + wc -l bed.tab + # 9 bed.tab + cut -f2- bed.tab +chr1 41008264 41010364 chr1:41008265-41010364 1000 + 41008264 41010364 0 2 1000,1000 0,1100 +chr17 58049274 58051374 chr17:58049275-58051374 1000 + 58049274 58051374 0 2 1000,1000 0,1100 +... etc ... 
+chrX 45160089 45162189 chrX:45160090-45162189 1000 + 45160089 45162189 0 2 1000,1000 0,1100 + + cat fb.mm39.gapOverlap.txt + # 16158 bases of 2482000080 (0.001%) in intersection + +############################################################################# +# tandemDups (DONE - 2020-07-27 - Hiram) + mkdir /hive/data/genomes/mm39/bed/tandemDups + cd /hive/data/genomes/mm39/bed/tandemDups + time (~/kent/src/hg/utils/automation/doTandemDup.pl \ + -twoBit=/hive/data/genomes/mm39/mm39.unmasked.2bit mm39) \ + > do.log 2>&1 & +XXX - running - Mon Jul 27 15:16:22 PDT 2020 + # real 188m34.598s + + cat fb.mm39.tandemDups.txt + # 155315479 bases of 3044872214 (5.101%) in intersection + + bigBedInfo mm39.tandemDups.bb | sed -e 's/^/# /;' +# version: 4 +# fieldCount: 13 +# hasHeaderExtension: yes +# isCompressed: yes +# isSwapped: 0 +# extraIndexCount: 0 +# itemCount: 2,822,307 +# primaryDataSize: 72,710,994 +# primaryIndexSize: 292,560 +# zoomLevels: 9 +# chromCount: 5335 +# basesCovered: 1,635,503,835 +# meanDepth (of bases covered): 14.396921 +# minDepth: 1.000000 +# maxDepth: 381.000000 +# std of depth: 29.341113 + +######################################################################### +# ucscToINSDC and ucscToRefSeq table/track (DONE - 2020-07-27 - Hiram) + # construct idKeys for the genbank sequence + mkdir /hive/data/genomes/mm39/genbank/idKeys + cd /hive/data/genomes/mm39/genbank/idKeys + faToTwoBit ../GCA_*m39_genomic.fna.gz mm39.genbank.2bit + + time (doIdKeys.pl -buildDir=`pwd` \ + -twoBit=`pwd`/mm39.genbank.2bit genbankMm39) > do.log 2>&1 & + # real 3m30.599s + + cat genbankMm39.keySignature.txt + # 174191aae5515d1114a9d6320b152b1a + + mkdir /hive/data/genomes/mm39/bed/chromAlias + cd /hive/data/genomes/mm39/bed/chromAlias + + join -t$'\t' ../idKeys/mm39.idKeys.txt \ + ../../genbank/idKeys/genbankMm39.idKeys.txt | cut -f2- \ + | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ + | sort -k1,1 
-k2,2n > ucscToINSDC.bed + + # should be same line counts throughout: + wc -l * ../../chrom.sizes + # 2198 ucscToINSDC.bed + # 2198 ../../chrom.sizes + + export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` + echo $chrSize + # 23 + # use the $chrSize in this sed + sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | hgLoadSqlTab mm39 ucscToINSDC stdin ucscToINSDC.bed + + # should be quiet for all OK + checkTableCoords mm39 + + # should cover %100 entirely: + featureBits -countGaps mm39 ucscToINSDC + # 2482000080 bases of 2482000080 (100.000%) in intersection + +######################################################################### +# add chromAlias table (TBD - 2020-05-20 - Hiram) + + mkdir /hive/data/genomes/mm39/bed/chromAlias + cd /hive/data/genomes/mm39/bed/chromAlias + + hgsql -N -e 'select chrom,name from ucscToRefSeq;' mm39 \ + | sort -k1,1 > ucsc.refseq.tab + hgsql -N -e 'select chrom,name from ucscToINSDC;' mm39 \ + | sort -k1,1 > ucsc.genbank.tab + + wc -l *.tab + # 2198 ucsc.genbank.tab + + ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ + > mm39.chromAlias.tab + +for t in genbank +do + c0=`cat ucsc.$t.tab | wc -l` + c1=`grep $t mm39.chromAlias.tab | wc -l` + ok="OK" + if [ "$c0" -ne "$c1" ]; then + ok="ERROR" + fi + printf "# checking $t: $c0 =? $c1 $ok\n" +done +# checking genbank: 2198 =? 2198 OK + + # verify chrM is here properly: + grep chrM mm39.chromAlias.tab +# CM022001.1 chrM genbank + + hgLoadSqlTab mm39 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ + mm39.chromAlias.tab + +######################################################################### +# fixup search rule for assembly track/gold table (TBD - 2020-07-17 - Hiram) + cd ~/kent/src/hg/makeDb/trackDb/dog/mm39 + # preview prefixes and suffixes: + hgsql -N -e "select frag from gold;" mm39 \ + | sed -e 's/[0-9_.]\+//;' | sort | uniq -c + 1037 CM + 758 REHQ + + # implies a rule: '[CR][ME][HQ0-9]+(\.[0-9_]+)?' 
+ + # verify this rule will find them all and eliminate them all: + hgsql -N -e "select frag from gold;" mm39 | wc -l + # 1795 + + hgsql -N -e "select frag from gold;" mm39 \ + | egrep -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l + # 1795 + + hgsql -N -e "select frag from gold;" mm39 \ + | egrep -v -e '[CR][ME][HQ0-9]+(\.[0-9_]+)?' | wc -l + # 0 + + # hence, add to trackDb/rhesus/mm39/trackDb.ra +searchTable gold +shortCircuit 1 +termRegex [CR][ME][HQ0-9]+(\.[0-9_]+)? +query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' +searchPriority 8 + + # verify searches work in the position box + + git commit -m 'adding search rule for gold/assembly track refs #22271' \ + trackDb.ra + +########################################################################## +# running repeat masker (DONE - 2020-07-27 - Hiram) + mkdir /hive/data/genomes/mm39/bed/repeatMasker + cd /hive/data/genomes/mm39/bed/repeatMasker + time (doRepeatMasker.pl -buildDir=`pwd` \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -smallClusterHub=ku mm39) > do.log 2>&1 +XXX - running - Mon Jul 27 14:48:56 PDT 2020 + # real 293m51.353s + + cat faSize.rmsk.txt +# 2482000080 bases (58500 N's 2481941580 real 1403544550 upper +# 1078397030 lower) in 2198 sequences in 1 files +# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1) +# max 124992030 (chrX) median 43246 +# %43.45 masked total, %43.45 masked real + + egrep -i "versi|relea" do.log +# RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ +# grep version of RepeatMasker$ /hive/data/staging/data/RepeatMasker/RepeatMasker +# February 01 2017 (open-4-0-8) 1.332 version of RepeatMasker +# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl +# CC Dfam_Consensus RELEASE 20181026; * +# CC RepBase RELEASE 20181026; * + + time featureBits -countGaps mm39 rmsk + # 1078398935 bases of 2482000080 (43.449%) in intersection + # real 0m35.578s + + # why 
is it different than the faSize above ? + # because rmsk masks out some N's as well as bases, the faSize count above + # separates out the N's from the bases, it doesn't show lower case N's + + # faster way to get the same result on high contig count assemblies: + time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' mm39 \ + | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" + # total 1078398935.000000 + # real 0m22.013s + +########################################################################## +# running simple repeat (DONE - 2020-07-27 - Hiram) + + mkdir /hive/data/genomes/mm39/bed/simpleRepeat + cd /hive/data/genomes/mm39/bed/simpleRepeat + time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ + -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ + -trf409=6 mm39) > do.log 2>&1 +XXX - running - Mon Jul 27 14:49:35 PDT 2020 + # real 7m53.400s + + cat fb.simpleRepeat + # 42156507 bases of 2337131234 (1.804%) in intersection + +XXX - ready for masking - 2020-07-17 + cd /hive/data/genomes/mm39 + # if using the Window Masker result: + cd /hive/data/genomes/mm39 +# twoBitMask bed/windowMasker/mm39.cleanWMSdust.2bit \ +# -add bed/simpleRepeat/trfMask.bed mm39.2bit + # you can safely ignore the warning about fields >= 13 + + # add to rmsk after it is done: + twoBitMask mm39.rmsk.2bit \ + -add bed/simpleRepeat/trfMask.bed mm39.2bit + # you can safely ignore the warning about fields >= 13 + twoBitToFa mm39.2bit stdout | faSize stdin > faSize.mm39.2bit.txt + cat faSize.mm39.2bit.txt +# 2482000080 bases (58500 N's 2481941580 real 1401386884 upper +# 1080554696 lower) in 2198 sequences in 1 files +# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1) +# max 124992030 (chrX) median 43246 +# %43.54 masked total, %43.54 masked real + + rm /gbdb/mm39/mm39.2bit + ln -s `pwd`/mm39.2bit /gbdb/mm39/mm39.2bit + +######################################################################### +# CREATE MICROSAT TRACK (TBD - 2020-03-31 
- Hiram) + ssh hgwdev + mkdir /cluster/data/mm39/bed/microsat + cd /cluster/data/mm39/bed/microsat + + awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ + ../simpleRepeat/simpleRepeat.bed > microsat.bed + + hgLoadBed mm39 microsat microsat.bed + # Read 65981 elements of size 4 from microsat.bed + +########################################################################## +## WINDOWMASKER (TBD - 2020-03-31 - Hiram) + + mkdir /hive/data/genomes/mm39/bed/windowMasker + cd /hive/data/genomes/mm39/bed/windowMasker + time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ + -dbHost=hgwdev mm39) > do.log 2>&1 + # real 90m16.169s + + # Masking statistics + cat faSize.mm39.cleanWMSdust.txt +# 2482000080 bases (58500 N's 2481941580 real 1630728232 upper 851213348 lower) +# in 2198 sequences in 1 files +# Total size: mean 1129208.4 sd 8542765.0 min 13084 (chrUn_JAAHUQ010000994v1) +# max 124992030 (chrX) median 43246 +# %34.30 masked total, %34.30 masked real + + cat fb.mm39.rmsk.windowmaskerSdust.txt + # 598271411 bases of 2482000080 (24.104%) in intersection + +########################################################################## +# cpgIslands - (TBD - 2020-04-02 - Hiram) + mkdir /hive/data/genomes/mm39/bed/cpgIslands + cd /hive/data/genomes/mm39/bed/cpgIslands + time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ + -workhorse=hgwdev -smallClusterHub=ku mm39) > do.log 2>&1 + # real 3m29.034s + + cat fb.mm39.cpgIslandExt.txt + # 47618882 bases of 2481941580 (1.919%) in intersection + +############################################################################## +# genscan - (TBD - 2020-04-02 - Hiram) + mkdir /hive/data/genomes/mm39/bed/genscan + cd /hive/data/genomes/mm39/bed/genscan + time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ + -bigClusterHub=ku mm39) > do.log 2>&1 + # real 8m19.775s + + # two jobs broken: +./runGsBig2M.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep 
subopt/000/chr22.bed & +./runGsBig2M.csh chr34 000 gtf/000/chr34.gtf pep/000/chr34.pep subopt/000/chr34.bed +wait + # real 14m27.845s + + time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ + -continue=makeBed -bigClusterHub=ku mm39) > makeBed.log 2>&1 + # real 0m45.365s + + cat fb.mm39.genscan.txt + # 57650331 bases of 2481941580 (2.323%) in intersection + + cat fb.mm39.genscanSubopt.txt + # 50129491 bases of 2481941580 (2.020%) in intersection + +######################################################################### +# Create kluster run files (TBD - 2020-04-02 - Hiram) + + # numerator is mm39 gapless bases "real" as reported by: + featureBits -noRandom -noHap mm39 gap + # 36700 bases of 2353522726 (0.002%) in intersection + # ^^^ + + # denominator is hg19 gapless bases as reported by: + # featureBits -noRandom -noHap hg19 gap + # 234344806 bases of 2861349177 (8.190%) in intersection + # 1024 is threshold used for human -repMatch: + calc \( 2353522726 / 2861349177 \) \* 1024 + # ( 2353522726 / 2861349177 ) * 1024 = 842.262556 + + # ==> use -repMatch=800 according to size scaled down from 1024 for human. 
+ # and rounded down to nearest 50 + cd /hive/data/genomes/mm39 + time blat mm39.2bit \ + /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/mm39.11.ooc \ + -repMatch=800 + # Wrote 34718 overused 11-mers to jkStuff/mm39.11.ooc + # real 0m21.985s + + # canFam3 at repMatch=900: + # Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc + # real 1m11.629s + + # there are no non-bridged gaps + hgsql -N \ + -e 'select * from gap where bridge="no" order by size;' mm39 \ + + # HOWEVER, every gap in this assembly is the same 'within scaffold' + # at size 100: + hgsql -N -e 'select size from gap where bridge="yes" order by size;' + mm39 | sort | uniq -c + # 585 100 + + # using these gaps to make a lift file + # minimum gap size is 100 and produces a reasonable number of lifts + gapToLift -verbose=2 -minGap=100 mm39 jkStuff/mm39.nonBridged.lft \ + -bedFile=jkStuff/mm39.nonBridged.bed + wc -l jkStuff/mm39.nonBri* + # 2198 jkStuff/mm39.nonBridged.bed + # 2198 jkStuff/mm39.nonBridged.lft + +######################################################################## +# lastz/chain/net swap human/hg38 (TBD - 2020-04-10 - Hiram) + + # original alignment + cd /hive/data/genomes/hg38/bed/lastzMm39.2020-04-02 + + cat fb.hg38.chainMm39Link.txt + # 1549397508 bases of 3110768607 (49.808%) in intersection + cat fb.hg38.chainSynMm39Link.txt + # 1488468205 bases of 3110768607 (47.849%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ + hg38 mm39) > rbest.log 2>&1 & + # real 310m32.196s + + cat fb.hg38.chainRBest.Mm39.txt + # 1425406620 bases of 3110768607 (45.822%) in intersection + + # and for the swap: + mkdir /hive/data/genomes/mm39/bed/blastz.hg38.swap + cd /hive/data/genomes/mm39/bed/blastz.hg38.swap + + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzMm39.2020-04-02/DEF \ + -swap -chainMinScore=3000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 + # real 
99m10.990s + + cat fb.mm39.chainHg38Link.txt + # 1493209286 bases of 2481941580 (60.163%) in intersection + cat fb.mm39.chainSynHg38Link.txt + # 1448164376 bases of 2481941580 (58.348%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ + mm39 hg38) > rbest.log 2>&1 & + # real 257m59.713s + + cat fb.mm39.chainRBest.Hg38.txt + # 1425296830 bases of 2481941580 (57.427%) in intersection + +########################################################################### +# lastz/chain/net swap mouse/mm10 (TBD - 2020-04-20 - Hiram) + + # original alignment + cat fb.mm10.chainMm39Link.txt + # 777883731 bases of 2652783500 (29.323%) in intersection + cat fb.mm10.chainSynMm39Link.txt + # 736602602 bases of 2652783500 (27.767%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev mm10 mm39 \ + -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & + # real 219m16.168s + + cat fb.mm10.chainRBest.Mm39.txt + # 741307883 bases of 2652783500 (27.945%) in intersection + + mkdir /hive/data/genomes/mm39/bed/blastz.mm10.swap + cd /hive/data/genomes/mm39/bed/blastz.mm10.swap + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/mm10/bed/lastzMm39.2020-04-02/DEF \ + -swap -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & + # real 50m20.639s + + cat fb.mm39.chainMm10Link.txt + # 772902855 bases of 2481941580 (31.141%) in intersection + cat fb.mm39.chainSynMm10Link.txt + # 737924732 bases of 2481941580 (29.732%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev mm39 mm10 \ + -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & + # real 173m38.016s + + cat fb.mm39.chainRBest.Mm10.txt + # 740357755 bases of 2481941580 (29.830%) in intersection + +############################################################################## +# GENBANK AUTO UPDATE (TBD - 2020-04-09 - Hiram) + ssh hgwdev + cd $HOME/kent/src/hg/makeDb/genbank + git pull + # 
/cluster/data/genbank/data/organism.lst shows: + # organism mrnaCnt estCnt refSeqCnt + # Canis latrans 2 0 0 + # Canis lupus 36 0 0 + # Canis lupus familiaris 3351 382644 1718 + # Canis lupus laniger 2 0 0 + # Canis lupus lupus 2 0 0 + # Canis mesomelas 1 0 0 + # Canis sp. 45 0 0 + + # the latrans is the Coyota, the mesomelas + # is the Black-backed jackal from Africa and the langier is the Tibetan wolf + # lupus lupus is the Eurasian wolf + + # edit etc/genbank.conf to add mm39 just after canFam3 + +# mm39 (German shepard - GCA_011100685.1 - UU_Cfam_GSD_1.0) +mm39.serverGenome = /hive/data/genomes/mm39/mm39.2bit +mm39.ooc = /hive/data/genomes/mm39/jkStuff/mm39.11.ooc +mm39.lift = /hive/data/genomes/mm39/jkStuff/mm39.nonBridged.lft +mm39.align.unplacedChroms = chrUn_* +mm39.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} +mm39.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} +mm39.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} +mm39.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} +mm39.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} +mm39.refseq.mrna.native.load = yes +mm39.refseq.mrna.xeno.load = yes +# DO NOT NEED genbank.mrna.xeno except for human, mouse +mm39.genbank.mrna.xeno.load = yes +mm39.downloadDir = mm39 +mm39.upstreamGeneTbl = refGene +mm39.perChromTables = no + + # verify the files specified exist before checking in the file: + grep ^mm39 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og +# -rw-rw-r-- 1 651703337 Apr 2 08:57 /hive/data/genomes/mm39/mm39.2bit +# -rw-rw-r-- 1 138880 Apr 2 09:51 /hive/data/genomes/mm39/jkStuff/mm39.11.ooc +# -rw-rw-r-- 1 139818 Apr 2 09:56 /hive/data/genomes/mm39/jkStuff/mm39.nonBridged.lft + + git commit -m "Added mm39 dog; refs #22271" etc/genbank.conf + git push + + # update /cluster/data/genbank/: + make etc-update + + # enable daily alignment and 
update of hgwdev + cd ~/kent/src/hg/makeDb/genbank + git pull + # add mm39 to: + # etc/hgwdev.dbs etc/align.dbs + git commit -m "Added mm39 - dog refs #22271" etc/hgwdev.dbs etc/align.dbs + git push + make etc-update + + # wait a few days for genbank magic to take place, the tracks will + # appear + +############################################################################# +# augustus gene track (TBD - 2020-04-10 - Hiram) + + mkdir /hive/data/genomes/mm39/bed/augustus + cd /hive/data/genomes/mm39/bed/augustus + time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ + -species=human -dbHost=hgwdev \ + -workhorse=hgwdev mm39) > do.log 2>&1 + # real 74m39.734s + + cat fb.mm39.augustusGene.txt + # 49999966 bases of 2481941580 (2.015%) in intersection + +######################################################################### +# ncbiRefSeq (TBD - 2019-11-20 - Hiram) + ### XXX ### Not available on GCA/genbank assemblies + + mkdir /hive/data/genomes/mm39/bed/ncbiRefSeq + cd /hive/data/genomes/mm39/bed/ncbiRefSeq + # running step wise just to be careful + time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ + -bigClusterHub=ku -dbHost=hgwdev \ + -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ + refseq vertebrate_mammalian Gorilla_gorilla \ + GCA_008122165.1_Kamilah_GGO_v0 mm39) > download.log 2>&1 + # real 1m37.523s + + time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ + -continue=process -bigClusterHub=ku -dbHost=hgwdev \ + -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ + refseq vertebrate_mammalian Gorilla_gorilla \ + GCF_008122165.1_Kamilah_GGO_v0 mm39) > process.log 2>&1 + # real 2m9.450s + + time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ + -continue=load -bigClusterHub=ku -dbHost=hgwdev \ + -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ + refseq vertebrate_mammalian Gorilla_gorilla \ + GCF_008122165.1_Kamilah_GGO_v0 mm39) > 
load.log 2>&1 + # real 0m21.982s + + cat fb.ncbiRefSeq.mm39.txt + # 74279781 bases of 2999027915 (2.477%) in intersection + + # add: include ../../refSeqComposite.ra alpha + # to the gorilla/mm39/trackDb.ra to turn on the track in the browser + + # XXX 2019-11-20 - ready for this after genbank runs + + featureBits -enrichment mm39 refGene ncbiRefSeq + # refGene 0.402%, ncbiRefSeq 3.148%, both 0.402%, cover 99.90%, enrich 31.73x + featureBits -enrichment mm39 ncbiRefSeq refGene + # ncbiRefSeq 3.148%, refGene 0.402%, both 0.402%, cover 12.76%, enrich 31.73x + + featureBits -enrichment mm39 ncbiRefSeqCurated refGene + # ncbiRefSeqCurated 0.401%, refGene 0.402%, both 0.400%, cover 99.66%, enrich 247.79x + + featureBits -enrichment mm39 refGene ncbiRefSeqCurated + # refGene 0.402%, ncbiRefSeqCurated 0.401%, both 0.400%, cover 99.33%, enrich 247.79x + +######################################################################### +# LIFTOVER TO canFam3 (TBD - 2020-04-02 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/mm39/bed/blat.canFam3.2020-04-02 + cd /hive/data/genomes/mm39/bed/blat.canFam3.2020-04-02 + doSameSpeciesLiftOver.pl -verbose=2 \ + -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/mm39/jkStuff/mm39.11.ooc \ + mm39 canFam3 + time (doSameSpeciesLiftOver.pl -verbose=2 \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/mm39/jkStuff/mm39.11.ooc \ + mm39 canFam3) > doLiftOverToCanFam3.log 2>&1 + # real 1100m17.743s + + # see if the liftOver menus function in the browser from mm39 to canFam3 + +######################################################################### +# BLATSERVERS ENTRY (TBD - 2020-04-02 - Hiram) +# After getting a blat server assigned by the Blat Server Gods, + ssh hgwdev + + hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("mm39", "blat1b", "17904", "1", "0"); \ + INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("mm39", "blat1b", 
"17905", "0", "1");' \ + hgcentraltest + # test it with some sequence + +############################################################################ +## reset default position to gene: CDH2 upon recommendation from Kerstin +## (TBD - 2020-06-22 - Hiram) + + ssh hgwdev + hgsql -e 'update dbDb set defaultPos="chr7:60683331-61003907" + where name="mm39";' hgcentraltest + +############################################################################## +# crispr whole genome (TBD - 2020-04-09 - Hiram) + mkdir /hive/data/genomes/mm39/bed/crisprAll + cd /hive/data/genomes/mm39/bed/crisprAll + + # the large shoulder argument will cause the entire genome to be scanned + # this takes a while for a new genome to get the bwa indexing done + time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \ + mm39 genscan -shoulder=250000000 -tableName=crisprAll \ + -fileServer=hgwdev \ + -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ + -workhorse=hgwdev) > ranges.log 2>&1 + # real 1m16.539s + + time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ + -continue=guides -stop=specScores mm39 genscan \ + -shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \ + -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ + -workhorse=hgwdev) > specScores.log 2>&1 + # real 6558m26.295s + + cat guides/run.time | sed -e 's/^/# /;' +# Completed: 100 of 100 jobs +# CPU time in finished jobs: 11979s 199.66m 3.33h 0.14d 0.000 y +# IO & Wait Time: 251s 4.18m 0.07h 0.00d 0.000 y +# Average job time: 122s 2.04m 0.03h 0.00d +# Longest finished job: 289s 4.82m 0.08h 0.00d +# Submission to last job: 303s 5.05m 0.08h 0.00d + + cat specScores/run.time | sed -e 's/^/# /;' +# Completed: 3096565 of 3096565 jobs +# CPU time in finished jobs: 263946983s 4399116.38m 73318.61h 3054.94d 8.370 y +# IO & Wait Time: 17766691s 296111.52m 4935.19h 205.63d 0.563 y +# Average job time: 91s 1.52m 0.03h 0.00d +# Longest finished job: 851s 14.18m 0.24h 0.01d +# Submission to last 
job: 324649s 5410.82m 90.18h 3.76d + +# # Number of specScores: 233102255 + + ### remember to get back to hgwdev to run this + time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ + -continue=effScores -stop=load mm39 genscan \ + -shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \ + -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ + -workhorse=hgwdev) > load.log 2>&1 + # real 932m13.229s + + cat effScores/run.time | sed -e 's/^/# /;' +# Completed: 25662 of 25662 jobs +# CPU time in finished jobs: 12763858s 212730.96m 3545.52h 147.73d 0.405 y +# IO & Wait Time: 144123s 2402.05m 40.03h 1.67d 0.005 y +# Average job time: 503s 8.38m 0.14h 0.01d +# Longest finished job: 4091s 68.18m 1.14h 0.05d +# Submission to last job: 15067s 251.12m 4.19h 0.17d + + cat offTargets/run.time | sed -e 's/^/# /;' +# Completed: 154829 of 154829 jobs +# CPU time in finished jobs: 1805712s 30095.20m 501.59h 20.90d 0.057 y +# IO & Wait Time: 3128264s 52137.73m 868.96h 36.21d 0.099 y +# Average job time: 32s 0.53m 0.01h 0.00d +# Longest finished job: 273s 4.55m 0.08h 0.00d +# Submission to last job: 5337s 88.95m 1.48h 0.06d + +######################################################################### +# all.joiner update, downloads and in pushQ - (WORKING - 2019-11-20 - Hiram) + cd $HOME/kent/src/hg/makeDb/schema + # verify all the business is done for release + ~/kent/src/hg/utils/automation/verifyBrowser.pl mm39 +# 66 tables in database mm39 - Dog, Canis lupus familiaris +# verified 55 tables in database mm39, 11 extra tables, 14 optional tables +# chainNetRBestHg38 3 optional tables +# chainNetRBestMm10 3 optional tables +# chainNetSynHg38 3 optional tables +# chainNetSynMm10 3 optional tables +# gapOverlap 1 optional tables +# tandemDups 1 optional tables +# 1 chainCanFam3 - extra table +# 2 chainCanFam3Link - extra table +# 3 chainRBestCanFam3 - extra table +# 4 chainRBestCanFam3Link - extra table +# . . . etc . . . 
+# 8 crisprAllTargets - extra table +# 9 netCanFam3 - extra table +# 10 netRBestCanFam3 - extra table +# 11 netSynCanFam3 - extra table +# 13 genbank tables found +# verified 28 required tables, 1 missing tables +# 1 ucscToRefSeq - missing table +# hg38 chainNet to mm39 found 3 required tables +# mm10 chainNet to mm39 found 3 required tables +# hg38 chainNet RBest and syntenic to mm39 found 6 optional tables +# mm10 chainNet RBest and syntenic to mm39 found 3 optional tables +# liftOver to previous versions: 1, from previous versions: 1 + + # fixup all.joiner until this is a clean output + joinerCheck -database=mm39 -tableCoverage all.joiner + joinerCheck -database=mm39 -times all.joiner + joinerCheck -database=mm39 -keys all.joiner + + # when clean, check in: + git commit -m 'adding rules for mm39 refs #22271' all.joiner + git push + # run up a 'make alpha' in hg/hgTables to get this all.joiner file + # into the hgwdev/genome-test system + + cd /hive/data/genomes/mm39 + time (makeDownloads.pl mm39) > downloads.log 2>&1 + # real 16m11.233s + + # now ready for pushQ entry + mkdir /hive/data/genomes/mm39/pushQ + cd /hive/data/genomes/mm39/pushQ + time ($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList mm39) > mm39.pushQ.sql 2> stderr.out + # real 15m2.385s +XXXX + + # remove the tandemDups and gapOverlap from the file list: + sed -i -e "/tandemDups/d" redmine.mm39.table.list + sed -i -e "/Tandem Dups/d" redmine.mm39.releaseLog.txt + sed -i -e "/gapOverlap/d" redmine.mm39.table.list + sed -i -e "/Gap Overlaps/d" redmine.mm39.releaseLog.txt + + # check for errors in stderr.out, some are OK, e.g.: + # WARNING: mm39 does not have ucscToRefSeq + # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqVersion.txt + # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.bb + # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.ix + # WARNING: hgwdev does not have /gbdb/mm39/ncbiRefSeq/ncbiRefSeqOther.ixx + # WARNING: 
hgwdev does not have /gbdb/mm39/ncbiRefSeq/seqNcbiRefSeq.rna.fa + # WARNING: mm39 does not have seq + # WARNING: mm39 does not have extFile + + # verify that the file list correctly matches existing files + cat redmine.mm39.file.list | while read L +do + eval ls $L > /dev/null +done + # should be silent, missing files will show as errors + + # verify database tables, how many to expect: + wc -l redmine.mm39.table.list + # 52 redmine.mm39.table.list + + # how many actual: + awk -F'.' '{printf "hgsql -N %s -e '"'"'show table status like \"%s\";'"'"'\n", $1, $2}' redmine.mm39.table.list | sh | wc -l + # 52 + + # the actual count would be smaller if some tables were missing + + # add the path names of the listing files to the redmine issue + # in the three appropriate entry boxes: + +# /hive/data/genomes/mm39/pushQ/redmine.mm39.file.list +# /hive/data/genomes/mm39/pushQ/redmine.mm39.releaseLog.txt +# /hive/data/genomes/mm39/pushQ/redmine.mm39.table.list + +#########################################################################