fa35e265cbbfb28dc28b754cdfd4e79c05b42970 galt Fri Oct 14 19:49:24 2022 -0700 Finished genome build of database that has ONLY new fix and patch chromosomes. The first part of the hg38 patch14 build. diff --git src/hg/makeDb/doc/hg38/grcH38P14.txt src/hg/makeDb/doc/hg38/grcH38P14.txt new file mode 100644 index 0000000..c744bad --- /dev/null +++ src/hg/makeDb/doc/hg38/grcH38P14.txt @@ -0,0 +1,445 @@ +# for emacs: -*- mode: sh; -*- + +############################################################################## +# GRCh38 patch 14 build: build basic tracks on separate database for new +# sequences only (relative to hg38). +############################################################################## + +# run this first, then it continues the patch in patchUpdate.14.txt + +############################################################################## +# download or rather ln -s the patch release files (DONE - 2022-09-13 - Galt) + + # Note: newer assemblies use refseq releases instead of genbank, but hg38 uses genbank + # so continue with that when building patches. + mkdir -p /hive/data/genomes/grcH38P14/genbank + cd /hive/data/genomes/grcH38P14/genbank + + # Releases have already been downloaded to /hive/data/outside/ncbi/genomes/. + ln -s /hive/data/outside/ncbi/genomes/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/* . + + +############################################################################## +# Set up fasta and agp with UCSC names (DONE - 2022-09-23 - Galt) + + mkdir /hive/data/genomes/grcH38P14/ucsc + cd /hive/data/genomes/grcH38P14/ucsc + + # identify sequences not in existing genome db + faCount ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz \ + > faCount.GRCh38.p14.txt + ~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl \ + /hive/data/genomes/hg38/chrom.sizes \ + faCount.GRCh38.p14.txt ../genbank/GCA_000001405.29_GRCh38.p14_assembly_report.txt \ + | grep -w new > new.sequences.list + wc -l new.sequences.list +#71 new.sequences.list + + # Extract UCSC-named FASTA for the new sequences + cut -f3 new.sequences.list > extract.new.list + awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed + faSomeRecords ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz extract.new.list stdout \ + | sed -e 's/ .*//;' \ + | sed -f genbankToUCSC.sed \ + | gzip -c > grcH38P14.fa.gz + faSize grcH38P14.fa.gz +#27093089 bases (242788 N's 26850301 real 17087494 upper 9762807 lower) in 71 sequences in 1 files +#Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 (chr5_MU273354v1_fix) median 301310 + + # Compare faSize results for whole GCA_000001405.29_GRCh38.p14_genomic.fna.gz + # vs. concatenation of hg38 fasta and grcH38P14.fa.gz: + faSize ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz +#3298912062 bases (161611482 N's 3137300580 real 1945815910 upper 1191484670 lower) in 709 sequences in 1 files +#Total size: mean 4652908.4 sd 25469887.9 min 970 (KI270394.1) max 248956422 (CM000663.2) median 171027 + + + twoBitToFa /hive/data/genomes/hg38/hg38.2bit stdout \ + | faSize grcH38P14.fa.gz stdin +#3299210039 bases (161611482 N's 3137598557 real 1506729106 upper 1630869451 lower) in 711 sequences in 2 files +#Total size: mean 4640239.2 sd 25435109.8 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 171027 + +NOTE that 709 != 711. 711 - 709 == 2, so our counts are off by 2. + +The reason that two fixes went missing is because they got a new subversion, and should apparently replace +the original version. Two of the fix patches has changed from version1 to version2, +and they have removed obsolete version1 from the genbank release. + +There are some notes in /hive/data/genomes/grcH38P14/ucsc/galt.NOTES + +# in p13 and older releases +chr11_KQ759759v1_fix +chr22_KQ759762v1_fix + +So, these just became updated as v2 in the new assembly? +HG1311_HG2539_PATCH fix-patch 22 Chromosome KQ759762.2 <> na PATCHES 101040 na +HG107_HG2565_PATCH fix-patch 11 Chromosome KQ759759.2 <> na PATCHES 204999 na + +and in the new one we have: +chr11_KQ759759v2_fix +chr22_KQ759762v2_fix + +[galt@hgwdev ucsc]$ egrep '(KQ759759|KQ759762)' new.sequences.list +chr11_KQ759759v2_fix 204999 KQ759759.2 new +chr22_KQ759762v2_fix 101040 KQ759762.2 new + +Since the removed sequences are in the main hg38.2bit, but not in the new patch14 sequences, +we can just continue the GRCh38 patch 14 build which is not affected. +We might delete those 2 with old versions in patchUpdate.14.txt which is run after this build doc. + + # Good, everything in GCA_000001405.29_GRCh38.p14_genomic.fna.gz is accounted for + # between hg38.2bit and grcH38P14.fa.gz + + # Make UCSC-named AGP: + zcat ../genbank/GCA_000001405.29_GRCh38.p14_assembly_structure/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \ + | grep -Fwf extract.new.list \ + | sed -f genbankToUCSC.sed > grcH38P14.agp + + # construct 2bit file: + cd /hive/data/genomes/grcH38P14 + faToTwoBit ucsc/grcH38P14.fa.gz grcH38P14.unmasked.2bit + twoBitInfo grcH38P14.unmasked.2bit stdout | sort -k2nr > chrom.sizes + # take a look at chrom.sizes to verify it looks OK. + + # Make sure AGP and FASTA/2bit agree: + checkAgpAndFa ucsc/grcH38P14.agp grcH38P14.unmasked.2bit | tail -1 +#All AGP and FASTA entries agree - both files are valid + + +############################################################################## +# establish config.ra file (DONE - Galt - 2021-09-23) + # arguments here are: + cd /hive/data/genomes/grcH38P14 + # Must make photoReference.txt first -- copy from hg38 + cp /hive/data/genomes/hg38/photoReference.txt . + $HOME/kent/src/hg/utils/automation/prepConfig.pl grcH38P14 haplotypes \ + GRCh38.p14 genbank/*_assembly_report.txt > grcH38P14.config.ra + # Edit grcH38P14.config.ra to avoid confusion with actual hg38 +assemblyDate Feb. 2019 p14 +orderKey 2001 + +# NOTE FROM GALT, a way to record it in the makedoc with comments so we know what values were used? + sed -e 's/^/#/' grcH38P14.config.ra +## config parameters for makeGenomeDb.pl: +#db grcH38P14 +#clade haplotypes +#scientificName Homo sapiens +#commonName Human +#assemblyDate Feb. 2019 p14 +#assemblyLabel Genome Reference Consortium +#assemblyShortLabel GRCh38.p14 +#orderKey 2001 +## mitochondrial sequence included in refseq release +## mitoAcc J01415.2 +#mitoAcc none +#fastaFiles /hive/data/genomes/grcH38P14/ucsc/*.fa.gz +#agpFiles /hive/data/genomes/grcH38P14/ucsc/*.agp +## qualFiles none +#dbDbSpeciesDir GRCh38.p14 +#photoCreditURL http://www.cbse.ucsc.edu/ +#photoCreditName Graphic courtesy of CBSE +#ncbiGenomeId 51 +#ncbiAssemblyId 11968211 +#ncbiAssemblyName GRCh38.p14 +#ncbiBioProject 31257 +#ncbiBioSample notFound +#genBankAccessionID GCA_000001405.29 +#taxId 9606 + + +############################################################################## +# Initial database build (DONE - 2022-09-23 - Galt) + + cd /hive/data/genomes/grcH38P14 + # AGP and unmasked.2bit are already built and checked, so start at the db step: + mkdir jkStuff + $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl grcH38P14.config.ra -debug +#HgStepManager: executing from step 'seq' through step 'trackDb'. +#HgStepManager: executing step 'seq' Fri Sep 23 16:26:50 2022. +#seq: looks like this was run successfully already (/cluster/data/grcH38P14/chrom.sizes exists). Either run with -continue agp or some later step, or move aside/remove /cluster/data/grcH38P14/chrom.sizes and run again. + + + # Make chromInfo.tab. + mkdir -p bed/chromInfo + awk '{print $1 "\t" $2 "\t/gbdb/grcH38P14/grcH38P14.2bit";}' chrom.sizes \ + > bed/chromInfo/chromInfo.tab + # Make a link to the .agp file where makeGenomeDb.pl expects to find it. + ln -s ucsc/grcH38P14.agp . + + # Skip 'seq' and 'agp' steps because those were done above. + time ($HOME/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -continue=db grcH38P14.config.ra) > db.log 2>&1 & + tail -f db.log +#real 0m22.790s + + # actually, I had to add uniprot.ra to git archive step list in $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl + # then I restarted it at the dbDb step. + + # Ignore all the "NOTES -- STUFF THAT YOU WILL HAVE TO DO --" stuff because this is + # going to be folded into hg38. + # Now the gold, gap and gc5BaseBw tracks are built. + +############################################################################# +# RepeatMasker (DONE - 2022-09-23 - Galt) + mkdir /hive/data/genomes/grcH38P14/bed/repeatMasker + cd /hive/data/genomes/grcH38P14/bed/repeatMasker + time ($HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -smallClusterHub=ku grcH38P14) > do.log 2>&1 & + tail -f do.log +#real 49m1.583s + + egrep "bases|Total|masked" faSize.rmsk.txt \ + | fold -w 95 -s | sed -e 's/^/# /;' +# 27093089 bases (242788 N's 26850301 real 13541432 upper 13308869 lower) in 71 sequences in 1 +# files +# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 +# (chr5_MU273354v1_fix) median 301310 +# %49.12 masked total, %49.57 masked real + + + egrep -i "versi|relea" do.log | sed -e 's/^/# /;' +# VERSION:\n +# /cluster/software/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`. +# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl +# RepeatMasker version 4.1.2-p1 +# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl +# CC Artefacts RELEASE 20190301; * +# CC Dfam RELEASE Dfam_3.1; + + featureBits -countGaps grcH38P14 rmsk +#13308822 bases of 27093089 (49.123%) in intersection + + +########################################################################## +# running simple repeat (DONE - 2022-09-23 - Galt) + + mkdir /hive/data/genomes/grcH38P14/bed/simpleRepeat + cd /hive/data/genomes/grcH38P14/bed/simpleRepeat + # using trf409 6 here like hg38 + time ($HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl -buildDir=`pwd` \ + -dbHost=hgwdev -workhorse=hgwdev -bigClusterHub=ku -smallClusterHub=ku \ + -trf409 6 grcH38P14) > do.log 2>&1 & +#real 1m37.386s + + cat fb.simpleRepeat +#942219 bases of 26850301 (3.509%) in intersection + + # adding this trfMask to .rmsk.2bit + cd /hive/data/genomes/grcH38P14 + twoBitMask grcH38P14.rmsk.2bit \ + -add bed/simpleRepeat/trfMask.bed grcH38P14.2bit + # you can safely ignore the warning about fields >= 13 + + twoBitToFa grcH38P14.2bit stdout | faSize stdin > faSize.grcH38P14.2bit.txt + egrep "bases|Total|masked" faSize.grcH38P14.2bit.txt \ + | fold -w 95 -s | sed -e 's/^/# /;' +# 27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1 +# files +# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 +# (chr5_MU273354v1_fix) median 301310 +# %49.24 masked total, %49.68 masked real + + + # reset the symlink + ln -sf `pwd`/grcH38P14.2bit /gbdb/grcH38P14/grcH38P14.2bit + + +########################################################################## +## WINDOWMASKER (DONE - 2022-09-26 - Galt) + + mkdir /hive/data/genomes/grcH38P14/bed/windowMasker + cd /hive/data/genomes/grcH38P14/bed/windowMasker + time ($HOME/kent/src/hg/utils/automation/doWindowMasker.pl -buildDir=`pwd` \ + -workhorse=hgwdev -dbHost=hgwdev grcH38P14) > do.log 2>&1 +# *** All done ! - Elapsed time: 0m47s + + featureBits -countGaps grcH38P14 rmsk windowmaskerSdust \ + > fb.grcH38P14.rmsk.windowmaskerSdust.txt 2>&1 + cat fb.grcH38P14.rmsk.windowmaskerSdust.txt +#5549307 bases of 27093089 (20.482%) in intersection + + # Masking statistics + egrep "bases|Total|masked" faSize.grcH38P14.cleanWMSdust.txt \ + | fold -w 95 -s | sed -e 's/^/# /;' +# 27093089 bases (242788 N's 26850301 real 19806463 upper 7043838 lower) in 71 sequences in 1 +# files +# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 +# (chr5_MU273354v1_fix) median 301310 +# %26.00 masked total, %26.23 masked real + +############################################################################# +# cytoBandIdeo - (DONE - 2022-09-26 - Galt) + mkdir /hive/data/genomes/grcH38P14/bed/cytoBand + cd /hive/data/genomes/grcH38P14/bed/cytoBand + makeCytoBandIdeo.csh grcH38P14 + + +############################################################################# +# cpgIslands - (DONE - 2022-09-26 - Galt) + mkdir /hive/data/genomes/grcH38P14/bed/cpgIslands + cd /hive/data/genomes/grcH38P14/bed/cpgIslands + time ($HOME/kent/src/hg/utils/automation/doCpgIslands.pl -dbHost=hgwdev \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku grcH38P14) > do.log 2>&1 & +# *** All done ! Elapsed time: 1m10s +# *** All done ! Elapsed time: 0m19s + + cat fb.grcH38P14.cpgIslandExt.txt +#394646 bases of 26850301 (1.470%) in intersection + + +############################################################################## +# genscan - (DONE - 2022-09-26 - Galt) + mkdir /hive/data/genomes/grcH38P14/bed/genscan + cd /hive/data/genomes/grcH38P14/bed/genscan + time ($HOME/kent/src/hg/utils/automation/doGenscan.pl -buildDir=`pwd` \ + -workhorse=hgwdev -dbHost=hgwdev -bigClusterHub=ku grcH38P14) > do.log 2>&1 & +# *** All done ! Elapsed time: 2m44s + + cat fb.grcH38P14.genscan.txt +#1046433 bases of 26850301 (3.897%) in intersection + + cat fb.grcH38P14.genscanSubopt.txt +#844615 bases of 26850301 (3.146%) in intersection + + +############################################################################# +# augustus gene track (DONE - 2022-09-26 - Galt) + + mkdir /hive/data/genomes/grcH38P14/bed/augustus + cd /hive/data/genomes/grcH38P14/bed/augustus + time ($HOME/kent/src/hg/utils/automation/doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ + -species=human -dbHost=hgwdev -workhorse=hgwdev grcH38P14) > do.log 2>&1 & +# *** All done ! Elapsed time: 11m42s + + cat fb.grcH38P14.augustusGene.txt +#972943 bases of 26850301 (3.624%) in intersection + +############################################################################# + +# make the tracks in your browser using the temporary checkout: + +pushd /hive/data/genomes/grcH38P14/TemporaryTrackDbCheckout/kent/src/hg/src/makeDb/trackDb/ +./loadTracks trackDb_\${USER} hgFindSpec_\${USER} grcH38P14 +popd + +# check out your browser +https://hgwdev-${USER}.gi.ucsc.edu/cgi-bin/hgTracks?db=grcH38P14 + +############################################################################## +# Download files (DONE - 2022-09-26 - Galt) + cd /hive/data/genomes/grcH38P14 + time ($HOME/kent/src/hg/utils/automation/makeDownloads.pl -noChromFiles \ + -workhorse=hgwdev grcH38P14) > downloads.log 2>&1 & +# *** All done! +#real 0m24.688s + + +############################################################################## +# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2022-10-14 - Galt) + mkdir /hive/data/genomes/grcH38P14/bed/linSpecRep + cd /hive/data/genomes/grcH38P14/bed/linSpecRep + # create individual .out files from the master record in ../repeatMasker + # cluster job is perhaps overkill for just the patches, but it works + mkdir splitOut + cat <<'_EOF_' > split.csh +#!/bin/csh -fe +set C = $1 +head -3 ../repeatMasker/grcH38P14.sorted.fa.out > splitOut/${C}.out +grep "${C} " ../repeatMasker/grcH38P14.sorted.fa.out >> splitOut/${C}.out +_EOF_ + chmod +x split.csh + + cat << '_EOF_' > template +#LOOP +split.csh $(root1) {check out line+ splitOut/$(root1).out} +#ENDLOOP +_EOF_ + # small ones first: + cut -f1 ../../chrom.sizes | tac > chrom.list + gensub2 chrom.list single template jobList + para make jobList + para time +#Completed: 71 of 71 jobs +#CPU time in finished jobs: 0s 0.01m 0.00h 0.00d 0.000 y +#IO & Wait Time: 188s 3.13m 0.05h 0.00d 0.000 y +#Average job time: 3s 0.04m 0.00h 0.00d +#Longest finished job: 5s 0.08m 0.00h 0.00d +#Submission to last job: 7s 0.12m 0.00h 0.00d +#Estimated complete: 0s 0.00m 0.00h 0.00d + + + # now, we can date and process each of those .out files + # constructing the humanSpecific set of repeats + # this means repeats found in human, and not in others + # using mouse here for 'others' is good enough, a variety + # of other species could be used (rat dog cow) where they all + # produce the same result + mkdir dateRepeats + cd dateRepeats + cat << '_EOF_' > mkLSR +#!/bin/bash +set -beEu -o pipefail +seq=$1 +rm -f $seq.out_mus-musculus +ln -sf ../splitOut/$seq.out . +/scratch/data/RepeatMasker/DateRepeats $seq.out -query human -comp 'mus musculus' +rm $seq.out +mkdir -p ../humanSpecific +/cluster/bin/scripts/extractRepeats 1 $seq.out_mus-musculus \ + > ../humanSpecific/$seq.out.spec +_EOF_ + chmod +x mkLSR + + cat << '_EOF_' > template +#LOOP +./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec} +#ENDLOOP +_EOF_ + +# actually, I had to tweak the mkLSR script above to use the previous version of DateRepeats +# since RM RM4.1.2 has a bug with new development to use hdf5 file format and libraries. +# Since that bug should be fixed by the time the next person needs to DateRepeats again, +# I did not include the change here. + + gensub2 ../chrom.list single template jobList + para make jobList + + para time + +#Completed: 71 of 71 jobs +#CPU time in finished jobs: 294s 4.90m 0.08h 0.00d 0.000 y +#IO & Wait Time: 199s 3.32m 0.06h 0.00d 0.000 y +#Average job time: 7s 0.12m 0.00h 0.00d +#Longest finished job: 9s 0.15m 0.00h 0.00d +#Submission to last job: 14s 0.23m 0.00h 0.00d + + # We also need the nibs for blastz runs with lineage specific repeats + mkdir /hive/data/genomes/grcH38P14/bed/nibs + cd /hive/data/genomes/grcH38P14/bed/nibs + cut -f1 ../../chrom.sizes | while read C +do + twoBitToFa -seq=${C} ../../grcH38P14.2bit stdout \ + | faToNib -softMask stdin ${C}.nib + echo "${C} done" +done + + # verify nothing lost + cat ../../chrom.sizes \ + | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \ + | sh | faSize stdin +#27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1 files +#Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix.nib:0-34400) max 2101585 (chr5_MU273354v1_fix.nib:0-2101585) median 301310 +#N count: mean 3419.5 sd 28467.7 +#U count: mean 190290.6 sd 170338.0 +#L count: mean 187882.7 sd 164027.6 +#%49.24 masked total, %49.68 masked real + + + mkdir -p /hive/data/staging/data/grcH38P14/nib + rsync -a --progress ./ /hive/data/staging/data/grcH38P14/nib + rsync -a --progress /hive/data/genomes/grcH38P14/{grcH38P14.2bit,chrom.sizes} \ + /hive/data/staging/data/grcH38P14/ + + +##############################################################################