src/hg/makeDb/doc/hg38/grcH38P14.txt fa35e265cbbfb28dc28b754cdfd4e79c05b42970

fa35e265cbbfb28dc28b754cdfd4e79c05b42970
galt
  Fri Oct 14 19:49:24 2022 -0700
Finished genome build of database that has ONLY new fix and patch chromosomes. The first part of the hg38 patch14 build.

diff --git src/hg/makeDb/doc/hg38/grcH38P14.txt src/hg/makeDb/doc/hg38/grcH38P14.txt
new file mode 100644
index 0000000..c744bad
--- /dev/null
+++ src/hg/makeDb/doc/hg38/grcH38P14.txt
@@ -0,0 +1,445 @@
+# for emacs: -*- mode: sh; -*-
+
+##############################################################################
+# GRCh38 patch 14 build: build basic tracks on separate database for new
+# sequences only (relative to hg38).
+##############################################################################
+
+# run this first, then it continues the patch in patchUpdate.14.txt
+
+##############################################################################
+# download or rather ln -s the patch release files (DONE - 2022-09-13 - Galt)
+
+    # Note: newer assemblies use refseq releases instead of genbank, but hg38 uses genbank
+    # so continue with that when building patches.
+    mkdir -p /hive/data/genomes/grcH38P14/genbank
+    cd /hive/data/genomes/grcH38P14/genbank
+
+    # Releases have already been downloaded to /hive/data/outside/ncbi/genomes/.
+    ln -s /hive/data/outside/ncbi/genomes/GCA/000/001/405/GCA_000001405.29_GRCh38.p14/* .
+
+
+##############################################################################
+# Set up fasta and agp with UCSC names (DONE - 2022-09-23 - Galt)
+
+    mkdir /hive/data/genomes/grcH38P14/ucsc
+    cd /hive/data/genomes/grcH38P14/ucsc
+
+    # identify sequences not in existing genome db
+    faCount ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz \
+      > faCount.GRCh38.p14.txt
+    ~/kent/src/hg/makeDb/doc/hg38/scanAssemblyReport.pl \
+      /hive/data/genomes/hg38/chrom.sizes \
+      faCount.GRCh38.p14.txt ../genbank/GCA_000001405.29_GRCh38.p14_assembly_report.txt \
+    | grep -w new > new.sequences.list
+    wc -l new.sequences.list
+#71 new.sequences.list
+
+    # Extract UCSC-named FASTA for the new sequences
+    cut -f3 new.sequences.list > extract.new.list
+    awk '{printf "s/%s/%s/; ", $3,$1}' new.sequences.list > genbankToUCSC.sed
+    faSomeRecords ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz extract.new.list stdout \
+    | sed -e 's/ .*//;' \
+    | sed -f genbankToUCSC.sed \
+    | gzip -c > grcH38P14.fa.gz
+    faSize grcH38P14.fa.gz
+#27093089 bases (242788 N's 26850301 real 17087494 upper 9762807 lower) in 71 sequences in 1 files
+#Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585 (chr5_MU273354v1_fix) median 301310
+
+    # Compare faSize results for whole GCA_000001405.29_GRCh38.p14_genomic.fna.gz
+    # vs. concatenation of hg38 fasta and grcH38P14.fa.gz:
+    faSize ../genbank/GCA_000001405.29_GRCh38.p14_genomic.fna.gz
+#3298912062 bases (161611482 N's 3137300580 real 1945815910 upper 1191484670 lower) in 709 sequences in 1 files
+#Total size: mean 4652908.4 sd 25469887.9 min 970 (KI270394.1) max 248956422 (CM000663.2) median 171027
+
+
+    twoBitToFa /hive/data/genomes/hg38/hg38.2bit stdout \
+    | faSize grcH38P14.fa.gz stdin
+#3299210039 bases (161611482 N's 3137598557 real 1506729106 upper 1630869451 lower) in 711 sequences in 2 files
+#Total size: mean 4640239.2 sd 25435109.8 min 970 (chrUn_KI270394v1) max 248956422 (chr1) median 171027
+
+NOTE that 709 != 711.  711 - 709 == 2, so our counts are off by 2. 
+
+The reason that two fixes went missing is because they got a new subversion, and should apparently replace
+the original version.  Two of the fix patches has changed from version1 to version2, 
+and they have removed obsolete version1 from the genbank release.  
+
+There are some notes in /hive/data/genomes/grcH38P14/ucsc/galt.NOTES
+
+# in p13 and older releases
+chr11_KQ759759v1_fix
+chr22_KQ759762v1_fix
+
+So, these just became updated as v2 in the new assembly?
+HG1311_HG2539_PATCH     fix-patch       22      Chromosome      KQ759762.2      <>      na      PATCHES 101040  na
+HG107_HG2565_PATCH      fix-patch       11      Chromosome      KQ759759.2      <>      na      PATCHES 204999  na
+
+and in the new one we have:
+chr11_KQ759759v2_fix
+chr22_KQ759762v2_fix
+
+[galt@hgwdev ucsc]$ egrep '(KQ759759|KQ759762)' new.sequences.list
+chr11_KQ759759v2_fix    204999  KQ759759.2      new
+chr22_KQ759762v2_fix    101040  KQ759762.2      new
+
+Since the removed sequences are in the main hg38.2bit, but not in the new patch14 sequences,
+we can just continue the GRCh38 patch 14 build which is not affected. 
+We might delete those 2 with old versions in patchUpdate.14.txt which is run after this build doc.
+
+    # Good, everything in GCA_000001405.29_GRCh38.p14_genomic.fna.gz is accounted for
+    # between hg38.2bit and grcH38P14.fa.gz
+
+    # Make UCSC-named AGP:
+    zcat ../genbank/GCA_000001405.29_GRCh38.p14_assembly_structure/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz \
+    | grep -Fwf extract.new.list \
+    | sed -f genbankToUCSC.sed > grcH38P14.agp
+
+    # construct 2bit file:
+    cd /hive/data/genomes/grcH38P14
+    faToTwoBit ucsc/grcH38P14.fa.gz grcH38P14.unmasked.2bit
+    twoBitInfo grcH38P14.unmasked.2bit stdout | sort -k2nr > chrom.sizes
+    # take a look at chrom.sizes to verify it looks OK.
+
+    # Make sure AGP and FASTA/2bit agree:
+    checkAgpAndFa ucsc/grcH38P14.agp grcH38P14.unmasked.2bit | tail -1
+#All AGP and FASTA entries agree - both files are valid
+
+
+##############################################################################
+# establish config.ra file (DONE - Galt - 2021-09-23)
+    # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt>
+    cd /hive/data/genomes/grcH38P14
+    # Must make photoReference.txt first -- copy from hg38
+    cp /hive/data/genomes/hg38/photoReference.txt .
+    $HOME/kent/src/hg/utils/automation/prepConfig.pl grcH38P14 haplotypes \
+        GRCh38.p14 genbank/*_assembly_report.txt > grcH38P14.config.ra
+    # Edit grcH38P14.config.ra to avoid confusion with actual hg38
+assemblyDate Feb. 2019 p14
+orderKey 2001
+
+# NOTE FROM GALT, a way to record it in the makedoc with comments so we know what values were used?
+    sed -e 's/^/#/' grcH38P14.config.ra
+## config parameters for makeGenomeDb.pl:
+#db grcH38P14
+#clade haplotypes
+#scientificName Homo sapiens
+#commonName Human
+#assemblyDate Feb. 2019 p14
+#assemblyLabel Genome Reference Consortium
+#assemblyShortLabel GRCh38.p14
+#orderKey 2001
+## mitochondrial sequence included in refseq release
+## mitoAcc J01415.2
+#mitoAcc none
+#fastaFiles /hive/data/genomes/grcH38P14/ucsc/*.fa.gz
+#agpFiles /hive/data/genomes/grcH38P14/ucsc/*.agp
+## qualFiles none
+#dbDbSpeciesDir GRCh38.p14
+#photoCreditURL http://www.cbse.ucsc.edu/
+#photoCreditName        Graphic courtesy of CBSE
+#ncbiGenomeId 51
+#ncbiAssemblyId 11968211
+#ncbiAssemblyName GRCh38.p14
+#ncbiBioProject 31257
+#ncbiBioSample notFound
+#genBankAccessionID GCA_000001405.29
+#taxId 9606
+
+
+##############################################################################
+#  Initial database build (DONE - 2022-09-23 - Galt)
+
+    cd /hive/data/genomes/grcH38P14
+    # AGP and unmasked.2bit are already built and checked, so start at the db step:
+    mkdir jkStuff
+    $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl grcH38P14.config.ra -debug
+#HgStepManager: executing from step 'seq' through step 'trackDb'.
+#HgStepManager: executing step 'seq' Fri Sep 23 16:26:50 2022.
+#seq: looks like this was run successfully already (/cluster/data/grcH38P14/chrom.sizes exists).  Either run with -continue agp or some later step, or move aside/remove /cluster/data/grcH38P14/chrom.sizes and run again.
+
+
+    # Make chromInfo.tab.
+    mkdir -p bed/chromInfo
+    awk '{print $1 "\t" $2 "\t/gbdb/grcH38P14/grcH38P14.2bit";}' chrom.sizes \
+      > bed/chromInfo/chromInfo.tab
+    # Make a link to the .agp file where makeGenomeDb.pl expects to find it.
+    ln -s ucsc/grcH38P14.agp .
+
+    # Skip 'seq' and 'agp' steps because those were done above.
+    time ($HOME/kent/src/hg/utils/automation/makeGenomeDb.pl \
+          -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \
+          -continue=db grcH38P14.config.ra) > db.log 2>&1 &
+    tail -f db.log
+#real    0m22.790s
+
+    # actually, I had to add uniprot.ra to git archive step list in $HOME/kent/src/hg/utils/automation/makeGenomeDb.pl
+    # then I restarted it at the dbDb step.
+
+    # Ignore all the "NOTES -- STUFF THAT YOU WILL HAVE TO DO --" stuff because this is
+    # going to be folded into hg38.
+    # Now the gold, gap and gc5BaseBw tracks are built.
+
+#############################################################################
+# RepeatMasker (DONE - 2022-09-23 - Galt)
+    mkdir /hive/data/genomes/grcH38P14/bed/repeatMasker
+    cd /hive/data/genomes/grcH38P14/bed/repeatMasker
+    time  ($HOME/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \
+        -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \
+        -smallClusterHub=ku grcH38P14) > do.log 2>&1 &
+    tail -f do.log
+#real    49m1.583s
+
+    egrep "bases|Total|masked" faSize.rmsk.txt \
+    | fold -w 95 -s | sed -e 's/^/# /;'
+# 27093089 bases (242788 N's 26850301 real 13541432 upper 13308869 lower) in 71 sequences in 1
+# files
+# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585
+# (chr5_MU273354v1_fix) median 301310
+# %49.12 masked total, %49.57 masked real
+
+
+    egrep -i "versi|relea" do.log | sed -e 's/^/# /;'
+# VERSION:\n
+# /cluster/software/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.
+# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
+# RepeatMasker version 4.1.2-p1
+# grep RELEASE /hive/data/staging/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
+# CC    Artefacts RELEASE 20190301;                                 *
+# CC    Dfam RELEASE Dfam_3.1;           
+
+    featureBits -countGaps grcH38P14 rmsk
+#13308822 bases of 27093089 (49.123%) in intersection
+
+
+##########################################################################
+# running simple repeat (DONE - 2022-09-23 - Galt)
+
+    mkdir /hive/data/genomes/grcH38P14/bed/simpleRepeat
+    cd /hive/data/genomes/grcH38P14/bed/simpleRepeat
+    # using trf409 6 here like hg38
+    time ($HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl -buildDir=`pwd` \
+        -dbHost=hgwdev -workhorse=hgwdev -bigClusterHub=ku -smallClusterHub=ku \
+        -trf409 6 grcH38P14) > do.log 2>&1 &
+#real    1m37.386s
+
+    cat fb.simpleRepeat
+#942219 bases of 26850301 (3.509%) in intersection
+
+    # adding this trfMask to .rmsk.2bit
+    cd /hive/data/genomes/grcH38P14
+    twoBitMask grcH38P14.rmsk.2bit \
+        -add bed/simpleRepeat/trfMask.bed grcH38P14.2bit
+    #   you can safely ignore the warning about fields >= 13
+
+    twoBitToFa grcH38P14.2bit stdout | faSize stdin > faSize.grcH38P14.2bit.txt
+    egrep "bases|Total|masked" faSize.grcH38P14.2bit.txt \
+    | fold -w 95 -s  | sed -e 's/^/# /;'
+# 27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1
+# files
+# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585
+# (chr5_MU273354v1_fix) median 301310
+# %49.24 masked total, %49.68 masked real
+
+
+    # reset the symlink
+    ln -sf `pwd`/grcH38P14.2bit /gbdb/grcH38P14/grcH38P14.2bit
+
+
+##########################################################################
+## WINDOWMASKER (DONE - 2022-09-26 - Galt)
+
+    mkdir /hive/data/genomes/grcH38P14/bed/windowMasker
+    cd /hive/data/genomes/grcH38P14/bed/windowMasker
+    time ($HOME/kent/src/hg/utils/automation/doWindowMasker.pl -buildDir=`pwd` \
+         -workhorse=hgwdev -dbHost=hgwdev grcH38P14) > do.log 2>&1
+# *** All done ! - Elapsed time: 0m47s
+
+    featureBits -countGaps grcH38P14 rmsk windowmaskerSdust \
+      > fb.grcH38P14.rmsk.windowmaskerSdust.txt 2>&1
+    cat fb.grcH38P14.rmsk.windowmaskerSdust.txt 
+#5549307 bases of 27093089 (20.482%) in intersection
+
+    # Masking statistics
+    egrep "bases|Total|masked" faSize.grcH38P14.cleanWMSdust.txt \
+    | fold -w 95 -s  | sed -e 's/^/# /;'
+# 27093089 bases (242788 N's 26850301 real 19806463 upper 7043838 lower) in 71 sequences in 1
+# files
+# Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix) max 2101585
+# (chr5_MU273354v1_fix) median 301310
+# %26.00 masked total, %26.23 masked real
+
+#############################################################################
+# cytoBandIdeo - (DONE - 2022-09-26 - Galt)
+    mkdir /hive/data/genomes/grcH38P14/bed/cytoBand
+    cd /hive/data/genomes/grcH38P14/bed/cytoBand
+    makeCytoBandIdeo.csh grcH38P14
+
+
+#############################################################################
+# cpgIslands - (DONE - 2022-09-26 - Galt)
+    mkdir /hive/data/genomes/grcH38P14/bed/cpgIslands
+    cd /hive/data/genomes/grcH38P14/bed/cpgIslands
+    time ($HOME/kent/src/hg/utils/automation/doCpgIslands.pl -dbHost=hgwdev \
+      -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku grcH38P14) > do.log 2>&1 &
+# *** All done !  Elapsed time: 1m10s
+# *** All done !  Elapsed time: 0m19s
+
+    cat fb.grcH38P14.cpgIslandExt.txt
+#394646 bases of 26850301 (1.470%) in intersection
+
+
+##############################################################################
+# genscan - (DONE - 2022-09-26 - Galt)
+    mkdir /hive/data/genomes/grcH38P14/bed/genscan
+    cd /hive/data/genomes/grcH38P14/bed/genscan
+    time ($HOME/kent/src/hg/utils/automation/doGenscan.pl -buildDir=`pwd` \
+       -workhorse=hgwdev -dbHost=hgwdev -bigClusterHub=ku grcH38P14) > do.log 2>&1 &
+# *** All done !  Elapsed time: 2m44s
+
+    cat fb.grcH38P14.genscan.txt
+#1046433 bases of 26850301 (3.897%) in intersection
+
+    cat fb.grcH38P14.genscanSubopt.txt
+#844615 bases of 26850301 (3.146%) in intersection
+
+
+#############################################################################
+# augustus gene track (DONE - 2022-09-26 - Galt)
+
+    mkdir /hive/data/genomes/grcH38P14/bed/augustus
+    cd /hive/data/genomes/grcH38P14/bed/augustus
+    time ($HOME/kent/src/hg/utils/automation/doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \
+     -species=human -dbHost=hgwdev -workhorse=hgwdev grcH38P14) > do.log 2>&1 &
+# *** All done !  Elapsed time: 11m42s
+
+    cat fb.grcH38P14.augustusGene.txt
+#972943 bases of 26850301 (3.624%) in intersection
+
+#############################################################################
+
+# make the tracks in your browser using the temporary checkout:
+
+pushd /hive/data/genomes/grcH38P14/TemporaryTrackDbCheckout/kent/src/hg/src/makeDb/trackDb/
+./loadTracks trackDb_\${USER} hgFindSpec_\${USER} grcH38P14
+popd
+
+# check out your browser
+https://hgwdev-${USER}.gi.ucsc.edu/cgi-bin/hgTracks?db=grcH38P14
+
+##############################################################################
+# Download files (DONE - 2022-09-26 - Galt)
+    cd /hive/data/genomes/grcH38P14
+    time ($HOME/kent/src/hg/utils/automation/makeDownloads.pl -noChromFiles \
+      -workhorse=hgwdev grcH38P14) > downloads.log 2>&1 &
+# *** All done!
+#real    0m24.688s
+
+
+##############################################################################
+# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR LASTZ (DONE - 2022-10-14 - Galt)
+    mkdir /hive/data/genomes/grcH38P14/bed/linSpecRep
+    cd /hive/data/genomes/grcH38P14/bed/linSpecRep
+    #	create individual .out files from the master record in ../repeatMasker
+    # cluster job is perhaps overkill for just the patches, but it works
+    mkdir splitOut
+    cat <<'_EOF_' > split.csh
+#!/bin/csh -fe
+set C = $1
+head -3 ../repeatMasker/grcH38P14.sorted.fa.out > splitOut/${C}.out
+grep "${C} " ../repeatMasker/grcH38P14.sorted.fa.out >> splitOut/${C}.out
+_EOF_
+    chmod +x split.csh
+
+    cat << '_EOF_' > template
+#LOOP
+split.csh $(root1) {check out line+ splitOut/$(root1).out}
+#ENDLOOP
+_EOF_
+    # small ones first:
+    cut -f1 ../../chrom.sizes | tac > chrom.list
+    gensub2 chrom.list single template jobList
+    para make jobList
+    para time
+#Completed: 71 of 71 jobs
+#CPU time in finished jobs:          0s       0.01m     0.00h    0.00d  0.000 y
+#IO & Wait Time:                   188s       3.13m     0.05h    0.00d  0.000 y
+#Average job time:                   3s       0.04m     0.00h    0.00d
+#Longest finished job:               5s       0.08m     0.00h    0.00d
+#Submission to last job:             7s       0.12m     0.00h    0.00d
+#Estimated complete:                 0s       0.00m     0.00h    0.00d
+
+
+    #	now, we can date and process each of those .out files
+    #	constructing the humanSpecific set of repeats
+    #   this means repeats found in human, and not in others
+    #   using mouse here for 'others' is good enough, a variety
+    #   of other species could be used (rat dog cow) where they all
+    #   produce the same result
+    mkdir dateRepeats
+    cd dateRepeats
+    cat << '_EOF_' > mkLSR
+#!/bin/bash
+set -beEu -o pipefail
+seq=$1
+rm -f $seq.out_mus-musculus
+ln -sf ../splitOut/$seq.out .
+/scratch/data/RepeatMasker/DateRepeats $seq.out -query human -comp 'mus musculus'
+rm $seq.out
+mkdir -p ../humanSpecific
+/cluster/bin/scripts/extractRepeats 1 $seq.out_mus-musculus \
+  > ../humanSpecific/$seq.out.spec
+_EOF_
+    chmod +x mkLSR
+
+    cat << '_EOF_' > template
+#LOOP
+./mkLSR $(path1) {check out line+ ../humanSpecific/$(path1).out.spec}
+#ENDLOOP
+_EOF_
+
+# actually, I had to tweak the mkLSR script above to use the previous version of DateRepeats
+# since RM RM4.1.2 has a bug with new development to use hdf5 file format and libraries.
+# Since that bug should be fixed by the time the next person needs to DateRepeats again,
+# I did not include the change here.
+
+    gensub2 ../chrom.list single template jobList
+    para make jobList
+
+    para time
+
+#Completed: 71 of 71 jobs
+#CPU time in finished jobs:        294s       4.90m     0.08h    0.00d  0.000 y
+#IO & Wait Time:                   199s       3.32m     0.06h    0.00d  0.000 y
+#Average job time:                   7s       0.12m     0.00h    0.00d
+#Longest finished job:               9s       0.15m     0.00h    0.00d
+#Submission to last job:            14s       0.23m     0.00h    0.00d
+
+    # We also need the nibs for blastz runs with lineage specific repeats
+    mkdir /hive/data/genomes/grcH38P14/bed/nibs
+    cd /hive/data/genomes/grcH38P14/bed/nibs
+    cut -f1 ../../chrom.sizes | while read C
+do
+    twoBitToFa -seq=${C} ../../grcH38P14.2bit stdout \
+    | faToNib -softMask stdin ${C}.nib
+    echo "${C} done"
+done
+
+    # verify nothing lost
+    cat ../../chrom.sizes \
+     | awk '{printf "nibFrag -masked %s.nib 0 %d + stdout\n", $1, $2}' \
+        | sh | faSize stdin
+#27093089 bases (242788 N's 26850301 real 13510632 upper 13339669 lower) in 71 sequences in 1 files
+#Total size: mean 381592.8 sd 343498.2 min 34400 (chr5_MU273352v1_fix.nib:0-34400) max 2101585 (chr5_MU273354v1_fix.nib:0-2101585) median 301310
+#N count: mean 3419.5 sd 28467.7
+#U count: mean 190290.6 sd 170338.0
+#L count: mean 187882.7 sd 164027.6
+#%49.24 masked total, %49.68 masked real
+
+
+    mkdir -p /hive/data/staging/data/grcH38P14/nib
+    rsync -a --progress ./ /hive/data/staging/data/grcH38P14/nib
+    rsync -a --progress /hive/data/genomes/grcH38P14/{grcH38P14.2bit,chrom.sizes} \
+      /hive/data/staging/data/grcH38P14/
+
+
+##############################################################################