2124c0c0e1b0fd852f6078625627742252405af6 hiram Mon Jun 29 10:08:47 2020 -0700 started build then became obsolete refs #23111 diff --git src/hg/makeDb/doc/perManBai1/initialBuild.sh src/hg/makeDb/doc/perManBai1/initialBuild.sh new file mode 100644 index 0000000..3fe1523 --- /dev/null +++ src/hg/makeDb/doc/perManBai1/initialBuild.sh @@ -0,0 +1,941 @@ +# for emacs: -*- mode: sh; -*- + +# This file describes browser build for perManBai1 + +# GCF_000500345.1_Pman_1.0_assembly_report.txt + +# Assembly name: Pman_1.0 +# Organism name: Peromyscus maniculatus bairdii (prairie deer mouse) +# Taxid: 230844 +# BioSample: SAMN00848614 +# BioProject: PRJNA53563 +# Submitter: Baylor College of Medicine +# Date: 2013-12-3 +# Assembly type: haploid +# Release type: major +# Assembly level: Scaffold +# Genome representation: full +# WGS project: AYHN01 +# Assembly method: Newbler v. 2.3 and 2.5; AllPaths v. 41070; ATLAS-gapfill v. 2.2; ATLAS-link v. 1.0 +# Genome coverage: 110.0x +# Sequencing technology: FLX 454; Illumina HiSeq +# RefSeq category: Representative Genome +# GenBank assembly accession: GCA_000500345.1 +# RefSeq assembly accession: GCF_000500345.1 +# RefSeq assembly and GenBank assemblies identical: yes +# +## Assembly-Units: +## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name +## GCA_000500355.1 GCF_000500355.1 Primary Assembly + +############################################################################# +# obtain photograph (DONE - 2017-02-16 - Hiram) + mkdir -p /hive/data/genomes/perManBai1/photo + cd /hive/data/genomes/perManBai1/photo + + wget -O photoFile.jpg 'https://upload.wikimedia.org/wikipedia/commons/1/18/Deer_Mouse_Portal_%28Peromyscus_maniculatus%29_%2815722414288%29.jpg' + + convert -quality 80 -sharpen 0 -normalize -geometry 400x400 \ + photoFile.jpg Peromyscus_maniculatus_bairdii.jpg + + printf "photoCreditURL https://www.flickr.com/people/22170893@N06?rb=1 +photoCreditName Gregory "Slobirdr" Smith (flickr)\n" > ../photoReference.txt + + cat ../photoReference.txt +photoCreditURL https://www.flickr.com/people/22170893@N06?rb=1 +photoCreditName Gregory "Slobirdr" Smith (flickr) + +######################################################################### +# Initial steps (DONE - 2019-03-11 - Hiram) + +# To start this initialBuild.txt document, from a previous assembly document: + +mkdir ~/kent/src/hg/makeDb/doc/perManBai1 +cd ~/kent/src/hg/makeDb/doc/perManBai1 + +# best to use a most recent document since it has the latest features and +# procedures: + +sed -e 's/ambMex1/perManBai1/g; s/AmbMex1/PerManBai1/g; s/DONE/TBD/g;' ../ambMex1/initialBuild.txt > initialBuild.txt + +############################################################################# +# fetch sequence from new style download directory (DONE - 2019-03-11 - Hiram) + mkdir -p /hive/data/genomes/perManBai1/refseq + cd /hive/data/genomes/perManBai1/refseq + + time rsync -L -a -P --stats \ +rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Peromyscus_maniculatus/all_assembly_versions/GCF_000500345.1_Pman_1.0/ ./ + + # real 0m51.781s + + # measure what we have here, this one is hefty: + faSize GCF_000500345.1_Pman_1.0_genomic.fna.gz + +# 2630541020 bases (157004803 N's 2473536217 real 1695858008 upper +# 777678209 lower) in 30921 sequences in 1 files +# Total size: mean 85073.0 sd 633290.8 min 201 (NW_006521020.1) +# max 18898765 (NW_006501040.1) median 2048 +# %29.56 masked total, %31.44 masked real + +############################################################################# +# fixup to UCSC naming scheme (DONE - 2019-03-12 - Hiram) + mkdir /hive/data/genomes/perManBai1/ucsc + cd /hive/data/genomes/perManBai1/ucsc + + # verify no duplicate sequences: note the use of the -long argument + # on this gigantic amount of sequence + time faToTwoBit ../refseq/*0_genomic.fna.gz refseq.2bit + # real 0m47.359s + + time twoBitDup refseq.2bit + # real 0m10.911s + # should be silent output, otherwise the duplicates need to be removed + + # since this is an unplaced contig assembly, verify all names are .1: + twoBitInfo refseq.2bit stdout | awk '{print $1}' \ + | sed -e 's/[0-9]\+//;' | sort | uniq -c + # 30921 NW_.1 + + # in this case, all the .1's can be changed to: v1 + time zcat ../refseq/G*0_genomic.fna.gz \ + | sed -e 's/.1 Peromyscus.*/v1/;' | gzip -c \ + > ucsc.perManBai1.fa.gz +# -rw-rw-r-- 1 810656363 Mar 12 15:11 ucsc.perManBai1.fa.gz + + zgrep -v "^#" ../refseq/GCF_000500345.1_Pman_1.0_assembly_structure/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ + | sed -e "s/.1\t/v1\t/;" > ucsc.perManBai1.agp + + # Just discovered there is a brand new RefSeq chrM sequence + # available. Will use this accession in the config.ra file: + mitoAcc="NC_039921.1" + + # verify fasta and AGP match: + time faToTwoBit ucsc.perManBai1.fa.gz test.2bit + # real 0m53.987s + + # verify still silent: + time twoBitDup test.2bit + # real 0m10.005s + + # and check AGP vs. fasta correspondence: + time checkAgpAndFa *.agp test.2bit 2>&1 | tail + # All AGP and FASTA entries agree - both files are valid + # real 0m4.913s + + # verify nothing lost compared to genbank: + time twoBitToFa test.2bit stdout | faSize stdin +# 2630541020 bases (157004803 N's 2473536217 real 1695858008 upper +# 777678209 lower) in 30921 sequences in 1 files +# Total size: mean 85073.0 sd 633290.8 min 201 (NW_006521020v1) +# max 18898765 (NW_006501040v1) median 2048 +# %29.56 masked total, %31.44 masked real + # real 0m30.309s + + # the original refseq count is the same: +# 2630541020 bases (157004803 N's 2473536217 real 1695858008 upper +# 777678209 lower) in 30921 sequences in 1 files +# 32393605577 bases (4026911109 N's 28366694468 real 28365740082 upper +# 954386 lower) in 125724 sequences in 1 files + + # no longer needed: + rm -f refseq.2bit test.2bit + +############################################################################# +# idKeys for genbank assembly + mkdir /hive/data/genomes/perManBai1/genbank/idKeys + cd /hive/data/genomes/perManBai1/genbank/idKeys + ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Peromyscus_maniculatus/all_assembly_versions/GCA_000500345.1_Pman_1.0/GCA_000500345.1_Pman_1.0_genomic.fna.gz . + + faToTwoBit GCA_000500345.1_Pman_1.0_genomic.fna.gz genbankPerManBai1.2bit + doIdKeys.pl -buildDir=`pwd` -twoBit=`pwd`/genbankPerManBai1.2bit \ + genbankPerManBai1 + +############################################################################# +# Initial database build (DONE - 2019-03-13 - Hiram) + + cd /hive/data/genomes/perManBai1 + + # establish the config.ra file: + # usually would use this + ~/kent/src/hg/utils/automation/prepConfig.pl perManBai1 mammal mouse \ + refseq/*_assembly_report.txt > perManBai1.config.ra + # going to need a mitoAcc ? + # add the mito acc: mitoAcc NC_039921.1 + + # verify this looks OK: + + cat perManBai1.config.ra +# config parameters for makeGenomeDb.pl: +db perManBai1 +clade mammal +scientificName Peromyscus maniculatus bairdii +commonName Prairie deer mouse +assemblyDate Dec. 2013 +assemblyLabel Baylor College of Medicine +assemblyShortLabel Pman_1.0 +orderKey 16755 +mitoAcc NC_039921.1 +fastaFiles /hive/data/genomes/perManBai1/ucsc/*.fa.gz +agpFiles /hive/data/genomes/perManBai1/ucsc/*.agp +# qualFiles none +dbDbSpeciesDir mouse +photoCreditURL https://www.flickr.com/people/22170893@N06?rb=1 +photoCreditName Gregory "Slobirdr" Smith (flickr) +ncbiGenomeId 11397 +ncbiAssemblyId 392808 +ncbiAssemblyName Pman_1.0 +ncbiBioProject 53563 +ncbiBioSample SAMN00848614 +genBankAccessionID GCF_000500345.1 +taxId 230844 + + time (~/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -stop=seq perManBai1.config.ra) > seq.log 2>&1 + # real 2m19.045s + + time (~/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -continue=agp -stop=agp perManBai1.config.ra) > agp.log 2>&1 + # real 0m9.644s + # verify it ran OK: + # *** All done! (through the 'agp' step) + time (~/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -continue=db -stop=db perManBai1.config.ra) > db.log 2>&1 + # real 13m40.220s + # real 0m21.249s + + time (~/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -continue=dbDb perManBai1.config.ra) > dbDb.log 2>&1 + # real 0m6.977s + + time (~/kent/src/hg/utils/automation/makeGenomeDb.pl \ + -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ + -continue=trackDb perManBai1.config.ra) > trackDb.log 2>&1 + # real 0m7.158s + + # check in the trackDb files created and add to trackDb/makefile + # temporary symlink until after masking + ln -s `pwd`/perManBai1.unmasked.2bit /gbdb/perManBai1/perManBai1.2bit + +############################################################################# +# cytoBandIdeo - (DONE - 2019-03-13 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/cytoBand + cd /hive/data/genomes/perManBai1/bed/cytoBand + time makeCytoBandIdeo.csh perManBai1 + # real 0m0.804s + +############################################################################## +# cpgIslands on UNMASKED sequence (DONE - 2019-03-13 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/cpgIslandsUnmasked + cd /hive/data/genomes/perManBai1/bed/cpgIslandsUnmasked + + time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ + -tableName=cpgIslandExtUnmasked \ + -maskedSeq=/hive/data/genomes/perManBai1/perManBai1.unmasked.2bit \ + -workhorse=hgwdev -smallClusterHub=ku perManBai1) > do.log 2>&1 + # real 11m18.290s + + cat fb.perManBai1.cpgIslandExtUnmasked.txt + # 14676098 bases of 2473552766 (0.593%) in intersection + +############################################################################# +# running repeat masker (DONE - 2019-03-13 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/repeatMasker + cd /hive/data/genomes/perManBai1/bed/repeatMasker + time (doRepeatMasker.pl -buildDir=`pwd` \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -smallClusterHub=ku perManBai1) > do.log 2>&1 & +XXX - running - Wed Mar 13 10:45:49 PDT 2019 + # curiously, ran very fast ? There are hardly any repeats in the templates + # used by RM +# Completed: 71471 of 71471 jobs +# CPU time in finished jobs: 7334652s 122244.20m 2037.40h 84.89d 0.233 y +# IO & Wait Time: 217376s 3622.93m 60.38h 2.52d 0.007 y +# Average job time: 106s 1.76m 0.03h 0.00d +# Longest finished job: 512s 8.53m 0.14h 0.01d +# Submission to last job: 11477s 191.28m 3.19h 0.13d + + cat faSize.rmsk.txt +# 2400839308 bases (53716773 N's 2347122535 real 1331243472 upper +# 1015879063 lower) in 7872 sequences in 1 files +# Total size: mean 304984.7 sd 3152710.5 min 1000 (NW_018732762v1) +# max 84771923 (NW_018734349v1) median 1998 +# %42.31 masked total, %43.28 masked real + + egrep -i "versi|relea" do.log + # RepeatMasker version open-4.0.5 + # January 31 2015 (open-4-0-5) version of RepeatMasker + # CC RELEASE 20140131; + + time featureBits -countGaps perManBai1 rmsk + # 1016100984 bases of 2400839308 (42.323%) in intersection + # real 0m44.536s + + # why is it different than the faSize above ? + # because rmsk masks out some N's as well as bases, the count above + # separates out the N's from the bases, it doesn't show lower case N's + + # faster way to get the same result on high contig count assemblies:: + time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' perManBai1 \ + | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" + # total 1016100984.000000 + # real 0m36.665s + +########################################################################## +# running simple repeat (DONE - 2019-03-13 - Hiram) + + mkdir /hive/data/genomes/perManBai1/bed/simpleRepeat + cd /hive/data/genomes/perManBai1/bed/simpleRepeat + time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ + -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ + -trf409=5 perManBai1) > do.log 2>&1 & +XXX - running - Wed Mar 13 10:46:18 PDT 2019 + # real 26m16.578s + + cat fb.simpleRepeat + # 1399128075 bases of 28366694468 (4.932%) in intersection + + # add to this simple repeat to windowMasker: + cd /hive/data/genomes/perManBai1 + /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask -add \ + bed/windowMasker/splitFa/perManBai1.cleanWMSdust.2bit \ + bed/simpleRepeat/trfMask.bed perManBai1.2bit + + twoBitToFa perManBai1.2bit stdout | faSize stdin > faSize.perManBai1.2bit.txt + + # real 9m59.223s + + cat faSize.perManBai1.2bit.txt + egrep "bases|Total|masked" faSize.perManBai1.2bit.txt \ + | fold -w 75 -s | sed -e 's/^/# /;' +# 32393605577 bases (4026911109 N's 28366694468 real 18209209746 upper +# 10157484722 lower) in 125724 sequences in 1 files +# Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832v1) max +# 21669615 (PGSH01077959v1) median 47471 +# %31.36 masked total, %35.81 masked real + + # reset symlink + rm /gbdb/perManBai1/perManBai1.2bit + ln -s `pwd`/perManBai1.2bit /gbdb/perManBai1/perManBai1.2bit + +############################################################################# +# CREATE MICROSAT TRACK (TBD - 2018-07-08 - Hiram) + ssh hgwdev + mkdir /cluster/data/perManBai1/bed/microsat + cd /cluster/data/perManBai1/bed/microsat + + awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ + ../simpleRepeat/simpleRepeat.bed > microsat.bed + + hgLoadBed perManBai1 microsat microsat.bed + # Read 56937 elements of size 4 from microsat.bed + +############################################################################# +# ucscToINSDC table/track (TBD - 2018-07-10 - Hiram) + # this is simple since there isn't any RefSeq assembly, and the + # names used here are different from INSDC only due to the .1 -> v1 switch + + cd /hive/data/genomes/perManBai1/bed/ucscToINSDC + + cut -f1 ../../chrom.sizes | awk '{printf "%s\t%s\n", $1, $1};' \ + | sed -e 's/v1/.1/;' | awk '{printf "%s\t%s\n", $2, $1}' \ + | sort > ucsc.insdc.txt + + join -t$'\t' <(sort -k1,1 ../../chrom.sizes) ucsc.insdc.txt \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ + | sort -k1,1 -k2,2n > ucscToINSDC.bed + + cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 + # 14 + # use the 14 in this sed + sed -e "s/21/14/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | hgLoadSqlTab perManBai1 ucscToINSDC stdin ucscToINSDC.bed + + # should cover %100 entirely: + time featureBits -countGaps perManBai1 ucscToINSDC + # 32393605577 bases of 32393605577 (100.000%) in intersection + # real 0m47.018s + +########################################################################## +# run up idKeys files for chromAlias (TBD - 2018-07-08 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/idKeys + cd /hive/data/genomes/perManBai1/bed/idKeys + + time (doIdKeys.pl -twoBit=/hive/data/genomes/perManBai1/perManBai1.unmasked.2bit -buildDir=`pwd` perManBai1) > do.log 2>&1 & + # real 60m51.086s + + cat perManBai1.keySignature.txt + # 5c21fa448b2fcbcd8249a2b5d84bf460 + +########################################################################## +# add chromAlias table (TBD - 2018-07-10 - Hiram) + + mkdir /hive/data/genomes/perManBai1/bed/chromAlias + cd /hive/data/genomes/perManBai1/bed/chromAlias + + hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' perManBai1 \ + > ucsc.genbank.tab + + ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ + > perManBai1.chromAlias.tab + +for t in genbank +do + c0=`cat ucsc.$t.tab | wc -l` + c1=`grep $t perManBai1.chromAlias.tab | wc -l` + ok="OK" + if [ "$c0" -ne "$c1" ]; then + ok="ERROR" + fi + printf "# checking $t: $c0 =? $c1 $ok\n" +done +# checking genbank: 125724 =? 125724 OK + + + hgLoadSqlTab perManBai1 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ + perManBai1.chromAlias.tab + +######################################################################### +# fixup search rule for assembly track/gold table (TBD - 2018-07-10 - Hiram) + + cd ~/kent/src/hg/makeDb/trackDb/axolotl/perManBai1 + # preview prefixes and suffixes: + hgsql -N -e "select frag from gold;" perManBai1 \ + | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c + # sample of output: +# 114752 PGSHv1 +# 10972 PGSHv1_1 +# 8870 PGSHv1_10 + # ... etc ... + +PGSH + # implies a search rule of: 'PGSH[0-9]+(v1_[0-9]+)?' + + # verify this rule will find them all or eliminate them all: + hgsql -N -e "select frag from gold;" perManBai1 | wc -l + # 892108 + + hgsql -N -e "select frag from gold;" perManBai1 \ + | egrep -e 'PGSH[0-9]+(v1_[0-9]+)?' | wc -l + # 892108 + + hgsql -N -e "select frag from gold;" perManBai1 \ + | egrep -v -e 'PGSH[0-9]+(v1_[0-9]+)?' | wc -l + # 0 + + # hence, add to trackDb/rhesus/perManBai1/trackDb.ra +searchTable gold +shortCircuit 1 +termRegex PGSH[0-9]+(v1_[0-9]+)? +query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' +searchPriority 8 + + git commit -m 'add gold table search rule refs #19859' trackDb.ra + + # verify searches work in the position box + +########################################################################## +## WINDOWMASKER (TBD - 2019-03-13 - Hiram) + + mkdir /hive/data/genomes/perManBai1/bed/windowMasker + cd /hive/data/genomes/perManBai1/bed/windowMasker + time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ + -dbHost=hgwdev perManBai1) > do.log 2>&1 + # real 2857m15.802s + + # broke down due to broken twoBitMask situation + # this procedure was carried out on a split setup of sequences + mkdir /hive/data/genomes/perManBai1/bed/windowMasker/splitFa + cd /hive/data/genomes/perManBai1/bed/windowMasker/splitFa + # generate lists of contigs up to 2Gb in size + ~/kent/src/hg/makeDb/doc/perManBai1/twoGbChunks.pl + # turning to fasta done in a small cluster job on hgwdev + printf '#!/bin/bash +# input is one of chunkN.list +export c=$1 +export chunk=`echo $c | sed -e 's/.list//;'` + +fa="$chunk/$chunk.fa.gz" +mkdir -p $chunk +# echo $c $fa +faSomeRecords perManBai1.fa $c stdout | gzip -c > ${fa} +' > mkChunkFa.sh + chmod +x mkChunkFa.sh + + printf '#LOOP +./mkChunkFa.sh $(path1) +#ENDLOOP +' > split.template + ls chunk*.list > run.list + gensub2 run.list single split.template jobList + para create jobList + para push + cat run.time +# Completed: 16 of 16 jobs +# CPU time in finished jobs: 11256s 187.59m 3.13h 0.13d 0.000 y +# IO & Wait Time: 0s 0.00m 0.00h 0.00d 0.000 y +# Average job time: 679s 11.31m 0.19h 0.01d +# Longest finished job: 790s 13.17m 0.22h 0.01d +# Submission to last job: 795s 13.25m 0.22h 0.01d + # then turn those fasta into 2bit files: + ls -d chunk[0-9] chunk1[0-9] | while read D +do + ls -ld ${D}/*.fa.gz + echo faToTwoBit ${D}/${D}.fa.gz ${D}/${D}.2bit + faToTwoBit ${D}/${D}.fa.gz ${D}/${D}.2bit & +done +wait + # Then, a second batch to run WM on each chunk + printf '#!/bin/bash + +set -beEu -o pipefail + +export chunk=$1 + +cd /hive/data/genomes/perManBai1/bed/windowMasker/splitFa/${chunk} +time (doWindowMasker.pl -buildDir=`pwd` -unmaskedSeq=`pwd`/${chunk}.2bit \ + -stop=twobit -workhorse=hgwdev perManBai1) > do.log 2>&1 +' > runWM.sh + chmod +x runWM.sh + + printf '#LOOP +runWM.sh $(path1) +#ENDLOOP +' > wm.template + + ls -d chunk[0-9] chunk1[0-9] > sm.list + gensub2 wm.list single wm.template wm.jobList + para create jobList + para push +# Completed: 16 of 16 jobs +# CPU time in finished jobs: 3s 0.04m 0.00h 0.00d 0.000 y +# IO & Wait Time: 173439s 2890.66m 48.18h 2.01d 0.005 y +# Average job time: 10840s 180.67m 3.01h 0.13d +# Longest finished job: 13596s 226.60m 3.78h 0.16d +# Submission to last job: 13601s 226.68m 3.78h 0.16d + + # all results together: + cat chunk*/*.sdust.bed > /dev/shm/perManBai1.sdust.bed + $HOME/bin/x86_64/gnusort -S100G --parallel=32 -k1,1 -k2,2n \ + /dev/shm/perManBai1.sdust.bed > perManBai1.windowmasker.sdust.bed + + # temporary fixed twoBitMask + /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask \ + /hive/data/genomes/perManBai1/perManBai1.unmasked.2bit \ + perManBai1.windowmasker.sdust.bed perManBai1.wmsk.sdust.2bit + twoBitToFa perManBai1.wmsk.sdust.2bit stdout | faSize stdin \ + > faSize.perManBai1.wmsk.sdust.txt 2>&1 + + # Masking statistics + cat faSize.perManBai1.cleanWMSdust.txt +# 32393605577 bases (4026911109 N's 28366694468 real 18214135230 +# upper 10152559238 lower) in 125724 sequences in 1 files +# Total size: mean 257656.5 sd 973486.9 min 1033 (PGSH01113832v1) +# max 21669615 (PGSH01077959v1) median 47471 +# %31.34 masked total, %35.79 masked real + + hgLoadBed perManBai1 windowmaskerSdust perManBai1.windowmasker.sdust.bed + featureBits -countGaps perManBai1 windowmaskerSdust \ + > fb.perManBai1.windowmaskerSdust.beforeClean.txt 2>&1 + featureBits perManBai1 -not gap -bed=notGap.bed + featureBits perManBai1 windowmaskerSdust notGap.bed -bed=stdout \ + | gzip -c > cleanWMask.bed.gz + hgLoadBed perManBai1 windowmaskerSdust cleanWMask.bed.gz + featureBits -countGaps perManBai1 windowmaskerSdust \ + > fb.perManBai1.windowmaskerSdust.clean.txt 2>&1 + + zcat cleanWMask.bed.gz \ + | /cluster/home/hiram/kent/src/hg/utils/twoBitMask/twoBitMask \ + /hive/data/genomes/perManBai1/perManBai1.unmasked.2bit stdin \ + -type=.bed perManBai1.cleanWMSdust.2bit + + twoBitToFa perManBai1.cleanWMSdust.2bit stdout \ + | faSize stdin > faSize.perManBai1.cleanWMSdust.txt 2>&1 + featureBits -countGaps perManBai1 rmsk windowmaskerSdust \ + > fb.perManBai1.rmsk.windowmaskerSdust.txt 2>&1 + + cat fb.perManBai1.windowmaskerSdust.clean.txt + # 10152559238 bases of 32393605577 (31.341%) in intersection + + # rmsk is so sparse, it doesn't even overlap any of this + cat fb.perManBai1.rmsk.windowmaskerSdust.txt + # 0 bases of 32393605577 (0.000%) in intersection + +########################################################################## +# cpgIslands - (TBD - 2018-07-08 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/cpgIslands + cd /hive/data/genomes/perManBai1/bed/cpgIslands + time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ + -workhorse=hgwdev -smallClusterHub=ku perManBai1) > do.log 2>&1 + # real 100m4.369s + + cat fb.perManBai1.cpgIslandExt.txt + # 352400331 bases of 28366694468 (1.242%) in intersection + +############################################################################## +# ncbiRefSeq gene track (TBD - 2018-03-15 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/ncbiRefSeq + cd /hive/data/genomes/perManBai1/bed/ncbiRefSeq + + time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl \ + -buildDir=`pwd` -bigClusterHub=ku \ + -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \ + refseq vertebrate_mammalian Neomonachus_schauinslandi \ + GCF_002201575.1_ASM220157v1 perManBai1) > do.log 2>&1 + # real 4m37.395s + + cat fb.ncbiRefSeq.perManBai1.txt + # 45781082 bases of 2400839308 (1.907%) in intersection + +############################################################################## +# genscan - (TBD - 2018-07-08 - Hiram) + mkdir /hive/data/genomes/perManBai1/bed/genscan + cd /hive/data/genomes/perManBai1/bed/genscan + time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ + -bigClusterHub=ku perManBai1) > do.log 2>&1 + # fifty-six were broken at window size of 2400000, + # they all worked at 2000000 + # real 162m46.438s + + # continuing: + time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ + -continue=makeBed -bigClusterHub=ku perManBai1) > makeBed.log 2>&1 + # real 75m28.038s + + cat fb.perManBai1.genscan.txt + # 896393757 bases of 28366694468 (3.160%) in intersection + + cat fb.perManBai1.genscanSubopt.txt + # 978384685 bases of 28366694468 (3.449%) in intersection + +############################################################################# +# augustus gene track (TBD - 2018-07-08 - Hiram) + + mkdir /hive/data/genomes/perManBai1/bed/augustus + cd /hive/data/genomes/perManBai1/bed/augustus + time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ + -species=zebrafish -dbHost=hgwdev \ + -workhorse=hgwdev perManBai1) > do.log 2>&1 + # real 443m58.621s + + # continuing after removing some broken items: + time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ + -species=zebrafish -dbHost=hgwdev \ + -continue=load -workhorse=hgwdev perManBai1) > load.log 2>&1 + + cat fb.perManBai1.augustusGene.txt + # 828245083 bases of 28366694468 (2.920%) in intersection + +############################################################################# +# Create kluster run files (TBD - 2018-07-10 - Hiram) + + # obtain 'real' numbers of this assembly: + cd /hive/data/genomes/perManBai1 + head -1 faSize.perManBai1.2bit.txt +# 32393605577 bases (4026911109 N's 28366694468 real 18209209746 upper +# 10157484722 lower) in 125724 sequences in 1 files + + # numerator is perManBai1 gapless bases "real" + + # denominator is hg19 gapless bases as reported by: + # featureBits -noRandom -noHap hg19 gap + # 234344806 bases of 2861349177 (8.190%) in intersection + # 1024 is threshold used for human -repMatch: + calc \( 28366694468 / 2861349177 \) \* 1024 + # ( 28366694468 / 2861349177 ) * 1024 = 10151.677876 + + # tried repMatch=10000, but it didn't make many + # ==> use -repMatch=10000 according to size scaled down from 1024 for human. + # and rounded down to nearest 100 + cd /hive/data/genomes/perManBai1 + time blat perManBai1.2bit \ + /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/perManBai1.11.ooc \ + -repMatch=10000 + # Wrote 6435 overused 11-mers to jkStuff/perManBai1.11.ooc + # real 9m44.626s + + # at repMatch=1000 it makes many more: + # Wrote 1885128 overused 11-mers to jkStuff/perManBai1.11.ooc + + # at repMatch=2000 + # real 9m18.200s + # Wrote 740741 overused 11-mers to jkStuff/perManBai1.11.ooc + + # at repMatch=4000 + # Wrote 125201 overused 11-mers to jkStuff/perManBai1.11.ooc + # real 8m38.816s + + # going to use 3000 since this makes about 10X more than in human + # at repMatch=3000 + # Wrote 286064 overused 11-mers to jkStuff/perManBai1.11.ooc + # real 9m7.973s + + # there are no non-bridged gaps since these are all fake gaps, + # However, there are near 100,000 gaps over 10,000 bases, + # going to use those gaps as 'non-bridged' gaps + # Size survey: + hgsql -N -e 'select size from gap;' perManBai1 | ave stdin +# Q1 1701.000000 +# median 3331.000000 +# Q3 6422.000000 +# average 5254.430036 +# min 1.000000 +# max 289789.000000 +# count 766384 +# total 4026911109.000000 + + # check non-bridged gaps to see what the typical size is: +# hgsql -N \ +# -e 'select * from gap where bridge="no" order by size;' perManBai1 \ +# | sort -k7,7nr + # minimum size is 10000, added allowBridged option to allow this + ~/kent/src/hg/utils/gapToLift/gapToLift -allowBridged -verbose=2 \ + -minGap=10000 perManBai1 \ + jkStuff/perManBai1.10Kgaps.lft -bedFile=jkStuff/perManBai1.10Kgaps.bed + +# gapToLift -verbose=2 -minGap=50000 perManBai1 \ +# jkStuff/perManBai1.nonBridged.lft -bedFile=jkStuff/perManBai1.nonBridged.bed + +######################################################################### +# lastz/chain/net swap from hg38 (TBD - 2018-08-09 - Hiram) + + # alignment to hg38: + cd /hive/data/genomes/hg38/bed/lastzPerManBai1.2018-07-09 + cat fb.hg38.chainPerManBai1Link.txt + # 54520910 bases of 3049335806 (1.788%) in intersection + + cat fb.hg38.chainSynPerManBai1Link.txt + # 3343407 bases of 3049335806 (0.110%) in intersection + + cat fb.hg38.chainRBest.PerManBai1.txt + # 37383183 bases of 3049335806 (1.226%) in intersection + + # and for the swap: + mkdir /hive/data/genomes/perManBai1/bed/blastz.hg38.swap + cd /hive/data/genomes/perManBai1/bed/blastz.hg38.swap + + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/hg38/bed/lastzPerManBai1.2018-07-09/DEF \ + -swap -chainMinScore=5000 -chainLinearGap=loose \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 & + # real 105m38.443s + + cat fb.perManBai1.chainHg38Link.txt + # 59846443 bases of 28366694468 (0.211%) in intersection + + cat fb.perManBai1.chainSynHg38Link.txt + # 3456707 bases of 28366694468 (0.012%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` perManBai1 hg38) > rbest.log 2>&1 & + # real 555m51.873s + + cat fb.perManBai1.chainRBest.Hg38.txt + # 38573370 bases of 28366694468 (0.136%) in intersection + +############################################################################# +# lastz/chain/net swap from mm10 (TBD - 2018-06-10 - Hiram) + + # alignment to mm10 + cd /hive/data/genomes/mm10/bed/lastzPerManBai1.2018-07-09 + cat fb.mm10.chainPerManBai1Link.txt + # 52143617 bases of 2652783500 (1.966%) in intersection + + cat fb.mm10.chainSynPerManBai1Link.txt + # 2686570 bases of 2652783500 (0.101%) in intersection + + cat fb.mm10.chainRBest.PerManBai1.txt + # 36938030 bases of 2652783500 (1.392%) in intersection + + # and for the swap: + mkdir /hive/data/genomes/perManBai1/bed/blastz.mm10.swap + cd /hive/data/genomes/perManBai1/bed/blastz.mm10.swap + + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/mm10/bed/lastzPerManBai1.2018-07-09/DEF \ + -swap -chainMinScore=5000 -chainLinearGap=loose \ + -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ + -syntenicNet) > swap.log 2>&1 & + # real 39m28.757s + + cat fb.perManBai1.chainMm10Link.txt + # 87124587 bases of 28366694468 (0.307%) in intersection + + cat fb.perManBai1.chainSynMm10Link.txt + # 2893381 bases of 28366694468 (0.010%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` perManBai1 mm10) > rbest.log 2>&1 & + # real 568m10.621s + + # something odd went haywire at the download step + time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` perManBai1 mm10) > download.log 2>&1 & + # real 3m16.404s + + cat fb.perManBai1.chainRBest.Mm10.txt + # 38584422 bases of 28366694468 (0.136%) in intersection + +############################################################################## +# GENBANK AUTO UPDATE (TBD - 2018-07-10 - Hiram) + ssh hgwdev + cd $HOME/kent/src/hg/makeDb/genbank + git pull + # /cluster/data/genbank/data/organism.lst shows: + # organism mrnaCnt estCnt refSeqCnt + # Ambystoma mexicanum 7705 43326 0 + + # edit etc/genbank.conf to add perManBai1 just before nanPar1 + +# perManBai1 (Axolotl - Ambystoma mexicanum) 125724 scaffolds N50 3052786 - 30Gb total +perManBai1.serverGenome = /hive/data/genomes/perManBai1/perManBai1.2bit +perManBai1.clusterGenome = /hive/data/genomes/perManBai1/perManBai1.2bit +perManBai1.ooc = /hive/data/genomes/perManBai1/jkStuff/perManBai1.11.ooc +perManBai1.lift = /hive/data/genomes/perManBai1/jkStuff/perManBai1.10Kgaps.lft +perManBai1.perChromTables = no +perManBai1.downloadDir = perManBai1 +perManBai1.refseq.mrna.xeno.pslCDnaFilter = ${lowCover.refseq.mrna.xeno.pslCDnaFilter} +perManBai1.refseq.mrna.native.pslCDnaFilter = ${lowCover.refseq.mrna.native.pslCDnaFilter} +perManBai1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter} +perManBai1.genbank.mrna.xeno.pslCDnaFilter = ${lowCover.genbank.mrna.xeno.pslCDnaFilter} +perManBai1.genbank.est.native.pslCDnaFilter = ${lowCover.genbank.est.native.pslCDnaFilter} +perManBai1.genbank.est.xeno.pslCDnaFilter = ${lowCover.genbank.est.xeno.pslCDnaFilter} +# defaults yes: genbank.mrna.native.load genbank.mrna.native.loadDesc +# yes: genbank.est.native.load refseq.mrna.native.load +# yes: refseq.mrna.native.loadDesc refseq.mrna.xeno.load +# yes: refseq.mrna.xeno.loadDesc +# defaults no: genbank.mrna.xeno.load genbank.mrna.xeno.loadDesc +# no: genbank.est.native.loadDesc genbank.est.xeno.load +# no: genbank.est.xeno.loadDesc +# DO NOT NEED genbank.mrna.xeno except for human, mouse +# perManBai1.upstreamGeneTbl = ensGene +# perManBai1.upstreamMaf = multiz6way /hive/data/genomes/perManBai1/bed/multiz6way/species.list + +# And edit src/lib/gbGenome.c to add new species +# static char *ambMexNames[] = {"Ambystoma mexicanum", NULL}; + + git commit -m 'adding perManBai1 Axolotl - Ambystoma mexicanum' src/lib/gbGenome.c etc/genbank.conf + git push + + # verify stated file paths do exist: + grep perManBai1 etc/genbank.conf | egrep "Genome|ooc|lift" \ + | awk '{print $NF}' | grep -v -w no | xargs ls -og +-rw-rw-r-- 1 9104343295 Jul 8 19:32 /hive/data/genomes/perManBai1/perManBai1.2bit +-rw-rw-r-- 1 11166819 Jul 10 10:45 /hive/data/genomes/perManBai1/jkStuff/perManBai1.10Kgaps.lft +-rw-rw-r-- 1 1144264 Jul 10 10:00 /hive/data/genomes/perManBai1/jkStuff/perManBai1.11.ooc + + # update /cluster/data/genbank/: + make etc-update + make install-server + + # add perManBai1 to: + # etc/align.dbs etc/hgwdev.dbs + git commit -m 'adding perManBai1/axolotl refs #21715' \ + etc/align.dbs etc/hgwdev.dbs + git push + # update /cluster/data/genbank/: + make etc-update + + # XXX a few days later the genbank tables will be in the database + +############################################################################## +# BLATSERVERS ENTRY (TBD - 2017-10-06 - Hiram) +# After getting a blat server assigned by the Blat Server Gods, + ssh hgwdev + + hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("perManBai1", "blat1c", "17880", "1", "0"); \ + INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ + VALUES ("perManBai1", "blat1c", "17881", "0", "1");' \ + hgcentraltest + # test it with some sequence + +############################################################################## +# set default position to the PAX3 gene (TBD - 2018-07-11 - Hiram) +# as found via UCSC 'Other refseq' track + + hgsql -e \ +'update dbDb set defaultPos="PGSH01084259v1:1812985-2038932" where name="perManBai1";' \ + hgcentraltest + +############################################################################## +# all.joiner update, downloads and in pushQ - (TBD - 2018-07-13 - Hiram) + cd $HOME/kent/src/hg/makeDb/schema + + ~/kent/src/hg/utils/automation/verifyBrowser.pl perManBai1 +# 40 tables in database perManBai1 - Axolotl, Ambystoma mexicanum +# verified 40 tables in database perManBai1, 0 extra tables, 12 optional tables +# chainNetRBestHg38 3 optional tables +# chainNetRBestMm10 3 optional tables +# chainNetSynHg38 3 optional tables +# chainNetSynMm10 3 optional tables +# ERROR: no genbank tables found +# verified 28 required tables, 1 missing tables +# 1 ucscToRefSeq - missing table +# hg38 chainNet to perManBai1 found 3 required tables +# mm10 chainNet to perManBai1 found 3 required tables +# hg38 chainNet RBest and syntenic to perManBai1 found 6 optional tables +# mm10 chainNet RBest and syntenic to perManBai1 found 3 optional tables +# ERROR: blat server not found in hgcentraltest.blatServers ? + + # fixup all.joiner until this is a clean output + joinerCheck -database=perManBai1 -tableCoverage all.joiner + joinerCheck -database=perManBai1 -times all.joiner + joinerCheck -database=perManBai1 -keys all.joiner + + # need to specify workhorse so it will not overload ku + # and this procedure fails on ku due to limited memory for the + # maskOutFa step + cd /hive/data/genomes/perManBai1 + time (makeDownloads.pl -workhorse=hgwdev perManBai1) > downloads.log 2>&1 + # real 117m17.756s + + # continue after broken 'compress' step manually completed: + time (makeDownloads.pl -continue=install -workhorse=hgwdev perManBai1) \ + > downloadsInstall.log 2>&1 + # real 0m4.786s + + # now ready for pushQ entry + mkdir /hive/data/genomes/perManBai1/pushQ + cd /hive/data/genomes/perManBai1/pushQ + time (makePushQSql.pl -redmineList perManBai1) \ + > perManBai1.pushQ.sql 2> stderr.out + # real 4m11.149s + + # remove the tandemDups and gapOverlap from the listings: + sed -i -e "/tandemDups/d" redmine.perManBai1.table.list + sed -i -e "/Tandem Dups/d" redmine.perManBai1.releaseLog.txt + sed -i -e "/gapOverlap/d" redmine.perManBai1.table.list + sed -i -e "/Gap Overlaps/d" redmine.perManBai1.releaseLog.txt + + # check for errors in stderr.out, some are OK, e.g.: + # writing redmine listings to + # redmine.perManBai1.file.list + # redmine.perManBai1.table.list + # redmine.perManBai1.releaseLog.txt + # WARNING: perManBai1 does not have seq + # WARNING: perManBai1 does not have extFile + + # verify the file listings are valid, should be no output to stderr: + cat redmine.perManBai1.file.list \ + | while read L; do ls -ogL $L; done > /dev/null + + # to verify the database.table list is correct, should be the same + # line count for these two commands: + wc -l redmine.perManBai1.table.list + # 41 redmine.perManBai1.table.list + awk -F'.' '{ +printf "hgsql -N -e \"show table status like '"'"'%s'"'"';\" %s\n", $2, $1 +}' redmine.perManBai1.table.list | while read L; do eval $L; done | wc -l + # 41 + +# add the path names to the listing files in the redmine issue + # in the three appropriate entry boxes: + ls `pwd`/redmine* + +/hive/data/genomes/perManBai1/pushQ/redmine.perManBai1.file.list +/hive/data/genomes/perManBai1/pushQ/redmine.perManBai1.releaseLog.txt +/hive/data/genomes/perManBai1/pushQ/redmine.perManBai1.table.list + +#########################################################################