e55f16028cef4f090371dafece16faf736596822 hiram Mon Apr 26 14:53:09 2021 -0700 fixup correct ports for blat server refs #26658 diff --git src/hg/makeDb/doc/staAur2/initialBuild.txt src/hg/makeDb/doc/staAur2/initialBuild.txt index e160777..a3d65ce 100644 --- src/hg/makeDb/doc/staAur2/initialBuild.txt +++ src/hg/makeDb/doc/staAur2/initialBuild.txt @@ -1,730 +1,730 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the staAur2 ######################################################################### # reuse photo from staAur1 browser (DONE - 2018-12-13 - Hiram) mkdir -p /hive/data/genomes/staAur2 cd /hive/data/genomes/staAur2 cp -p ../staAur1/photoReference.txt . cat photoReference.txt photoCreditURL https://phil.cdc.gov/phil/details.asp?pid=11157 photoCreditName Centers for Disease Control and Prevention's Public Health Image Library ######################################################################### # Initial steps (DONE - 2018-12-13 - Hiram) # This initialBuild.txt document was started from hpv1 # version of initialBuild.txt sed -e 's/Aur1/Aur2/g; s/DONE/TBD/g;' \ ../staAur1/initialBuild.txt > initialBuild.txt mkdir /hive/data/genomes/staAur2/refseq cd /hive/data/genomes/staAur2/refseq # accession name: GCF_000013465.1_ASM1346v1 # ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/013/465/GCF_000013465.1_ASM1346v1 export accession="GCF_000013465.1" export asmId="ASM1346v1" export level0="GCF" export level1="000" export level2="013" export level3="465" time rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/all/$level0/$level1/$level2/$level3/${accession}_${asmId}/ ./ # sent 335 bytes received 5,919,626 bytes 3,946,640.67 bytes/sec # total size is 5,916,881 speedup is 1.00 # real 0m2.911s # check assembly size for later reference: faSize GCF*v1_genomic.fna.gz # 2917469 bases (0 N's 2917469 real 2917469 upper 0 lower) in 4 sequences in 1 files # this information is from the top of # staAur2/refseq/GCF_000013465.1_ASM1346v1_assembly_report.txt # 
Assembly name: ASM1346v1 # Organism name: Staphylococcus aureus subsp. aureus USA300_FPR3757 (firmicutes) # Infraspecific name: strain=USA300_FPR3757 # Taxid: 451515 # BioSample: SAMN02604150 # BioProject: PRJNA16313 # Submitter: University of California, San Francisco # Date: 2006-2-10 # Assembly type: n/a # Release type: major # Assembly level: Complete Genome # Genome representation: full # GenBank assembly accession: GCA_000013465.1 # RefSeq assembly accession: GCF_000013465.1 # RefSeq assembly and GenBank assemblies identical: yes # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000013475.1 GCF_000013475.1 Primary Assembly ############################################################################# # establish config.ra file (TBD - Hiram - 2017-08-04) # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt> cd /hive/data/genomes/staAur2 $HOME/kent/src/hg/utils/automation/prepConfig.pl staAur2 bacteria \ bacteria ./refseq/*_assembly_report.txt > staAur2.config.ra # going to need a mitoAcc ? # this clade 'bacteria' does not yet exist in hgcentraltest.clade hgsql hgcentraltest -e 'INSERT INTO clade (name, label, priority) VALUES ("bacteria", "Bacteria", 1500)' # fixup the genomeCladePriority to 1500 # fixup scientificName from: # scientificName Staphylococcus aureus subsp. aureus USA300_FPR3757 # to: # scientificName Staphylococcus aureus # fixup commonName from commonName Firmicutes to: Staph # set mitoAcc none # reset order key 6452 to: 19764 # to see order keys to verify this one is correct: # hgsql -e 'select name,organism,orderKey from dbDb order by orderKey;' \ # hgcentraltest | less # saiBol1 Squirrel monkey 19725 # staAur2 Staph 19764 # staAur1 Staph 19765 # conCri1 Star-nosed mole 19805 # verify it looks sane cat staAur2.config.ra db staAur2 clade bacteria genomeCladePriority 1500 scientificName Staphylococcus aureus commonName Staph assemblyDate Feb. 
2006 assemblyLabel University of California, San Francisco assemblyShortLabel ASM1346v1 orderKey 19764 mitoAcc none fastaFiles /hive/data/genomes/staAur2/ucsc/*.fa.gz agpFiles /hive/data/genomes/staAur2/ucsc/*.agp # qualFiles none dbDbSpeciesDir bacteria photoCreditURL https://phil.cdc.gov/phil/details.asp?pid=11157 photoCreditName Centers for Disease Control and Prevention's Public Health Image Library ncbiGenomeId 154 ncbiAssemblyId 34568 ncbiAssemblyName ASM1346v1 ncbiBioProject 16313 ncbiBioSample SAMN02604150 genBankAccessionID GCF_000013465.1 taxId 451515 ############################################################################# # setup UCSC named files (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/ucsc cd /hive/data/genomes/staAur2/ucsc faCount ../refseq/GCF_000013465.1_ASM1346v1_genomic.fna.gz # #seq len A C G T N cpg # NC_007793.1 2872769 960377 470674 470186 971532 0 72505 # NC_007790.1 3125 1088 379 519 1139 0 60 # NC_007791.1 4439 1651 541 791 1456 0 78 # NC_007792.1 37136 14962 4767 5879 11528 0 594 # total 2917469 978078 476361 477375 985655 0 73237 # one simple sequence: zcat ../refseq/GCF_000013465.1_ASM1346v1_genomic.fna.gz \ | sed -e 's/^>NC_007790.1.*/>NC_007790v1/; s/^>NC_007791.1.*/>NC_007791v1/; s/^>NC_007792.1.*/>NC_007792v1/; s/^>NC_007793.1.*/>NC_007793v1/;' \ | faSplit sequence stdin 4 chr gzip chr*.fa zcat *.fa.gz | hgFakeAgp -singleContigs stdin stdout \ | sed -e 's/D/F/;' > staAur2.agp # verify OK: zcat *.fa.gz | checkAgpAndFa staAur2.agp stdin | tail -2 # Valid Fasta file entry # All AGP and FASTA entries agree - both files are valid ############################################################################# # Initial database build (DONE - 2018-12-13 - Hiram) cd /hive/data/genomes/staAur2 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp staAur2.config.ra) > agp.log 2>&1 # real 0m9.508s # then finish it off: time (makeGenomeDb.pl 
-workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db staAur2.config.ra) > db.log 2>&1 # real 0m18.173s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add staAur2 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/staAur2 ln -s `pwd`/staAur2.unmasked.2bit /gbdb/staAur2/staAur2.2bit ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/bed/cpgIslandsUnmasked cd /hive/data/genomes/staAur2/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/staAur2/staAur2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku staAur2) > do.log 2>&1 XXX - running - Thu Dec 13 14:47:15 PST 2018 # real 0m37.441s cat fb.staAur2.cpgIslandExtUnmasked.txt # 17793 bases of 2821361 (0.631%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/bed/cytoBand cd /hive/data/genomes/staAur2/bed/cytoBand makeCytoBandIdeo.csh staAur2 ######################################################################### # ucscToINSDC and ucscToRefSeq table/track (DONE - 2018-12-13 - Hiram) # really simple situation here, only four names to deal with # and they are specified in the assembly_report.txt file mkdir /hive/data/genomes/staAur2/bed/ucscToINSDC cd /hive/data/genomes/staAur2/bed/ucscToINSDC grep -v "^#" ../../refseq/GCF_000013465.1_ASM1346v1_assembly_report.txt \ | cut -f7,9 | awk '{printf "%s\t0\t%d\t%s\n", $1, $2,$1}' \ | sed -e 's/\./v/;' > ucscToRefSeq.bed grep -v "^#" ../../refseq/GCF_000013465.1_ASM1346v1_assembly_report.txt \ | cut -f5,7,9 | awk '{printf "%s\t0\t%d\t%s\n", $2,$3,$1}' \ | sed -e 's/\./v/;' > ucscToINSDC.bed export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print 
length($0)}' | sort -n | tail -1` echo $chrSize # 11 # use the chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab staAur2 ucscToINSDC stdin ucscToINSDC.bed # should be the same for ucscToRefSeq: export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 11 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab staAur2 ucscToRefSeq stdin ucscToRefSeq.bed # checkTableCoords should be silent checkTableCoords staAur2 # each should cover %100 entirely: featureBits -countGaps staAur2 ucscToINSDC # 2917469 bases of 2917469 (100.000%) in intersection featureBits -countGaps staAur2 ucscToRefSeq # 2917469 bases of 2917469 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/bed/chromAlias cd /hive/data/genomes/staAur2/bed/chromAlias hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' staAur2 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' staAur2 \ > ucsc.genbank.tab grep -v "^#" ../../refseq/GCF_000013465.1_ASM1346v1_assembly_report.txt \ | cut -f1,3,7 | awk '{printf "%s\t%s\tucsc\n", $3,$2}' \ | sed -e 's/\./v/;' | sed -e 's/na/chr1/;' > ucsc.ucsc.tab awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \ ucsc.ucsc.tab | sort > staAur2.chromAlias.tab cat staAur2.chromAlias.tab | sed -e 's/^/# /;' # CP000255.1 NC_007793v1 genbank # CP000256.1 NC_007790v1 genbank # CP000257.1 NC_007791v1 genbank # CP000258.1 NC_007792v1 genbank # NC_007790.1 NC_007790v1 refseq # NC_007791.1 NC_007791v1 refseq # NC_007792.1 NC_007792v1 refseq # NC_007793.1 NC_007793v1 refseq # chr1 NC_007793v1 ucsc # pUSA01 NC_007790v1 ucsc # pUSA02 NC_007791v1 ucsc # pUSA03 NC_007792v1 ucsc hgLoadSqlTab staAur2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ 
staAur2.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2018-12-13 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/bacteria/staAur2 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" staAur2 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # 1 NC_v1 # implies a rule: 'NC_[0-9]+(v[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" staAur2 | wc -l # 4 hgsql -N -e "select frag from gold;" staAur2 \ | egrep -e 'NC_[0-9]+(v[0-9]+)?' | wc -l # 4 hgsql -N -e "select frag from gold;" staAur2 \ | egrep -v -e 'NC_[0-9]+(v[0-9]+)?' | wc -l # 0 # hence, add to trackDb/bacteria/staAur2/trackDb.ra searchTable gold shortCircuit 1 termRegex NC_[0-9]+(v[0-9]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (TBD - 2017-08-04 - Hiram) # RepeatMasker doesn't know about this virus sequence name as is, # scientificName Staphylococcus aureus # found this in taxonomy.dat table in RM: # 'low g+c gram-positive bacteria' name, taxId 1239 # from: /hive/data/staging/data/RepeatMasker140131/Libraries/taxonomy.dat mkdir /hive/data/genomes/staAur2/bed/repeatMasker cd /hive/data/genomes/staAur2/bed/repeatMasker # fixing the script to work with new location of RepeatMasker # no longer on /scratch/data/ and picked up a new version. 
time (~/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ -species "low g+c gram-positive bacteria" -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku staAur2) > do.log 2>&1 & # Completed: 7 of 7 jobs # CPU time in finished jobs: 400s 6.67m 0.11h 0.00d 0.000 y # IO & Wait Time: 15s 0.25m 0.00h 0.00d 0.000 y # Average job time: 59s 0.99m 0.02h 0.00d # Longest finished job: 67s 1.12m 0.02h 0.00d # Submission to last job: 74s 1.23m 0.02h 0.00d # real 1m34.367s time (~/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ -species "low g+c gram-positive bacteria" -bigClusterHub=ku \ -continue=cat -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku staAur2) > cat.log 2>&1 & # real 0m14.490s # fails with an empty nestedRepeats file during load # help the doLoad step finish: ln -s `pwd`/staAur2.rmsk.2bit /hive/data/genomes/staAur2/staAur2.rmsk.2bit # continue with cleanUp: time (~/kent/src/hg/utils/automation/doRepeatMasker.pl -buildDir=`pwd` \ -species "low g+c gram-positive bacteria" -bigClusterHub=ku \ -continue=cleanup -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku staAur2) > cleanup.log 2>&1 & # real 0m4.545s XXX - need to straighten this out egrep -i "versi|relea" do.log RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; * cat faSize.rmsk.txt # 2917469 bases (0 N's 2917469 real 2903502 upper 13967 lower) in 4 sequences in 1 files # Total size: mean 729367.2 sd 1429021.1 min 3125 (NC_007790v1) max 2872769 (NC_007793v1) median 37136 # %0.48 masked total, %0.48 masked real time featureBits -countGaps staAur2 rmsk # 13967 bases of 2917469 (0.479%) in intersection # real 0m0.204s ########################################################################## # running simple repeat (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/bed/simpleRepeat cd 
/hive/data/genomes/staAur2/bed/simpleRepeat # using trf409 1 here (human == 6) time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 1 staAur2) > do.log 2>&1 & # real 0m23.322s cat fb.simpleRepeat # 29041 bases of 2917469 (0.995%) in intersection ######################################################################### # CREATE MICROSAT TRACK (DONE - 2018-12-13 - Hiram) # XXX - this makes an empty microsat.bed file, nothing to load ssh hgwdev mkdir /cluster/data/staAur2/bed/microsat cd /cluster/data/staAur2/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed staAur2 microsat microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2018-12-13 - Hiram) mkdir /hive/data/genomes/staAur2/bed/windowMasker cd /hive/data/genomes/staAur2/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev staAur2) > do.log 2>&1 # real 0m28.122s cat faSize.staAur2.cleanWMSdust.txt # 2917469 bases (0 N's 2917469 real 2450039 upper 467430 lower) # in 4 sequences in 1 files # Total size: mean 729367.2 sd 1429021.1 min 3125 (NC_007790v1) # max 2872769 (NC_007793v1) median 37136 # %16.02 masked total, %16.02 masked real cat fb.staAur2.rmsk.windowmaskerSdust.txt # 8293 bases of 2821361 (0.294%) in intersection ########################################################################## # masking 2bit file (TBD - 2017-08-04 - Hiram) cd /hive/data/genomes/staAur2 twoBitMask staAur2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed staAur2.rmsk.trf.2bit twoBitMask staAur2.rmsk.trf.2bit -type=.bed \ -add bed/windowMasker/cleanWMask.bed.gz staAur2.2bit twoBitToFa staAur2.2bit stdout | faSize stdin > faSize.staAur2.2bit.txt cat faSize.staAur2.2bit.txt # 2821361 bases (1 N's 2821360 real 2359001 upper 462359 lower) in 
1 sequences in 1 files # %16.39 masked total, %16.39 masked real # reset the symlink rm /gbdb/staAur2/staAur2.2bit ln -s `pwd`/staAur2.2bit /gbdb/staAur2/staAur2.2bit ########################################################################## # run up idKeys files for ncbiRefSeq (TBD - 2017-08-04 - Hiram) mkdir /hive/data/genomes/staAur2/bed/idKeys cd /hive/data/genomes/staAur2/bed/idKeys time (doIdKeys.pl -buildDir=`pwd` staAur2) > do.log 2>&1 & # real 0m17.167s cat staAur2.keySignature.txt # 3d70b1f5bdeec2114c63b7ce2017ea96 ########################################################################## # cpgIslands - (TBD - 2017-08-04 - Hiram) mkdir /hive/data/genomes/staAur2/bed/cpgIslands cd /hive/data/genomes/staAur2/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku staAur2) > do.log 2>&1 & # real 0m36.836s cat fb.staAur2.cpgIslandExt.txt # 16205 bases of 2821361 (0.574%) in intersection ######################################################################### # crispr 10K shoulders (DONE - 2018-10-16 - Hiram) # working on this script, adding the indexFa step: mkdir /hive/data/genomes/staAur2/bed/crispr10K cd /hive/data/genomes/staAur2/bed/crispr10K time (~/kent/src/hg/utils/automation/doCrispr.pl \ -stop=indexFa -buildDir=`pwd` -smallClusterHub=ku staAur2 ncbiGene) \ > indexFa.log 2>&1 # real 0m6.541s time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=ranges -stop=specScores -buildDir=`pwd` -smallClusterHub=ku \ staAur2 ncbiGene) > specScores.log 2>&1 # real 2m50.758s # adding the /dev/shm/ setup rsync for the indexed Fa # performed manually to work out the procedure time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=specScores -stop=specScores -buildDir=`pwd` \ -smallClusterHub=ku staAur2 ncbiGene) > specScores.log # Completed: 2048 of 2048 jobs # CPU time in finished jobs: 10750s 179.17m 2.99h 0.12d 0.000 y # IO & Wait Time: 2462s 41.03m 0.68h 0.03d 0.000 y # Average job time: 6s 
0.11m 0.00h 0.00d # Longest finished job: 13s 0.22m 0.00h 0.00d # Submission to last job: 122s 2.03m 0.03h 0.00d time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=effScores -stop=load \ -buildDir=`pwd` -smallClusterHub=ku staAur2 ncbiGene) \ > load.log 2>&1 XXX - running - Thu Dec 13 16:01:30 PST 2018 # real 10m6.692s ######################################################################### ############################################################################## # genscan - (TBD - 2017-08-04 - Hiram) mkdir /hive/data/genomes/staAur2/bed/genscan cd /hive/data/genomes/staAur2/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku staAur2) > do.log 2>&1 & # real 2m7.070s cat fb.staAur2.genscan.txt # 395771 bases of 2821361 (14.028%) in intersection cat fb.staAur2.genscanSubopt.txt # 104700 bases of 2821361 (3.711%) in intersection ############################################################################# # augustus gene track (TBD - 2017-04-13 - Hiram) # XXX augustus can not do bacteria sequence ? 
mkdir /hive/data/genomes/staAur2/bed/augustus cd /hive/data/genomes/staAur2/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -workhorse=hgwdev staAur2) > do.log 2>&1 & # real 72m23.671s cat fb.staAur2.augustusGene.txt # 29811614 bases of 2318132242 (1.286%) in intersection ############################################################################## # Create kluster run files (TBD - 2017-08-04 - Hiram) # numerator is staAur2 gapless bases "real" as reported by: featureBits -noRandom -noHap staAur2 gap # 0 bases of 2821361 (0.000%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2821361 / 2861349177 \) \* 1024 # ( 2821361 / 2861349177 ) * 1024 = 1.009689 # repMatch=1 produces 410921 overused 11-mers # repMatch=2 produces 73051 overused 11-mers # repMatch=3 produces 18919 overused 11-mers # repMatch=4 produces 6121 overused 11-mers # repMatch=5 produces 2319 overused 11-mers # repMatch=6 produces 923 overused 11-mers # repMatch=7 produces 379 overused 11-mers # repMatch=8 produces 168 overused 11-mers # repMatch=9 produces 80 overused 11-mers # repMatch=10 produces 43 overused 11-mers # ... 
# repMatch=20 produces 0 overused 11-mers # ==> use -repMatch=20, do not need to mask anything cd /hive/data/genomes/staAur2 blat staAur2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/staAur2.11.ooc \ -repMatch=20 # Wrote 0 overused 11-mers to jkStuff/staAur2.11.ooc # check non-bridged gaps to see what the typical size is: # there are no gaps in this assembly: hgsql -N -e 'select bridge from gap;' staAur2 | sort | uniq -c # no output, nothing to see here ######################################################################### # GENBANK AUTO UPDATE (TBD - 2017-08-04 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # organism mrnaCnt estCnt refSeqCnt # Staphylococcus 2 0 0 # Staphylococcus aureus 185 0 0 # Staphylococcus aureus subsp. aureus NCTC 8325 32 0 0 # Staphylococcus aureus subsp. aureus RN4220 1 0 0 # Staphylococcus epidermidis 46 1255 0 # Staphylococcus lentus 1 0 0 # Staphylococcus lugdunensis 9 0 0 # Staphylococcus phage phi-42 1 0 0 # Staphylococcus pseudintermedius 9 0 0 # Staphylococcus sp. B2_30 1 0 0 # Staphylococcus sp. B2_43 1 0 0 # Staphylococcus sp. SH24 1 0 0 # Staphylococcus warneri 1 0 0 # add these four to src/lib/gbGenome.c for staAurNames[] # Staphylococcus 2 0 0 # Staphylococcus aureus 185 0 0 # Staphylococcus aureus subsp. aureus NCTC 8325 32 0 0 # Staphylococcus aureus subsp. 
aureus RN4220 1 0 0 # edit etc/genbank.conf to add staAur2 just after aplCal1 and before hbv1 # staAur2 (Staphylococcus aureus - Taxid: 93061) staAur2.serverGenome = /hive/data/genomes/staAur2/staAur2.2bit staAur2.clusterGenome = /hive/data/genomes/staAur2/staAur2.2bit staAur2.ooc = /hive/data/genomes/staAur2/jkStuff/staAur2.11.ooc staAur2.lift = no staAur2.downloadDir = staAur2 staAur2.perChromTables = no staAur2.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} staAur2.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} staAur2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} staAur2.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} staAur2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} # DO NOT NEED genbank.mrna.xeno except for human, mouse # defaults yes: genbank.mrna.native.load, genbank.mrna.native.loadDesc, # genbank.est.native.load, refseq.mrna.native.load, refseq.mrna.native.loadDesc, # refseq.mrna.xeno.load , refseq.mrna.xeno.loadDesc # staAur2.upstreamGeneTbl = ensGene # staAur2.upstreamMaf = multiz9way /hive/data/genomes/staAur2/bed/multiz9way/species.list git commit -m 'adding staAur2 Staphylococcus aureus- refs #19937' etc/genbank.conf src/lib/gbGenome.c git push make install-server make etc-update cd /cluster/data/genbank time ./bin/gbAlignStep -initial staAur2 # logFile: var/build/logs/2017.08.04-14:20:13.staAur2.initalign.log # real 296m21.433s tail -2 var/build/logs/2017.08.04-14:20:13.staAur2.initalign.log hgwdev 2017.08.04-19:16:28 staAur2.initalign: Succeeded: staAur2 hgwdev 2017.08.04-19:16:35 staAur2.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.staAur2 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad staAur2 # logFile: var/dbload/hgwdev/logs/2017.08.05-00:32:05.staAur2.dbload.log # real 19m45.284s # 
enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add staAur2 to: # etc/align.dbs etc/hgwdev.dbs git commit -m 'adding staAur2 to the update alignments refs #19337' etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################# # ncbiGene (TBD - 2017-08-05 - Hiram) mkdir /hive/data/genomes/staAur2/bed/ncbiGene cd /hive/data/genomes/staAur2/bed/ncbiGene | sed -e 's/NC_007790.1/NC_007790v1/g; s/NC_007791.1/NC_007791v1/g; s/NC_007792.1/NC_007792v1/g; s/NC_007793.1/NC_007793v1/g;' \ # switching the names from column 1 to 12 and 12 to 1 with the awk: gff3ToGenePred -useName -attrsOut=staAur2.attrs.tab -geneNameAttr=gene \ ../../refseq/GCF_000013465.1_ASM1346v1_genomic.gff.gz \ stdout | sed -e 's/NC_007790.1/NC_007790v1/g; s/NC_007791.1/NC_007791v1/g; s/NC_007792.1/NC_007792v1/g; s/NC_007793.1/NC_007793v1/g;' \ | awk -F'\t' '{print $12,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$1,$13,$14,$15}' | tr '[ ]' '[\t]' > staAur2.ncbiGene.gp # rpmH NC_007795v1 - 2821009 2821147 2821009 2821147 1 2821009, # 2821147, 0 YP_501500.1 cmpl cmpl 0, # rnpA NC_007795v1 - 2820535 2820889 2820535 2820889 1 2820535, # 2820889, 0 YP_501499.1 cmpl cmpl 0, genePredCheck -db=staAur2 staAur2.ncbiGene.gp 2>&1 | sed -e 's/^/ # /;' # checked: 3060 failed: 0 hgLoadGenePred -genePredExt staAur2 ncbiGene staAur2.ncbiGene.gp genePredCheck -db=staAur2 ncbiGene 2>&1 | sed -e 's/^/ # /;' # checked: 3060 failed: 0 XXX - need to get gene descriptions out of the genbank record ######################################################################### # BLATSERVERS ENTRY (TBD - 2017-08-07 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES ("staAur2", "blat1a", "17876", "1", "0"); \ + VALUES ("staAur2", "blat1a", "17896", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES 
("staAur2", "blat1a", "17877", "0", "1");' \ + VALUES ("staAur2", "blat1a", "17897", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## set default position to SRC gene sequence from human protein blat ## (TBD - 2017-04-19 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="NW_003613641v1:1277445-1295702" where name="staAur2";' hgcentraltest ######################################################################### # all.joiner update, downloads and in pushQ - (TBD - 2017-04-25 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=staAur2 -tableCoverage all.joiner joinerCheck -database=staAur2 -times all.joiner joinerCheck -database=staAur2 -keys all.joiner cd /hive/data/genomes/staAur2 # needed a symlink for RM output: cd NC_007795v1 ln -s ../bed/repeatMasker/staAur2.sorted.fa.out NC_007795v1.fa.out cd .. time (makeDownloads.pl -workhorse=hgwdev staAur2) > downloads.log 2>&1 # real 0m12.888s # now ready for pushQ entry mkdir /hive/data/genomes/staAur2/pushQ cd /hive/data/genomes/staAur2/pushQ time (makePushQSql.pl -redmineList staAur2) \ > staAur2.pushQ.sql 2> stderr.out # real 3m37.776s # check for errors in stderr.out, some are OK, e.g.: # writing redmine listings to # redmine.staAur2.file.list # redmine.staAur2.table.list # redmine.staAur2.releaseLog.txt # WARNING: staAur2 does not have augustusGene # WARNING: staAur2 does not have microsat # WARNING: staAur2 does not have nestedRepeats # WARNING: staAur2 does not have seq # WARNING: staAur2 does not have extFile # WARNING: staAur2 does not have estOrientInfo # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # ncbiGene # enter the path names to the redmine listings in the redmine issue # refs 19937 
#########################################################################