21448602a1ff6c887e857f85ba4ee23da3c9ebb9 hiram Tue Oct 8 15:03:39 2019 -0700 catch up with a missing download file papAnu4.mm10.rbest.axt.gz no redmine diff --git src/hg/makeDb/doc/papAnu4/initialBuild.txt src/hg/makeDb/doc/papAnu4/initialBuild.txt index 2123516..2343eef 100644 --- src/hg/makeDb/doc/papAnu4/initialBuild.txt +++ src/hg/makeDb/doc/papAnu4/initialBuild.txt @@ -1,870 +1,882 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the papAnu4 ######################################################################### # reuse photograph obtained for papAnu3 previous versions # (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4 cd /hive/data/genomes/papAnu4 cp -p ../papAnu3/photoReference.txt . cat photoReference.txt photoCreditURL http://www.oumedicine.com/pathology/general-program-info/faculty-staff/roman-f-wolf-dvm photoCreditName Roman Wolf, University of Oklahoma Health Sciences Center ######################################################################### # Initial steps (DONE - 2018-01-03 - Hiram) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/papAnu4 cd ~/kent/src/hg/makeDb/doc/papAnu4 # best to use a most recent document since it has the latest features and # procedures: sed -e 's/dipOrd2/papAnu4/g; s/DipOrd2/PapAnu4/g; s/DONE/TBD/g;' \ ../dipOrd2/initialBuild.txt > initialBuild.txt mkdir /hive/data/genomes/papAnu4/refseq cd /hive/data/genomes/papAnu4/refseq time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Papio_anubis/all_assembly_versions/GCF_000264685.3_Panu_3.0/ ./ # sent 2638 bytes received 4293834067 bytes 17706543.11 bytes/sec # total size is 4293300261 speedup is 1.00 # real 4m2.654s # check assembly size for later reference: faSize G*0_genomic.fna.gz # 2959373024 bases (22371497 N's 2937001527 real 1808319052 upper # 1128682475 lower) in 63235 sequences in 1 files # Total size: mean 46799.6 sd 2508983.7 min 200 
(NW_018761261.1) # max 217458864 (NC_018152.2) median 1406 # %38.14 masked total, %38.43 masked real # this information is from the top of # papAnu4/refseq/GCF_000264685.3_Panu_3.0_assembly_report.txt # Assembly name: Panu_3.0 # Organism name: Papio anubis (olive baboon) # Isolate: 1X1155 # Sex: female # Taxid: 9555 # BioSample: SAMN02981400 # BioProject: PRJNA54005 # Submitter: Human Genome Sequencing Center # Date: 2017-4-20 # Assembly type: haploid # Release type: major # Assembly level: Chromosome # Genome representation: full # WGS project: AHZZ02 # Assembly method: CABOG v. 6.1; ATLAS-LINK v. 1.0; ATLAS-GAPFILL v. 2.0; PBJelly2 v. 14.9.9; Pilon v. 1.18 # Genome coverage: 104.0x # Sequencing technology: Sanger 3730; 454 FLX; Illumina; PacBio # RefSeq category: Representative Genome # GenBank assembly accession: GCA_000264685.2 # RefSeq assembly accession: GCF_000264685.3 # RefSeq assembly and GenBank assemblies identical: no # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000264695.2 GCF_000264695.2 Primary Assembly ## GCF_000749495.1 non-nuclear ############################################################################# # establish config.ra file (DONE - Hiram - 2018-01-03) # arguments here are: <db> <clade> <trackDbDir> <assembly_report.txt> cd /hive/data/genomes/papAnu4 $HOME/kent/src/hg/utils/automation/prepConfig.pl papAnu4 mammal \ baboon ./refseq/*_assembly_report.txt > papAnu4.config.ra # verify it looks sane cat papAnu4.config.ra # config parameters for makeGenomeDb.pl: db papAnu4 clade mammal genomeCladePriority 35 scientificName Papio anubis commonName Olive baboon assemblyDate Apr. 
2017 assemblyLabel Human Genome Sequencing Center assemblyShortLabel Panu_3.0 orderKey 15311 # mitochondrial sequence included in refseq release # mitoAcc NC_020006.2 mitoAcc none fastaFiles /hive/data/genomes/papAnu4/ucsc/*.fa.gz agpFiles /hive/data/genomes/papAnu4/ucsc/*.agp # qualFiles none dbDbSpeciesDir baboon photoCreditURL http://www.oumedicine.com/pathology/general-program-info/faculty-staff/roman-f-wolf-dvm photoCreditName Roman Wolf, University of Oklahoma Health Sciences Center ncbiGenomeId 394 ncbiAssemblyId 1082401 ncbiAssemblyName Panu_3.0 ncbiBioProject 54005 ncbiBioSample SAMN02981400 genBankAccessionID GCF_000264685.3 taxId 9555 ############################################################################# # setup UCSC named files (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/ucsc cd /hive/data/genomes/papAnu4/ucsc # check for duplicate sequences: time faToTwoBit -noMask ../refseq/G*0_genomic.fna.gz refseq.2bit # real 1m15.460s twoBitDup refseq.2bit # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this refseq.2bit file time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../refseq/G*0_genomic.fna.gz \ ../refseq/G*0_assembly_structure/Primary_Assembly # real 16m56.032s # NC_018152.2 chr1 # NC_018153.2 chr2 # NC_018154.2 chr3 # NC_018155.2 chr4 # NC_018156.2 chr5 # NC_018157.2 chr6 # NC_018158.2 chr7 # NC_018159.2 chr8 # NC_018160.2 chr9 # NC_018161.2 chr10 # NC_018162.2 chr11 # NC_018163.2 chr12 # NC_018164.2 chr13 # NC_018165.2 chr14 # NC_018166.2 chr15 # NC_018167.2 chr16 # NC_018168.2 chr17 # NC_018169.2 chr18 # NC_018170.2 chr19 # NC_018171.2 chr20 # NC_018172.2 chrX time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # processed 63213 sequences into chrUn.fa.gz # real 23m26.203s # there are no unlocalized sequences # time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ # 
../refseq/*_assembly_structure/Primary_Assembly # bash syntax here mitoAcc=`grep "^# mitoAcc" ../papAnu4.config.ra | awk '{print $NF}'` printf "# mitoAcc %s\n" "$mitoAcc" # mitoAcc NC_020006.2 zcat \ ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp printf ">chrM\n" > chrM.fa twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa gzip chrM.fa # verify fasta and AGPs agree time faToTwoBit chr*.fa.gz test.2bit # real 1m21.950s time cat chr*.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 # All AGP and FASTA entries agree - both files are valid # real 0m43.586s # and no sequence lost from original: twoBitToFa test.2bit stdout | faSize stdin # 2959373024 bases (22371497 N's 2937001527 real 2937001527 upper 0 lower) # in 63235 sequences in 1 files # same numbers as above # 2959373024 bases (22371497 N's 2937001527 real 1808319052 upper # 1128682475 lower) in 63235 sequences in 1 files # no longer need these temporary 2bit files rm refseq.2bit test.2bit ############################################################################# # Initial database build (DONE - 2018-01-03 - Hiram) cd /hive/data/genomes/papAnu4 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp papAnu4.config.ra) > agp.log 2>&1 # *** All done! 
(through the 'agp' step) # real 3m18.554s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db papAnu4.config.ra) > db.log 2>&1 # real 24m58.359s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add papAnu4 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/papAnu4 ln -s `pwd`/papAnu4.unmasked.2bit /gbdb/papAnu4/papAnu4.2bit # fixup common name so it is the same as the other papAnu browsers: hgsql -e 'update dbDb set organism="Baboon" where name = "papAnu4";' hgcentraltest hgsql -e 'update dbDb set genome="Baboon" where name = "papAnu4";' hgcentraltest hgsql -e 'delete from defaultDb where name="papAnu4";' hgcentraltest # reset the orderKey due to common name confusion: hgsql -e 'update dbDb set orderKey=2048 where name="papAnu4";' hgcentraltest ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/cpgIslandsUnmasked cd /hive/data/genomes/papAnu4/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/papAnu4/papAnu4.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku papAnu4) > do.log 2>&1 # real 9m9.275s cat fb.papAnu4.cpgIslandExtUnmasked.txt # 41864984 bases of 2937004939 (1.425%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/cytoBand cd /hive/data/genomes/papAnu4/bed/cytoBand makeCytoBandIdeo.csh papAnu4 ########################################################################## # run up idKeys files for chromAlias (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/idKeys cd /hive/data/genomes/papAnu4/bed/idKeys time (doIdKeys.pl 
-twoBit=/hive/data/genomes/papAnu4/papAnu4.unmasked.2bit -buildDir=`pwd` papAnu4) > do.log 2>&1 & # real 30m33.682s cat papAnu4.keySignature.txt # 0f840424e1e80f402d116d8b0e4ad946 ########################################################################## # ucscToINSDC and ucscToRefSeq table/track (DONE - 2018-01-05 - Hiram) # the sequence here is working for a 'refseq' assembly # beware of a chrM situation may be specific depending upon what is # available in the assembly mkdir /hive/data/genomes/papAnu4/bed/ucscToINSDC cd /hive/data/genomes/papAnu4/bed/ucscToINSDC grep chrM ../../*.agp # chrM 1 16516 1 O NC_020006.2 1 16516 + # if there is a chrM, use its INSDC name as a second argument: # this is a RefSeq assembly, use the chrM refSeq name: ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../refseq/GCF_*structure/Primary_Assembly NC_020006.2 # this is actually ucscToRefSeq since this is a RefSeq assembly sort ucscToINSDC.txt > ucscToRefSeq.txt rm -f ucscToINSDC.txt # there is also a genbank release, need to make idKeys to match it mkdir /hive/data/genomes/papAnu4/bed/ucscToINSDC/genbank cd /hive/data/genomes/papAnu4/bed/ucscToINSDC/genbank ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_mammalian/Papio_anubis/all_assembly_versions/GCA_000264685.2_Panu_3.0/GCA_000264685.2_Panu_3.0_genomic.fna.gz . 
faToTwoBit G*.fna.gz genbank.papAnu4.2bit time (doIdKeys.pl -buildDir=`pwd` \ -twoBit=`pwd`/genbank.papAnu4.2bit genbankPapAnu4) > do.log 2>&1 & # real 43m33.965s cd /hive/data/genomes/papAnu4/bed/ucscToINSDC join -t$'\t' \ ../idKeys/papAnu4.idKeys.txt genbank/genbankPapAnu4.idKeys.txt \ | cut -f2- | sort > ucscToINSDC.txt awk '{printf "%s\t%s\n", $2, $1}' ucscToRefSeq.txt \ | sort > refSeqToUcsc.txt awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > ucsc.coordinate.tab join -t$'\t' ucsc.coordinate.tab ucscToRefSeq.txt > ucscToRefSeq.bed join -t$'\t' ucsc.coordinate.tab ucscToINSDC.txt > ucscToINSDC.bed # should be same line counts throughout: # genbank/INSDC is missing one: chrM wc -l * # 63235 refSeqToUcsc.txt # 63235 ucsc.coordinate.tab # 63234 ucscToINSDC.bed # 63234 ucscToINSDC.txt # 63235 ucscToRefSeq.bed # 63235 ucscToRefSeq.txt export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 # use the 20 in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab papAnu4 ucscToINSDC stdin ucscToINSDC.bed # should be the same for ucscToRefSeq: export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab papAnu4 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed # checkTableCoords should be silent checkTableCoords papAnu4 # each should cover %100 entirely: featureBits -countGaps papAnu4 ucscToINSDC # 2959356508 bases of 2959373024 (99.999%) in intersection featureBits -countGaps papAnu4 ucscToRefSeq # 2959373024 bases of 2959373024 (100.000%) in intersection # the INSDC is missing the chrM: calc 2959373024 - 2959356508 # 2959373024 - 2959356508 = 16516.000000 ######################################################################### # add chromAlias table (DONE - 2018-01-05 - Hiram) mkdir 
/hive/data/genomes/papAnu4/bed/chromAlias cd /hive/data/genomes/papAnu4/bed/chromAlias # after ensembl idKeys have been made: join -t$'\t' ../idKeys/papAnu4.idKeys.txt \ ../../ensembl/ensemblPapAnu4.idKeys.txt | cut -f2- > ucsc.ensembl.tab hgsql -N -e 'select chrom,name from ucscToRefSeq;' papAnu4 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' papAnu4 \ > ucsc.genbank.tab ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > papAnu4.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t papAnu4.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 63235 =? 63235 OK # checking genbank: 63234 =? 63234 OK # checking ensembl: 63234 =? 63234 OK hgLoadSqlTab papAnu4 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ papAnu4.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2018-01-05 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/baboon/papAnu4 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" papAnu4 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # 118810 AHZZ.1 # 1 NC_.2 # implies a rule: '[AN][CH][Z0-9_]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" papAnu4 | wc -l # 118811 hgsql -N -e "select frag from gold;" papAnu4 \ | egrep -e '[AN][CH][Z0-9_]+(\.[0-9]+)?' | wc -l # 118811 hgsql -N -e "select frag from gold;" papAnu4 \ | egrep -v -e '[AN][CH][Z0-9_]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/baboon/papAnu4/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][CH][Z0-9_]+(\.[0-9]+)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box for these name patterns ########################################################################## # running repeat masker (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/repeatMasker cd /hive/data/genomes/papAnu4/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku papAnu4) > do.log 2>&1 & # real 645m57.789s egrep "bases|Total|masked" faSize.rmsk.txt \ | fold -s | sed -e 's/^/# /;' # 2959373024 bases (22371497 N's 2937001527 real 1396495695 upper 1540505832 # lower) in 63235 sequences in 1 files # Total size: mean 46799.6 sd 2508983.7 min 200 (chrUn_NW_018761261v1) max # 217458864 (chr1) median 1406 # %52.06 masked total, %52.45 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; * time featureBits -countGaps papAnu4 rmsk # 1540828237 bases of 2959373024 (52.066%) in intersection # real 1m2.441s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' papAnu4 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 1540828237.000000 # real 0m42.027s ########################################################################## # running simple repeat (DONE - 2018-01-03 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/simpleRepeat cd /hive/data/genomes/papAnu4/bed/simpleRepeat # using trf409 5 here a bit smaller genome (human == 6) time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 5 papAnu4) > do.log 2>&1 & # real 32m25.506s cat fb.simpleRepeat # 164309953 bases of 2937004939 (5.594%) in intersection # adding this trfMask to the other masking cd /hive/data/genomes/papAnu4 # when using the Window Masker result: # twoBitMask bed/windowMasker/papAnu4.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed papAnu4.2bit # you can safely ignore the warning about fields >= 13 # when using Rmsk results, add to rmsk after it is done: twoBitMask papAnu4.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed papAnu4.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa papAnu4.2bit stdout | faSize stdin > faSize.papAnu4.2bit.txt egrep "bases|Total|masked" faSize.papAnu4.2bit.txt \ | fold -s | sed -e 's/^/# /;' # 2959373024 bases (22371497 N's 2937001527 real 1394997875 upper 1542003652 # lower) in 63235 sequences in 1 files # Total size: mean 46799.6 sd 2508983.7 min 200 (chrUn_NW_018761261v1) max # 217458864 (chr1) median 1406 # %52.11 masked total, %52.50 masked real # reset the symlink rm /gbdb/papAnu4/papAnu4.2bit ln -s `pwd`/papAnu4.2bit /gbdb/papAnu4/papAnu4.2bit ######################################################################### # CREATE MICROSAT 
TRACK (DONE - 2018-01-04 - Hiram) ssh hgwdev mkdir /cluster/data/papAnu4/bed/microsat cd /cluster/data/papAnu4/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed papAnu4 microsat microsat.bed # Read 28387 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2018-01-04 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/windowMasker cd /hive/data/genomes/papAnu4/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev papAnu4) > do.log 2>&1 # real 213m54.340s # Masking statistics cat faSize.papAnu4.cleanWMSdust.txt egrep "bases|Total|masked" faSize.papAnu4.cleanWMSdust.txt \ | fold -s | sed -e 's/^/# /;' # 2959373024 bases (22371497 N's 2937001527 real 1791437797 upper 1145563730 # lower) in 63235 sequences in 1 files # Total size: mean 46799.6 sd 2508983.7 min 200 (chrUn_NW_018761261v1) max # 217458864 (chr1) median 1406 # %38.71 masked total, %39.00 masked real cat fb.papAnu4.rmsk.windowmaskerSdust.txt # 912138851 bases of 2959373024 (30.822%) in intersection ########################################################################## # cpgIslands - (DONE - 2018-01-04 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/cpgIslands cd /hive/data/genomes/papAnu4/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku papAnu4) > do.log 2>&1 & # real 9m20.979s cat fb.papAnu4.cpgIslandExt.txt # 21857619 bases of 2937004939 (0.744%) in intersection ############################################################################## # genscan - (DONE - 2018-01-04 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/genscan cd /hive/data/genomes/papAnu4/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku papAnu4) > do.log 2>&1 & # real 340m6.438s # 1 
job failed, finished with window size 2000000 # real 29m0.534s time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -continue=makeBed -bigClusterHub=ku papAnu4) > makeBed.log 2>&1 & # real 4m29.529s cat fb.papAnu4.genscan.txt # 53061965 bases of 2937004939 (1.807%) in intersection cat fb.papAnu4.genscanSubopt.txt # 51930076 bases of 2937004939 (1.768%) in intersection ############################################################################# # Ensembl genes (DONE - 2018-01-05 - Hiram) # after chromAlias work is done: cd /hive/data/genomes/papAnu4/jkStuff join -t$'\t' <(sort -k1,1 ../chrom.sizes) \ <(sort ../bed/chromAlias/ucsc.ensembl.tab) \ | awk '{printf "0\t%s\t%d\t%s\t%d\n", $3,$2,$1,$2}' > ensToUcsc.lift cd /hive/data/genomes/papAnu4 printf "# required db variable db papAnu4 # specific lifting to translate names: liftUp /hive/data/genomes/papAnu4/jkStuff/ensToUcsc.lift skipInvalid yes # ENSPANT00000043361.1 txEnd 1439 >= chromSize 1433 " > papAnu4.ensGene.ra time (doEnsGeneUpdate.pl -ensVersion=91 papAnu4.ensGene.ra) \ > ensGene.91.log 2>&1 # real 3m19.001s featureBits papAnu4 ensGene # 44280398 bases of 2937004939 (1.508%) in intersection ############################################################################# # augustus gene track (DONE - 2018-01-04 - Hiram) mkdir /hive/data/genomes/papAnu4/bed/augustus cd /hive/data/genomes/papAnu4/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev -workhorse=hgwdev papAnu4) > do.log 2>&1 & # real 163m56.325s cat fb.papAnu4.augustusGene.txt # 48162863 bases of 2937004939 (1.640%) in intersection featureBits -enrichment papAnu4 augustusGene ensGene # augustusGene 1.640%, ensGene 1.508%, both 0.998%, cover 60.84%, enrich 40.35x ############################################################################## # lastz/chain/net swap human/hg38 (DONE - 2018-01-08 - Hiram) # original alignment cd /hive/data/genomes/hg38/bed/lastzPapAnu4.2018-01-08 cat 
fb.hg38.chainPapAnu4Link.txt # 2665048631 bases of 3049335806 (87.398%) in intersection cat fb.hg38.chainRBestPapAnu4Link.txt # 2462004619 bases of 3049335806 (80.739%) in intersection # and for the swap: mkdir /hive/data/genomes/papAnu4/bed/blastz.hg38.swap cd /hive/data/genomes/papAnu4/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzPapAnu4.2018-01-08/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 166m16.865s cat fb.papAnu4.chainHg38Link.txt # 2539481835 bases of 2937004939 (86.465%) in intersection cat fb.papAnu4.chainSynHg38Link.txt # 2485777123 bases of 2937004939 (84.636%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` papAnu4 hg38) \ > rbest.log 2>&1 & # real 483m48.216s cat fb.papAnu4.chainRBestHg38Link.txt # 2465241931 bases of 2937004939 (83.937%) in intersection ############################################################################## # lastz/chain/net swap mouse/mm10 (DONE - 2018-01-08 - Hiram) # original alignment to mm10 cd /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 cat fb.mm10.chainPapAnu4Link.txt # 919405716 bases of 2652783500 (34.658%) in intersection cat fb.mm10.chainRBestPapAnu4Link.txt # 875366631 bases of 2652783500 (32.998%) in intersection # and for the swap: mkdir /hive/data/genomes/papAnu4/bed/blastz.mm10.swap cd /hive/data/genomes/papAnu4/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 80m51.648s cat fb.papAnu4.chainMm10Link.txt # 907806517 bases of 2937004939 (30.909%) in intersection cat fb.papAnu4.chainSynMm10Link.txt # 866781916 bases of 2937004939 (29.512%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev papAnu4 mm10 \ 
-buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 521m7.590s cat fb.papAnu4.chainRBestMm10Link.txt # 874097827 bases of 2937004939 (29.762%) in intersection + # a download file was missing (DONE - 2019-03-29 - Hiram) + cd /hive/data/genomes/papAnu4/bed/lastz.mm10/axtChain + + time netToAxt papAnu4.mm10.rbest.net.gz papAnu4.mm10.rbest.chain.gz \ + /hive/data/genomes/papAnu4/papAnu4.2bit /hive/data/genomes/mm10/mm10.2bit stdout \ + | axtSort stdin stdout \ + | gzip -c > ../axtRBestNet/papAnu4.mm10.rbest.axt.gz + + # real 10m29.580s + # user 9m16.998s + # sys 0m33.232s + ############################################################################## # Create kluster run files (DONE - 2018-01-08 - Hiram) # numerator is papAnu4 gapless bases "real" as reported by: featureBits -noRandom -noHap papAnu4 gap # 9485884 bases of 2728012961 (0.348%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2728012961 / 2861349177 \) \* 1024 # ( 2728012961 / 2861349177 ) * 1024 = 976.282550 # ==> use -repMatch=1000 according to size scaled down from 1024 for human. 
# and rounded up to nearest 50 cd /hive/data/genomes/papAnu4 blat papAnu4.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/papAnu4.11.ooc \ -repMatch=1000 # Wrote 31544 overused 11-mers to jkStuff/papAnu4.11.ooc # papAnu3 was: -repMatch=1025 # Wrote 29129 overused 11-mers to jkStuff/papAnu3.11.ooc # papAnu2 at repMatch=1025 was: # Wrote 29128 overused 11-mers to jkStuff/papAnu2.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' papAnu4 \ | sort -k7,7nr | ave -col=7 stdin # all these gap sizes are 100 # minimum gap size is 100 and produces a reasonable number of lifts gapToLift -verbose=2 -minGap=10 papAnu4 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed ######################################################################### # LIFTOVER TO papAnu3 (DONE - 2018-01-08 - Hiram) ssh hgwdev mkdir /hive/data/genomes/papAnu4/bed/blat.papAnu3.2018-01-08 cd /hive/data/genomes/papAnu4/bed/blat.papAnu3.2018-01-08 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/papAnu4/jkStuff/papAnu4.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ papAnu4 papAnu3) > do.log 2>&1 # real 598m36.466s # verify the convert link on the test browser is now active from papAnu4 to # papAnu3 ######################################################################### # LIFTOVER TO papAnu2 (DONE - 2018-01-08 - Hiram) ssh hgwdev mkdir /hive/data/genomes/papAnu4/bed/blat.papAnu2.2018-01-08 cd /hive/data/genomes/papAnu4/bed/blat.papAnu2.2018-01-08 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/papAnu4/jkStuff/papAnu4.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ papAnu4 papAnu2) > do.log 2>&1 & # real 574m29.473s # verify the convert link on the test browser is now active from papAnu4 to # papAnu2 ######################################################################### # GENBANK AUTO UPDATE (DONE - 2018-01-08 - 
Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Papio anubis 512 145626 505 # Papio anubis anubis 6 0 0 # Papio cynocephalus 23 0 0 # Papio cynocephalus x Papio anubis 3 0 0 # Papio hamadryas 102 0 0 # Papio hamadryas hamadryas 1 0 0 # Papio papio 10 0 0 # Papio sp. 2 0 0 # Papio ursinus 24 0 0 # edit etc/genbank.conf to add papAnu4 just before papAnu3 # papAnu4 (Papio anubis - olive baboon taxId: 9555) papAnu4.serverGenome = /hive/data/genomes/papAnu4/papAnu4.2bit papAnu4.clusterGenome = /hive/data/genomes/papAnu4/papAnu4.2bit papAnu4.ooc = /hive/data/genomes/papAnu4/jkStuff/papAnu4.11.ooc papAnu4.lift = /hive/data/genomes/papAnu4/jkStuff/nonBridged.lft papAnu4.perChromTables = no papAnu4.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} papAnu4.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} papAnu4.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} papAnu4.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} papAnu4.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} papAnu4.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} papAnu4.downloadDir = papAnu4 # DO NOT NEED genbank.mrna.xeno except for human, mouse # defaults yes: genbank.mrna.native.load, genbank.mrna.native.loadDesc, # genbank.est.native.load, refseq.mrna.native.load, refseq.mrna.native.loadDesc, # refseq.mrna.xeno.load , refseq.mrna.xeno.loadDesc git commit -m 'adding papAnu4 Papio anubis - olive baboon refs #20769' \ etc/genbank.conf src/lib/gbGenome.c git push # update /cluster/data/genbank/: make install-server make etc-update cd /cluster/data/genbank time ./bin/gbAlignStep -initial papAnu4 # logFile: var/build/logs/2018.01.08-18:58:03.papAnu4.initalign.log # real 295m57.046s tail -2 
var/build/logs/2018.01.08-18:58:03.papAnu4.initalign.log # hgwdev 2018.01.08-23:51:41 papAnu4.initalign: Succeeded: papAnu4 # hgwdev 2018.01.08-23:54:00 papAnu4.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.papAnu4 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad papAnu4 # logFile: var/dbload/hgwdev/logs/2018.01.09-07:47:01.papAnu4.dbload.log # real 18m48.462s tail -1 var/dbload/hgwdev/logs/2018.01.09-07:47:01.papAnu4.dbload.log # hgwdev 2018.01.09-08:05:49 papAnu4.dbload: finish # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add papAnu4 to: # etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs etc/hgwdev.dbs git commit -m 'adding papAnu4 to the update alignments refs #20769' \ etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################# # BLATSERVERS ENTRY (DONE - 2018-01-09 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("papAnu4", "blat1b", "17890", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("papAnu4", "blat1b", "17891", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to similar position as papAnu2 ## found by liftOver view in other (DONE - 2018-01-09 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr1:106647222-106657040" where name="papAnu4";' hgcentraltest ######################################################################### # all.joiner update, downloads and in pushQ - (DONE - 2018-01-10 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=papAnu4 -tableCoverage all.joiner joinerCheck -database=papAnu4 -times 
all.joiner joinerCheck -database=papAnu4 -keys all.joiner cd /hive/data/genomes/papAnu4 time (makeDownloads.pl -workhorse=hgwdev papAnu4) > downloads.log 2>&1 # real 26m41.219s # now ready for pushQ entry mkdir /hive/data/genomes/papAnu4/pushQ cd /hive/data/genomes/papAnu4/pushQ time (makePushQSql.pl -redmineList papAnu4) > papAnu4.pushQ.sql 2> stderr.out # real 5m7.875s # check for errors in stderr.out, some are OK, e.g.: # WARNING: papAnu4 does not have seq # WARNING: papAnu4 does not have extFile ## there are warnings about the RBest and Syn chainNet tables, which we ## are not interested in at this time. They can be left out. # verify the file listings are valid, should be no output to stderr: cat redmine.papAnu4.file.list \ | while read L; do ls -ogL $L; done > /dev/null # to verify the database.table list is correct, should be the same # line count for these two commands: wc -l redmine.papAnu4.table.list # 75 redmine.papAnu4.table.list awk -F'.' '{ printf "hgsql -N -e \"show table status like '"'"'%s'"'"';\" %s\n", $2, $1 }' redmine.papAnu4.table.list | while read L; do eval $L; done | wc -l # 75 # enter the path names to these files in the redmine issue to # make QA Ready: ls `pwd`/redmine* /hive/data/genomes/papAnu4/pushQ/redmine.papAnu4.file.list /hive/data/genomes/papAnu4/pushQ/redmine.papAnu4.releaseLog.txt /hive/data/genomes/papAnu4/pushQ/redmine.papAnu4.table.list #########################################################################