76fba02a1a3d0f8a66beebaafce75b2770407522 hiram Tue Jun 16 12:54:12 2020 -0700 liftOvers to panPan3 refs #25720 diff --git src/hg/makeDb/doc/panPan2/initialBuild.txt src/hg/makeDb/doc/panPan2/initialBuild.txt index 5b1620a..efa3b62 100644 --- src/hg/makeDb/doc/panPan2/initialBuild.txt +++ src/hg/makeDb/doc/panPan2/initialBuild.txt @@ -1,698 +1,717 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the panPan2 # Bonobo -- # this is an update to panPan1 sequence to fix big errors in their chrom # structures # chrMT listed in assembly ASM225v1 Mmul_051212 == NC_005943.1 ############################################################################# # fetch sequence from new style download directory (DONE - 2016-01-08 - Hiram) mkdir -p /hive/data/genomes/panPan2/refseq cd /hive/data/genomes/panPan2/refseq time rsync -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/refseq/vertebrate_mammalian/Pan_paniscus/all_assembly_versions/GCF_000258655.2_panpan1.1/ ./ # real 6m9.041s # sent 2810 bytes received 3928893248 bytes 10633006.92 bytes/sec # total size is 3928403393 speedup is 1.00 # measure what we have here: faSize GCF_000258655.2_panpan1.1_genomic.fna.gz # 3286643896 bases (560753890 N's 2725890006 real 1745980756 upper 979909250 # lower) in 10274 sequences in 1 files # Total size: mean 319899.2 sd 6899023.6 min 217 (NW_014024393.1) max 247869975 (NC_027870.1) median 1230 # %29.81 masked total, %35.95 masked real time faCount GCF_000258655.2_panpan1.1_genomic.fna.gz | less # #seq len A C G T N cpg # total 3286643896 806971547 555159394 555452392 808306673 560753890 25946869 # real 1m27.422s ############################################################################# # fixup to UCSC naming scheme (DONE - 2016-04-18 - Hiram) mkdir /hive/data/genomes/panPan2/ucsc cd /hive/data/genomes/panPan2/ucsc time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../refseq/*_genomic.fna.gz ../refseq/*_assembly_structure/Primary_Assembly NC_027868.1 chr1 
NC_027869.1 chr2A NC_027870.1 chr2B NC_027871.1 chr3 NC_027872.1 chr4 NC_027873.1 chr5 NC_027874.1 chr6 NC_027875.1 chr7 NC_027876.1 chr8 NC_027877.1 chr9 NC_027878.1 chr10 NC_027879.1 chr11 NC_027880.1 chr12 NC_027881.1 chr13 NC_027882.1 chr14 NC_027883.1 chr15 NC_027884.1 chr16 NC_027885.1 chr17 NC_027886.1 chr18 NC_027887.1 chr19 NC_027888.1 chr20 NC_027889.1 chr21 NC_027890.1 chr22 NC_027891.1 chrX real 17m57.979s time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # processed 10249 sequences into chrUn.fa.gz # real 4m35.540s # bash syntax here mitoAcc=`grep "^# mitoAcc" ../panPan2.config.ra | awk '{print $NF}'` printf "# mitoAcc %s\n" "$mitoAcc" # mitoAcc NC_001644.1 zcat \ ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp printf ">chrM\n" > chrM.fa twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa gzip chrM.fa # verify fasta and AGP match: time faToTwoBit chr*.fa.gz test.2bit # real 1m45.015s cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail # All AGP and FASTA entries agree - both files are valid # verify nothing lost compared to genbank: faSize *.fa.gz # 3286643896 bases (560753890 N's 2725890006 real 2725890006 upper 0 lower) in 10274 sequences in 26 files # Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1) max 247869975 (chr2B) median 1230 # same totals as above: # 3286643896 bases (560753890 N's 2725890006 real 1745980756 upper 979909250 # lower) in 10274 sequences in 1 files # Total size: mean 319899.2 sd 6899023.6 min 217 (NW_014024393.1) max 247869975 (NC_027870.1) median 1230 ############################################################################# # Initial database build (DONE - 2016-04-18 - Hiram) # construct the required photoReference.txt cd /hive/data/genomes/panPan2 printf "photoCreditURL http://a-z-animals.com/animals/bonobo/pictures/2955/ photoCreditName 
Photo courtesy of Kabir Bakie\n" > photoReference.txt # this almost functioned OK. It couldn't find a commonName or the # bioProject ~/kent/src/hg/utils/automation/prepConfig.pl panPan2 mammal bonobo \ refseq/*_assembly_report.txt > panPan2.config.ra cat panPan2.config.ra # config parameters for makeGenomeDb.pl: db panPan2 clade mammal genomeCladePriority 15 scientificName Pan paniscus commonName Bonobo assemblyDate Aug. 2015 assemblyLabel Max-Planck Institute for Evolutionary Anthropology assemblyShortLabel MPI-EVA panpan1.1 orderKey 2624 # mitochondrial sequence included in refseq release # mitoAcc NC_001644.1 # http://www.ncbi.nlm.nih.gov/bioproject/PRJNA11815 mitoAcc none fastaFiles /hive/data/genomes/panPan2/ucsc/*.fa.gz agpFiles /hive/data/genomes/panPan2/ucsc/*.agp # qualFiles none dbDbSpeciesDir bonobo photoCreditURL http://a-z-animals.com/animals/bonobo/pictures/2955/ photoCreditName Photo courtesy of Kabir Bakie ncbiGenomeId 10729 ncbiAssemblyId 474211 ncbiAssemblyName panpan1.1 ncbiBioProject 169343 ncbiBioSample SAMEA1029457 genBankAccessionID GCF_000258655.2 taxId 9597 # verify sequence and AGP are OK: time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp panPan2.config.ra > agp.log 2>&1 # real 3m58.459s # then finish it off: time (~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse=hgwdev \ -dbHost=hgwdev -fileServer=hgwdev -continue=db \ panPan2.config.ra ) > db.log 2>&1 # real 29m36.731s # check in the trackDb files created and add to trackDb/makefile # temporary symlink until after masking ln -s `pwd`/panPan2.unmasked.2bit /gbdb/panPan2/panPan2.2bit ############################################################################# # cytoBandIdeo - (DONE - 2016-04-19 - Hiram) mkdir /hive/data/genomes/panPan2/bed/cytoBand cd /hive/data/genomes/panPan2/bed/cytoBand makeCytoBandIdeo.csh panPan2 ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 
2016-04-19 - Hiram) mkdir /hive/data/genomes/panPan2/bed/cpgIslandsUnmasked cd /hive/data/genomes/panPan2/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/panPan2/panPan2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku panPan2) > do.log 2>&1 # real 22m23.619s cat fb.panPan2.cpgIslandExtUnmasked.txt # 23203460 bases of 2725937399 (0.851%) in intersection ############################################################################# # running repeat masker (DONE - 2016-04-19,05-04 - Hiram) mkdir /hive/data/genomes/panPan2/bed/repeatMasker cd /hive/data/genomes/panPan2/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku panPan2) > do.log 2>&1 & # real 805m20.249s # had one missing ID failure: #RepeatMasker bug?: Undefined id, line 3391703 of input: # 2118 12.4 3.8 2.4 chr4 152298481 152298818 (43505445) + MER61C LTR/ERV1 86 428 (0) # At least one ID was missing (see warnings above) -- please report to Robert Hubley. -continue at your disgression. 
# cleaning out the single bad record grep -v "152298481 152298818" panPan2.sorted.fa.out > panPan2.cleaned.fa.out mv panPan2.sorted.fa.out panPan2.sorted.fa.out.badRecord mv panPan2.cleaned.fa.out panPan2.sorted.fa.out # continuing: time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -continue=mask -smallClusterHub=ku panPan2) > mask.log 2>&1 cat faSize.rmsk.txt # 3286643896 bases (560753890 N's 2725890006 real 1348347850 upper # 1377542156 lower) in 10274 sequences in 1 files # Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1) # max 247869975 (chr2B) median 1230 # %41.91 masked total, %50.54 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; time featureBits -countGaps panPan2 rmsk # 1378308706 bases of 3286643896 (41.937%) in intersection # real 0m56.056s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' panPan2 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 1378308706.000000 # real 0m36.998s ########################################################################## # running simple repeat (DONE - 2016-04-19 - Hiram) mkdir /hive/data/genomes/panPan2/bed/simpleRepeat cd /hive/data/genomes/panPan2/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ panPan2) > do.log 2>&1 & # real 20m23.012s cat fb.simpleRepeat # 53570869 bases of 2725937399 (1.965%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/panPan2 twoBitMask panPan2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed panPan2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa 
panPan2.2bit stdout | faSize stdin > faSize.panPan2.2bit.txt cat faSize.panPan2.2bit.txt # 3286643896 bases (560753890 N's 2725890006 real 1346906367 upper # 1378983639 lower) in 10274 sequences in 1 files # Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1) # max 247869975 (chr2B) median 1230 # %41.96 masked total, %50.59 masked real rm /gbdb/panPan2/panPan2.2bit ln -s `pwd`/panPan2.2bit /gbdb/panPan2/panPan2.2bit ############################################################################# # CREATE MICROSAT TRACK (DONE - 2016-04-19 - Hiram) ssh hgwdev mkdir /cluster/data/panPan2/bed/microsat cd /cluster/data/panPan2/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed panPan2 microsat microsat.bed # Read 26743 elements of size 4 from microsat.bed ############################################################################# # ucscToINSDC table/track (DONE - 2016-04-19 - Hiram) # the sequence here is working for a 'refseq' assembly with a chrM # situation may be specific depending upon what is available in the assembly mkdir /hive/data/genomes/panPan2/bed/ucscToINSDC cd /hive/data/genomes/panPan2/bed/ucscToINSDC # find accession for chrM grep chrM ../../panPan2.agp # chrM 1 16563 1 O NC_001644.1 1 16563 + # use that accession here: ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../refseq/GCF_*structure/Primary_Assembly NC_001644.1 awk '{printf "%s\t%s\n", $2, $1}' ucscToINSDC.txt | sort > insdcToUcsc.txt # there is no name for chrM/NC_001644.1 sequence, there is no such # sequence with an INSDC name grep -v "^#" ../../refseq/GCF*_assembly_report.txt | cut -f5,7 \ | sed -e 's/na\b/notAvailable/;' | awk '{printf "%s\t%s\n", $2, $1}' \ | sort > insdc.refseq.txt # the sed \b means to match at a word boundary, so only a trailing 'na' is replaced awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab # the tr commands avoid the problem of 
trying to use the -t argument # to the join command which doesn't accept -t'\t' but instead has # to use the unseen/can not copy command ctrl-v i join insdc.refseq.txt insdcToUcsc.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-3,5 \ > ucscToINSDC.bed # should be same line counts throughout: wc -l * # 2490 insdc.refseq.txt # 2490 insdcToUcsc.txt # 2490 name.coordinate.tab # 2490 ucscToINSDC.bed # 2490 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab panPan2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords panPan2 # should cover %100 entirely: featureBits -countGaps panPan2 ucscToINSDC # 3286643896 bases of 3286643896 (100.000%) in intersection join -1 2 <(sort -k2 ucscToINSDC.txt) insdc.refseq.txt | tr '[ ]' '[\t]' \ | sort -k2 | join -2 2 name.coordinate.tab - | tr '[ ]' '[\t]' \ | cut -f1-4 > ucscToRefSeq.bed cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab panPan2 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed checkTableCoords panPan2 -table=ucscToRefSeq # should cover %100 all bases: featureBits -countGaps panPan2 ucscToRefSeq # 3286643896 bases of 3286643896 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2017-02-27 - Hiram) mkdir /hive/data/genomes/panPan2/bed/chromAlias cd /hive/data/genomes/panPan2/bed/chromAlias hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' panPan2 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' panPan2 \ > ucsc.genbank.tab awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \ | sort > panPan2.chromAlias.tab hgLoadSqlTab panPan2 
chromAlias ~/kent/src/hg/lib/chromAlias.sql \ panPan2.chromAlias.tab cd /hive/data/genomes/panPan2/bed/chromAlias # add ensembl names 2017-12-14 mkdir previous mv *.tab previous join -t$'\t' ../idKeys/panPan2.idKeys.txt \ ../../ensembl/ensemblPanPan2.idKeys.txt \ | cut -f2,3 | sort > ucsc.ensembl.tab cut -f1,2 previous/ucsc.refseq.tab > ucsc.refseq.tab cut -f1,2 previous/ucsc.genbank.tab > ucsc.genbank.tab ~/kent/src/hg/utils/automation/chromAlias.pl sort -o panPan2.chromAlias.tab panPan2.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t panPan2.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 10274 =? 10274 OK # checking genbank: 10274 =? 10274 OK # checking ensembl: 10274 =? 10274 OK hgLoadSqlTab panPan2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ panPan2.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2016-04-19 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/bonobo/panPan2 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" panPan2 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c # 121336 AJFE.1 # 1 NC_.1 # implies a search rule of: '[ACEFJN_]+[0-9]+(\.[0-9]+)?' # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" panPan2 | wc -l # 121337 hgsql -N -e "select frag from gold;" panPan2 \ | egrep -e '[ACEFJN_]+[0-9]+(\.[0-9]+)?' | wc -l # 121337 hgsql -N -e "select frag from gold;" panPan2 \ | egrep -v -e '[ACEFJN_]+[0-9]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/bonobo/panPan2/trackDb.ra searchTable gold shortCircuit 1 termRegex [ACEFJN_]+[0-9]+(\.[0-9]+)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## ## WINDOWMASKER (DONE - 2016-05-04 - Hiram) mkdir /hive/data/genomes/panPan2/bed/windowMasker cd /hive/data/genomes/panPan2/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev panPan2) > do.log 2>&1 # real 379m56.649s # Masking statistics cat faSize.panPan2.cleanWMSdust.txt # 3286643896 bases (560753890 N's 2725890006 real 1730579207 upper # 995310799 lower) in 10274 sequences in 1 files # Total size: mean 319899.2 sd 6899023.6 min 217 (chrUn_NW_014024393v1) # max 247869975 (chr2B) median 1230 # %30.28 masked total, %36.51 masked real cat fb.panPan2.rmsk.windowmaskerSdust.txt # 768236951 bases of 3286643896 (23.375%) in intersection ########################################################################## # cpgIslands - (DONE - 2016-05-05 - Hiram) mkdir /hive/data/genomes/panPan2/bed/cpgIslands cd /hive/data/genomes/panPan2/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku panPan2) > do.log 2>&1 # real 6m54.203s cat fb.panPan2.cpgIslandExt.txt # 17035990 bases of 2725937399 (0.625%) in intersection ############################################################################## # ncbiRefSeq gene track (DONE - 2016-05-05 - Hiram) mkdir /hive/data/genomes/panPan2/bed/ncbiRefSeq cd /hive/data/genomes/panPan2/bed/ncbiRefSeq # working on this script, running step by step: time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \ -stop=download -buildDir=`pwd` -bigClusterHub=ku \ -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \ refseq vertebrate_mammalian Pan_paniscus \ GCF_000258655.2_panpan1.1 panPan2) > download.log 2>&1 # real 12m36.320s time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \ 
-continue=process -stop=process -buildDir=`pwd` -bigClusterHub=ku \ -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \ refseq vertebrate_mammalian Pan_paniscus \ GCF_000258655.2_panpan1.1 panPan2) > process.log 2>&1 # real 4m22.621s time (/cluster/home/hiram/kent/src/hg/utils/automation/doNcbiRefSeq.pl \ -continue=load -stop=load -buildDir=`pwd` -bigClusterHub=ku \ -fileServer=hgwdev -workhorse=hgwdev -smallClusterHub=ku -dbHost=hgwdev \ refseq vertebrate_mammalian Pan_paniscus \ GCF_000258655.2_panpan1.1 panPan2) > load.log 2>&1 # real 0m21.690s cat fb.ncbiRefSeq.panPan2.txt # 74646536 bases of 2725937399 (2.738%) in intersection ############################################################################## # genscan - (DONE - 2016-05-05 - Hiram) mkdir /hive/data/genomes/panPan2/bed/genscan cd /hive/data/genomes/panPan2/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku panPan2) > do.log 2>&1 # Completed: 10270 of 10274 jobs # Crashed: 2 jobs # CPU time in finished jobs: 450524s 7508.73m 125.15h 5.21d 0.014 y # IO & Wait Time: 27120s 452.00m 7.53h 0.31d 0.001 y # Time in running jobs: 167787s 2796.45m 46.61h 1.94d 0.005 y # Average job time: 47s 0.78m 0.01h 0.00d # Longest running job: 83913s 1398.55m 23.31h 0.97d # Longest finished job: 64597s 1076.62m 17.94h 0.75d # Submission to last job: 64787s 1079.78m 18.00h 0.75d # two jobs failed due to almost all N's in the hard mask sequence and # they sneaked through the check for that, continuing: time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -continue=makeBed -bigClusterHub=ku panPan2) > makeBed.log 2>&1 # real 2m23.103s cat fb.panPan2.genscan.txt # 45997720 bases of 2725937399 (1.687%) in intersection cat fb.panPan2.genscanSubopt.txt # 45930774 bases of 2725937399 (1.685%) in intersection ############################################################################# # augustus gene track (DONE - 2016-05-11 - Hiram) mkdir 
/hive/data/genomes/panPan2/bed/augustus cd /hive/data/genomes/panPan2/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev \ -workhorse=hgwdev panPan2) > do.log 2>&1 # real 95m5.905s cat fb.panPan2.augustusGene.txt # 49244982 bases of 3142093174 (1.567%) in intersection ######################################################################### # Create kluster run files (DONE - 2016-05-11 - Hiram) # numerator is panPan2 gapless bases "real" as reported by: featureBits -noRandom -noHap panPan2 gap # 469457882 bases of 2682465908 (17.501%) in intersection # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2682465908 / 2861349177 \) \* 1024 # ( 2682465908 / 2861349177 ) * 1024 = 959.982484 # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 100 cd /hive/data/genomes/panPan2 time blat panPan2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/panPan2.11.ooc \ -repMatch=900 # Wrote 34618 overused 11-mers to jkStuff/panPan2.11.ooc # real 0m56.353s # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' panPan2 \ | sort -k7,7nr # minimum size is 1000 # deciding on a minimum gap for this break: using either 100 or 5000 will # generate 13387 liftOver rows, but using 6000 yields only 11703 rows. # so use a small minimum gap here to get more liftOver rows # (NOTE: the gapToLift command that follows uses -minGap=1000, not 100). 
gapToLift -verbose=2 -minGap=1000 panPan2 jkStuff/panPan2.nonBridged.lft \ -bedFile=jkStuff/panPan2.nonBridged.bed ######################################################################## # GENBANK AUTO UPDATE (DONE - 2016-05-17 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Pan paniscus 440 1 46 # edit etc/genbank.conf to add panPan2 just before panPan1 # panPan2 (bonobo - Pan paniscus) panPan2.serverGenome = /hive/data/genomes/panPan2/panPan2.2bit panPan2.clusterGenome = /hive/data/genomes/panPan2/panPan2.2bit panPan2.ooc = /hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc panPan2.lift = /hive/data/genomes/panPan2/jkStuff/panPan2.nonBridged.lft panPan2.perChromTables = no panPan2.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} panPan2.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} panPan2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} panPan2.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} panPan2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} panPan2.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} panPan2.downloadDir = panPan2 # defaults yes genbank.mrna.native, genbank.est.native.load, # refseq.mrna.native, refseq.mrna.xeno # DO NOT NEED genbank.mrna.xeno except for human, mouse panPan2.genbank.est.native.load = no git commit -m "Added panPan2; refs #16036" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # Edit src/lib/gbGenome.c to add new species. 
Skipped screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial panPan2 # logFile: var/build/logs/2016.05.11-14:44:15.panPan2.initalign.log # real 196m8.278s tail -2 var/build/logs/2016.05.11-14:44:15.panPan2.initalign.log # hgwdev 2016.05.11-17:58:29 panPan2.initalign: Succeeded: panPan2 # hgwdev 2016.05.11-18:00:23 panPan2.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.panPan2 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad panPan2 # logFile: var/dbload/hgwdev/logs/2016.05.13-16:11:17.panPan2.dbload.log # real 11m44.667s tail -1 var/dbload/hgwdev/logs/2016.05.13-16:11:17.panPan2.dbload.log # hgwdev 2016.05.13-16:23:02 panPan2.dbload: finish # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add panPan2 to: # etc/align.dbs # etc/hgwdev.dbs git add etc/align.dbs git add etc/hgwdev.dbs git commit -m "Added panPan2 - Bonobo refs #16036" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################## # LIFTOVER TO panPan1 (DONE - 2016-05-17 - Hiram) ssh hgwdev mkdir /hive/data/genomes/panPan2/bed/blat.panPan1.2016-05-17 cd /hive/data/genomes/panPan2/bed/blat.panPan1.2016-05-17 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ panPan2 panPan1 time (doSameSpeciesLiftOver.pl -verbose=2 \ -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ panPan2 panPan1) > doLiftOverToPanPan1.log 2>&1 # real 156m48.953s # verify this functions in the genome browser from panPan2 to panPan1 ######################################################################### # BLATSERVERS ENTRY (DONE - 2017-02-06 - Hiram) # After getting a blat server assigned by the 
Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panPan2", "blat1c", "17884", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("panPan2", "blat1c", "17885", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ############################################################################## # set default position to FOXP2 gene displays (DONE - 2017-02-06 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr7:119267067-119359255" where name="panPan2";' \ hgcentraltest ############################################################################## # all.joiner update, downloads and in pushQ - (DONE - 2017-02-06 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=panPan2 -tableCoverage all.joiner joinerCheck -database=panPan2 -times all.joiner joinerCheck -database=panPan2 -keys all.joiner cd /hive/data/genomes/panPan2 time (makeDownloads.pl panPan2) > downloads.log 2>&1 # real 20m55.262s # now ready for pushQ entry mkdir /hive/data/genomes/panPan2/pushQ cd /hive/data/genomes/panPan2/pushQ time (makePushQSql.pl panPan2) > panPan2.pushQ.sql 2> stderr.out # real 3m46.855s # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/panPan2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/panPan2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/panPan2/bbi/quality.bw # WARNING: panPan2 does not have seq # WARNING: panPan2 does not have extFile # copy it to hgwbeta # copy it to hgwbeta scp -p panPan2.pushQ.sql qateam@hgwbeta:/tmp/ ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/panPan2.pushQ.sql" # in that pushQ entry walk through each entry and see if the # sizes will set properly +############################################################################## +# LIFTOVER TO panPan3 (DONE - 2020-06-15 
- Hiram) + ssh hgwdev + mkdir /hive/data/genomes/panPan2/bed/blat.panPan3.2020-06-15 + cd /hive/data/genomes/panPan2/bed/blat.panPan3.2020-06-15 + + doSameSpeciesLiftOver.pl -verbose=2 \ + -debug -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + panPan2 panPan3 + + time (doSameSpeciesLiftOver.pl -verbose=2 \ + -ooc=/hive/data/genomes/panPan2/jkStuff/panPan2.11.ooc \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + panPan2 panPan3) > doLiftOverToPanPan3.log 2>&1 + # real 312m42.976s + + # verify this functions in the genome browser from panPan2 to panPan3 + #########################################################################