56f91678d7671b26ab3d70711429e52c26585a03 hiram Mon Nov 6 13:21:34 2023 -0800 adding crispr tracks for danRer10 danRer11 refs #21863 diff --git src/hg/makeDb/doc/danRer11/initialBuild.txt src/hg/makeDb/doc/danRer11/initialBuild.txt index 4060a5b..3315167 100644 --- src/hg/makeDb/doc/danRer11/initialBuild.txt +++ src/hg/makeDb/doc/danRer11/initialBuild.txt @@ -1,754 +1,853 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the danRer11 # # Can use existing photograph (otherwise find one before starting here) ######################################################################### # Initial steps, find photograph (DONE - 2017-05-24 - Chris) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/danRer11 cd ~/kent/src/hg/makeDb/doc/danRer11 sed -e 's/galGal5/danRer11/g; s/GalGal5/DanRer11/g; s/TBD/TBD/g;' \ ../galGal5/initialBuild.txt > initialBuild.txt # the files required are probably already here, take a look into: # /hive/data/outside/ncbi/genomes/refseq/<subSet>/<scientificName>/all_assembly_versions # and merely symlink them in: mkdir -p /hive/data/genomes/danRer11/genbank cd /hive/data/genomes/danRer11/genbank ln -s /hive/data/outside/ncbi/genomes/genbank/vertebrate_other/Danio_rerio/all_assembly_versions/GCA_000002035.4_GRCz11/* ./ # Can use existing photograph # Link to the existing photo. cd ../ ln -s ../danRer10/photoReference.txt . 
# Assembly name: GRCz11 # Description: Genome Reference Consortium Zebrafish Build 11 # Organism name: Danio rerio (zebrafish) # Infraspecific name: strain=Tuebingen # Taxid: 7955 # BioSample: SAMN06930106 # BioProject: PRJNA11776 # Submitter: Genome Reference Consortium # Date: 2017-5-9 # Assembly type: haploid-with-alt-loci # Release type: major # Assembly level: Chromosome # Genome representation: full # RefSeq category: Reference Genome # GenBank assembly accession: GCA_000002035.4 # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000000175.4 Primary Assembly ## GCA_002119155.1 ALT_DRER_TU_1 ## GCA_002119165.1 ALT_DRER_TU_2 ## GCA_002119175.1 ALT_DRER_TU_3 ## GCA_002119185.1 ALT_DRER_TU_4 ############################################################################# # establish config.ra file (DONE - Chris - 2017-05-24) cd /hive/data/genomes/danRer11 ~/kent/src/hg/utils/automation/prepConfig.pl danRer11 vertebrate zebrafish \ genbank/*_assembly_report.txt > danRer11.config.ra # verify it looks sane cat danRer11.config.ra # config parameters for makeGenomeDb.pl: db danRer11 clade vertebrate scientificName Danio rerio commonName Zebrafish assemblyDate May 2017 assemblyLabel Genome Reference Consortium assemblyShortLabel GRCz11 orderKey 26172 mitoAcc NC_002333.2 fastaFiles /hive/data/genomes/danRer11/ucsc/*.fa.gz agpFiles /hive/data/genomes/danRer11/ucsc/*.agp # qualFiles none dbDbSpeciesDir zebrafish photoCreditURL http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Zebrafish&id=79130 photoCreditName NHGRI Press Photos ncbiGenomeId 50 ncbiAssemblyId 1104621 ncbiAssemblyName GRCz11 ncbiBioProject 11776 ncbiBioSample SAMN06930106 genBankAccessionID GCA_000002035.4 taxId 7955 ############################################################################# # setup UCSC named files (DONE - 2017-06-01 - Chris) mkdir /hive/data/genomes/danRer11/ucsc cd /hive/data/genomes/danRer11/ucsc # measure what is in the refseq release: 
faSize ../genbank/*genomic.fna.gz #1679186873 bases (4693618 N's 1674493255 real 861444935 upper 813048320 lower) in 1922 sequences in 1 files #Total size: mean 873666.4 sd 6187603.1 min 650 (KN150525.1) max 78093715 (CM002888.2) median 149621 #N count: mean 2442.0 sd 64890.9 #U count: mean 448202.4 sd 3139997.1 #L count: mean 423022.0 sd 3024074.2 #%48.42 masked total, %48.55 masked real # check for duplicate sequences: time faToTwoBit -noMask ../genbank/*_genomic.fna.gz genbank.2bit # real 0m52.302s twoBitDup genbank.2bit # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this genbank.2bit file # remove it later # new option required to ucscCompositeAgp.pl 2016-04-13 time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../genbank/*_genomic.fna.gz ../genbank/*_assembly_structure/Primary_Assembly # constructing refseq.2bit from ../genbank/GCA_000002035.4_GRCz11_genomic.fna.gz CM002885.2 chr1 CM002886.2 chr2 CM002887.2 chr3 CM002888.2 chr4 CM002889.2 chr5 CM002890.2 chr6 CM002891.2 chr7 CM002892.2 chr8 CM002893.2 chr9 CM002894.2 chr10 CM002895.2 chr11 CM002896.2 chr12 CM002897.2 chr13 CM002898.2 chr14 CM002899.2 chr15 CM002900.2 chr16 CM002901.2 chr17 CM002902.2 chr18 CM002903.2 chr19 CM002904.2 chr20 CM002905.2 chr21 CM002906.2 chr22 CM002907.2 chr23 CM002908.2 chr24 CM002909.2 chr25 real 8m20.666s user 8m33.194s time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../genbank/*_assembly_structure/Primary_Assembly # Are no unlocalized #./unlocalizedWithChroms.pl ../genbank/*_assembly_structure/Primary_Assembly # Handle the alternative sequences time ucscAltAgp ../genbank/GCA_000002035.4_GRCz11_assembly_structure/all_alt_scaffold_placement.txt ../genbank/GCA_000002035.4_GRCz11_assembly_structure/ # Finished creating alt_4.agp # Finished creatingalt_4.fa.gz # Finished creating alt_2.agp # Finished creatingalt_2.fa.gz # Finished creating alt_1.agp # Finished creatingalt_1.fa.gz # 
Finished creating alt_3.agp # Finished creatingalt_3.fa.gz # real 1m49.141s # user 1m47.337s # sys 0m1.237s # verify fasta and AGPs agree time faToTwoBit *.fa.gz test.2bit # real 0m46.414s cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 # All AGP and FASTA entries agree - both files are valid # and no sequence lost from original: twoBitToFa test.2bit stdout | faSize stdin # 1679186873 bases (4693618 N's 1674493255 real 1674493255 upper 0 lower) in 1922 sequences in 1 files # Total size: mean 873666.4 sd 6187603.1 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 149621 # N count: mean 2442.0 sd 64890.9 # U count: mean 871224.4 sd 6162220.4 # L count: mean 0.0 sd 0.0 # %0.00 masked total, %0.00 masked real # same numbers as above (except for upper/lower masking) # no longer need these temporary 2bit files rm test.2bit genbank.2bit ############################################################################# # Initial database build (DONE - 2016-06-01 - Chris) cd /hive/data/genomes/danRer11 # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp danRer11.config.ra) > agp.log 2>&1 # real 1m40.249s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db danRer11.config.ra) > db.log 2>&1 # real 11m9.465s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add danRer11 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/danRer11 ln -s `pwd`/danRer11.unmasked.2bit /gbdb/danRer11/danRer11.2bit ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2017-06-01 - Chris) mkdir /hive/data/genomes/danRer11/bed/cpgIslandsUnmasked cd /hive/data/genomes/danRer11/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ 
-maskedSeq=/hive/data/genomes/danRer11/danRer11.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku danRer11) > do.log 2>&1 # real 6m59.252s cat fb.danRer11.cpgIslandExtUnmasked.txt # 34662385 bases of 1674677181 (2.070%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2017-06-02 - Chris) mkdir /hive/data/genomes/danRer11/bed/cytoBand cd /hive/data/genomes/danRer11/bed/cytoBand makeCytoBandIdeo.csh danRer11 ######################################################################### # ucscToINSDC table/track (DONE - 2016-04-14 - Chris) # the sequence here is working for a 'refseq' assembly with a chrM # situation may be specific depending upon what is available in the assembly mkdir /hive/data/genomes/danRer11/bed/ucscToINSDC cd /hive/data/genomes/danRer11/bed/ucscToINSDC # find accession for chrM grep chrM ../../danRer11.agp #chrM 1 16596 2 F NC_002333.2 1 16596 + # use that accession here: ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../genbank/GCA_*structure/Primary_Assembly NC_002333.2 # Copy over the ucscToINSDC.txt file from idKeys cp ../idKeys/idMap.txt . 
awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab # Put in the chrm refseq: NC_002333.2, INSDC: AC024175.3 manually join name.coordinate.tab <(sort idMap.txt) | tr '[ ]' '[\t]' \ > ucscToINSDC.bed # should be same line counts throughout: wc -l * # 1922 3844 46128 insdc.refseq.txt # 1922 3844 56859 insdcToUcsc.txt # 1923 5769 51957 name.coordinate.tab # 1923 7688 73086 ucscToINSDC.bed # 1922 3844 56859 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 20 # use the 20 in this sed sed -e "s/21/20/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab danRer11 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords danRer11 # should cover %100 entirely: featureBits -countGaps danRer11 ucscToINSDC # 1679203469 bases of 1679203469 (100.000%) in intersection ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2016-10-20 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/zebrafish/danRer11 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" danRer11 \ | sed -e 's/[0-9][0-9]*//; s/\.[0-9]\+$//' | sort | uniq -c 984 AL 4518 BX 5 CAAK 18560 CABZ 3251 CR 1015 CT 3213 CU 31 CZQB 936 FO 873 FP 347 FQ 1 LK 876 LO 6 LT 1 NC_ # implies a rule: '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" danRer11 | wc -l # 34617 hgsql -N -e "select frag from gold;" danRer11 \ | egrep -e '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?' | wc -l # 34617 hgsql -N -e "select frag from gold;" danRer11 \ | egrep -v -e '[ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/zebrafish/danRer11/trackDb.ra searchTable gold shortCircuit 1 termRegex [ABCFLN][A-Z]+[0-9_][0-9]+(\.[0-9]+)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (DONE - 2017-06-07 - Chris) mkdir /hive/data/genomes/danRer11/bed/repeatMasker cd /hive/data/genomes/danRer11/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku danRer11) > do.log 2>&1 # real 254m18.208s # 9 jobs (out of 3664) crashed in the cluster, they were re run manually and # completed fine, taking ~90 minute each. cat faSize.rmsk.txt # 1679203469 bases (4693618 N's 1674509851 real 809208275 upper 865301576 lower) in 1923 sequences in 1 files # Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921 # N count: mean 2440.8 sd 64874.0 # U count: mean 420805.1 sd 2950984.7 # L count: mean 449974.8 sd 3210628.1 # %51.53 masked total, %51.67 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # grep version of RepeatMasker$ /scratch/data/RepeatMasker/RepeatMasker # January 31 2015 (open-4-0-5) version of RepeatMasker # grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl # CC RELEASE 20140131; time featureBits -countGaps danRer11 rmsk # 865410761 bases of 1679203469 (51.537%) in intersection # real 0m28.805s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' danRer11 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 865410761.000000 # real 0m28.896s ########################################################################## # running simple repeat (DONE - 2017-06-07 - Chris) mkdir /hive/data/genomes/danRer11/bed/simpleRepeat cd /hive/data/genomes/danRer11/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409 3 danRer11) > do.log 2>&1 # real 14m55.497s cat fb.simpleRepeat # 108632714 bases of 1674677181 (6.487%) in intersection cd /hive/data/genomes/danRer11 # using the Window Masker result: #twoBitMask bed/windowMasker/danRer11.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed danRer11.2bit # you can safely ignore the warning about fields >= 13 # add to rmsk after it is done: twoBitMask danRer11.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed danRer11.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa danRer11.2bit stdout | faSize stdin > faSize.danRer11.2bit.txt cat faSize.danRer11.2bit.txt # 1679203469 bases (4693618 N's 1674509851 real 807728713 upper 866781138 lower) in 1923 sequences in 1 files # Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921 # %51.62 masked total, %51.76 masked real rm /gbdb/danRer11/danRer11.2bit ln -s `pwd`/danRer11.2bit /gbdb/danRer11/danRer11.2bit ######################################################################### # CREATE MICROSAT TRACK (DONE - 2017-06-04 - Chris) ssh hgwdev mkdir /cluster/data/danRer11/bed/microsat cd /cluster/data/danRer11/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, 
$16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed danRer11 microsat microsat.bed # Read 127131 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2017-06-07 - Chris) mkdir /hive/data/genomes/danRer11/bed/windowMasker cd /hive/data/genomes/danRer11/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev danRer11) > do.log 2>&1 # real 53m45.183s # Masking statistics cat faSize.danRer11.cleanWMSdust.txt # 1679203469 bases (4693618 N's 1674509851 real 844135283 upper 830374568 lower) in 1923 sequences in 1 files # Total size: mean 873220.7 sd 6186024.0 min 650 (chrUn_KN150525v1) max 78093715 (chr4) median 146921 # N count: mean 2440.8 sd 64874.0 # U count: mean 438967.9 sd 3081301.2 # L count: mean 431812.0 sd 3080639.1 # %49.45 masked total, %49.59 masked real cat fb.danRer11.rmsk.windowmaskerSdust.txt # 638473395 bases of 1679203469 (38.022%) in intersection ########################################################################## # run up idKeys files for ncbiRefSeq (DONE - 2017-03-28 - Chris) mkdir /hive/data/genomes/danRer11/bed/idKeys cd /hive/data/genomes/danRer11/bed/idKeys time (doIdKeys.pl -buildDir=`pwd` danRer11) > do.log 2>&1 & # real 12m20.939s cat danRer11.keySignature.txt # 8ae219619932349bebc17364408bf9d0 mkdir genbank time (doIdKeys.pl -buildDir=`pwd` -twoBit=`pwd`/genbank.2bit genbank) > do.log 2>&1 & cd ../ join danRer11.idKeys.txt genbank/genbank.idKeys.txt | awk '{print $2"\t"$3}' > idMap.txt ########################################################################## # cpgIslands - (DONE - 2016-10-20 - Hiram) mkdir /hive/data/genomes/danRer11/bed/cpgIslands cd /hive/data/genomes/danRer11/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku danRer11) > do.log 2>&1 # real 5m24.930s cat fb.danRer11.cpgIslandExt.txt # 5261052 bases of 1674677181 (0.314%) in 
intersection ############################################################################## # genscan - (DONE - 2017-06-07 - Chris) mkdir /hive/data/genomes/danRer11/bed/genscan cd /hive/data/genomes/danRer11/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku danRer11) > do.log 2>&1 # real 66m57.728s cat fb.danRer11.genscan.txt # 57461966 bases of 1674677181 (3.431%) in intersection cat fb.danRer11.genscanSubopt.txt # 45875776 bases of 1674677181 (2.739%) in intersection ############################################################################# # lastz/chain/net swap human/hg38 (DONE - 2017-03-30 - Chris) # original alignment mkdir /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 cd /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12 # 41036733 bases of 3049335806 (1.346%) in intersection mkdir /hive/data/genomes/danRer11/bed/blastz.hg38.swap cd /hive/data/genomes/danRer11/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzDanRer11.2017-06-12/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -noDbNameCheck -syntenicNet) > swap.log 2>&1 # real 39m24.916s cat fb.danRer11.chainHg38Link.txt # 47869194 bases of 1674677181 (2.858%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 hg38) \ > rbest.log 2>&1 & # real 638m45.337s ############################################################################# # lastz/chain/net swap mouse/mm10 (DONE - 2017-03-30 - Chris) cd /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 cat fb.mm10.chainDanRer11Link.txt # 36448414 bases of 2652783500 (1.374%) in intersection # and for the swap mkdir /hive/data/genomes/danRer11/bed/blastz.mm10.swap cd /hive/data/genomes/danRer11/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku 
\ -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \ > swap.log 2>&1 & # real 53m19.485s cat fb.danRer11.chainMm10Link.txt # 90150612 bases of 1369865365 (6.581%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` danRer11 mm10) \ > rbest.log 2>&1 & # real 597m52.740s ######################################################################### # Create kluster run files (DONE - 2017-06-08 - Chris) # numerator is danRer11 gapless bases "real" as reported by: featureBits -noRandom -noHap danRer11 gap # 4323788 bases of 1340794641 (0.322%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 1340794641 / 2861349177 \) \* 1024 # ( 1340794641 / 2861349177 ) * 1024 = 479.834381 # ==> use -repMatch=500 according to size scaled down from 1024 for human. # and rounded up to nearest 50 cd /hive/data/genomes/danRer11 blat danRer11.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/danRer11.11.ooc \ -repMatch=500 # Wrote 57994 overused 11-mers to jkStuff/danRer11.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' danRer11 \ | sort -k7,7nr | ave -col=7 stdin # minimum gap size is 10 and produces a reasonable number of lifts gapToLift -verbose=2 -minGap=100 danRer11 jkStuff/nonBridged.lft \ -bedFile=jkStuff/nonBridged.bed ######################################################################## # GENBANK AUTO UPDATE (DONE - 2017-06-14 - Chris) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt # Danio rerio 30816 1488354 15648 # edit etc/genbank.conf to add danRer11 just before rheMac2 # danRer11 (zebrafish) danRer11.serverGenome = /hive/data/genomes/danRer11/danRer11.2bit danRer11.clusterGenome = 
/hive/data/genomes/danRer11/danRer11.2bit danRer11.ooc = /hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc danRer11.lift = /hive/data/genomes/danRer11/jkStuff/nonBridged.lft danRer11.perChromTables = no danRer11.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} danRer11.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} danRer11.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} danRer11.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} danRer11.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} # DO NOT NEED genbank.mrna.xeno except for human, mouse # defaults are fine: genbank.mrna.native refseq.mrna.native refseq.mrna.xeno yes # and genbank.est.native danRer11.genbank.mrna.xeno.load = yes danRer11.refseq.mrna.xeno.load = yes danRer11.downloadDir = danRer11 # danRer11.upstreamGeneTbl = refGene # danRer11.upstreamMaf = multiz7way # /hive/data/genomes/danRer10/bed/multiz7way/species.lst git commit -m "Added danRer11; refs #19459" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update screen # control this business with a screen since it takes a while cd /cluster/data/genbank time ./bin/gbAlignStep -initial danRer11 # logFile: var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log # real 2983m0.012s tail var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log # hgwdev 2017.06.09-15:24:14 danRer11.initalign: Succeeded: danRer11 # hgwdev 2017.06.09-15:32:33 danRer11.initalign: finish # To re-do, rm the dir first: # /cluster/data/genbank/work/initial.danRer11 # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad danRer11 tail -1 var/build/logs/2017.06.07-13:49:34.danRer11.initalign.log # hgwdev 2017.06.09-15:32:33 danRer11.initalign: finish # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add danRer11 to: # 
etc/align.dbs etc/hgwdev.dbs git add etc/align.dbs etc/hgwdev.dbs git commit -m "Added danRer11 - zebrafish refs #19459" etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################# # augustus gene track (DONE - 2017-06-09 - Chris) mkdir /hive/data/genomes/danRer11/bed/augustus cd /hive/data/genomes/danRer11/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=zebrafish -dbHost=hgwdev \ -workhorse=hgwdev danRer11) > do.log 2>&1 cat fb.danRer11.augustusGene.txt # 53437277 bases of 1674677181 (3.191%) in intersection ######################################################################### # add chromAlias table (DONE - 2017-10-20 - Hiram) mkdir /hive/data/genomes/danRer11/bed/chromAlias cd /hive/data/genomes/danRer11/bed/chromAlias hgsql -N -e 'select chrom,name,"refseq" from ucscToRefSeq;' danRer11 \ > ucsc.refseq.tab hgsql -N -e 'select chrom,name,"genbank" from ucscToINSDC;' danRer11 \ > ucsc.genbank.tab awk '{printf "%s\t%s\t%s\n", $2,$1,$3}' ucsc.genbank.tab ucsc.refseq.tab \ | sort > danRer11.chromAlias.tab # verify chrM is OK: grep chrM danRer11.chromAlias.tab # AC024175.3 chrM genbank # NC_002333.2 chrM refseq hgLoadSqlTab danRer11 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ danRer11.chromAlias.tab ######################################################################### # LIFTOVER TO danRer10 (DONE - 2017-10-25 - Hiram) ssh hgwdev mkdir /hive/data/genomes/danRer11/bed/blat.danRer10.2017-10-25 cd /hive/data/genomes/danRer11/bed/blat.danRer10.2017-10-25 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ danRer11 danRer10) > do.log 2>&1 # real 230m19.256s # verify the convert link on the test browser is now active from danRer11 to # danRer10 ######################################################################### # LIFTOVER TO danRer10 (DONE - 
2017-06-12 - Chris) ssh hgwdev mkdir /hive/data/genomes/danRer11/bed/blat.danRer10.2017-06-09 cd /hive/data/genomes/danRer11/bed/blat.danRer10.2017-06-09 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \ danRer11 danRer10) > doLiftOverToDanRer10.log 2>&1 # real 86m43.038s # see if the liftOver menus function in the browser from danRer11 to danRer10 ######################################################################### # BLATSERVERS ENTRY (DONE - 2017-05-19 - Chris) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev # Haifang # Done. # Starting trans gfServer for danRer11 on host blat1a, port 17874 # Starting untrans gfServer for danRer11 on host blat1a, port 17875 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("danRer11", "blat1a", "17874", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("danRer11", "blat1a", "17875", "0", "1");' \ hgcentraltest # test it with some sequence ######################################################################### # all.joiner update, downloads and in pushQ - (DONE - 2017-06-09 - Chris) cd $HOME/kent/src/hg/makeDb/schema # fixup all.joiner until this is a clean output joinerCheck -database=danRer11 -tableCoverage all.joiner joinerCheck -database=danRer11 -times all.joiner joinerCheck -database=danRer11 -keys all.joiner cd /hive/data/genomes/danRer11 time (makeDownloads.pl danRer11) > downloads.log 2>&1 # real 27m1.207s # New Redmine pushQ System # now ready for pushQ entry mkdir /hive/data/genomes/danRer11/pushQ cd /hive/data/genomes/danRer11/pushQ time (makePushQSql.pl -redmineList danRer11) > danRer11.pushQ.sql 2> stderr.out #real 3m32.847s # Put a path to the files produced here in the redmine ticket ######################################################################### # UCSC to RefSeq name correspondence (DONE - 2017-10-20 - Hiram) 
mkdir /hive/data/genomes/danRer11/bed/ucscToRefSeq cd /hive/data/genomes/danRer11/bed/ucscToRefSeq # after constructing idKeys in ./refseq/ directory: join -t$'\t' ../idKeys/*.idKeys.txt refseq/*.idKeys.txt \ | cut -f2- | sort > ucsc.refseq.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab join ucsc.refseq.tab name.coordinate.tab \ | awk '{printf "%s\t%d\t%d\t%s\n", $1,$3,$4,$2}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.bed join -2 2 refseq.insdc.txt ucscToRefSeq.txt | tr '[ ]' '[\t]' | sort -k3 \ | join -2 3 name.coordinate.tab - | tr '[ ]' '[\t]' | cut -f1-4 \ > ucscToRefSeq.bed # when working perfectly, all these tab files have the same line count: wc -l *.tab *.bed # 1923 name.coordinate.tab # 1923 ucsc.refseq.tab # 1923 ucscToRefSeq.bed export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab danRer11 ucscToRefSeq ./ucscToRefSeq.sql ucscToRefSeq.bed checkTableCoords danRer11 -table=ucscToRefSeq # should cover %100 all bases: featureBits -countGaps danRer11 ucscToRefSeq # 1679203469 bases of 1679203469 (100.000%) in intersection ######################################################################### +# LIFTOVER TO danRer7 (DONE - 2022-08-04 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/danRer11/bed/blat.danRer7.2022-08-04 + cd /hive/data/genomes/danRer11/bed/blat.danRer7.2022-08-04 + time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ + -ooc=/hive/data/genomes/danRer11/jkStuff/danRer11.11.ooc \ + -target2Bit=/hive/data/genomes/danRer11/danRer11.2bit \ + -targetSizes=/hive/data/genomes/danRer11/chrom.sizes \ + -query2Bit=/hive/data/genomes/danRer7/danRer7.2bit \ + -querySizes=/hive/data/genomes/danRer7/chrom.sizes \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + danRer11 danRer7) > do.log 2>&1 + # real 230m19.256s + + # verify the 
convert link on the test browser is now active from danRer11 to + # danRer7 + +############################################################################## +# crispr whole genome (DONE - 2023-11-04 - Hiram) + # redmine issue 21863: https://redmine.soe.ucsc.edu/issues/21863 + + mkdir /hive/data/genomes/danRer11/bed/crisprAll + cd /hive/data/genomes/danRer11/bed/crisprAll + + # make sure it can get started + time (~/kent/src/hg/utils/automation/doCrispr.pl \ + -stop=guides -buildDir=`pwd` -smallClusterHub=hgwdev danRer11) \ + > guides.log 2>&1 + # real 37m57.115s + + sed -e 's/^/# /;' guides/run.time +# Completed: 99 of 99 jobs +# CPU time in finished jobs: 6868s 114.47m 1.91h 0.08d 0.000 y +# IO & Wait Time: 234s 3.89m 0.06h 0.00d 0.000 y +# Average job time: 72s 1.20m 0.02h 0.00d +# Longest finished job: 101s 1.68m 0.03h 0.00d +# Submission to last job: 102s 1.70m 0.03h 0.00d + + # looks good, let it run through the load: + time (~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScoreJobList \ + -stop=load -buildDir=`pwd` -smallClusterHub=hgwdev danRer11) \ + > load.log 2>&1 + # real 2471m6.435s + + sed -e 's/^/# /;' specScores/run.time +# Completed: 721425 of 721425 jobs +# CPU time in finished jobs: 44390040s 739834.00m 12330.57h 513.77d 1.408 y +# IO & Wait Time: 872244s 14537.40m 242.29h 10.10d 0.028 y +# Average job time: 63s 1.05m 0.02h 0.00d +# Longest finished job: 148s 2.47m 0.04h 0.00d +# Submission to last job: 112835s 1880.58m 31.34h 1.31d + + sed -e 's/^/# /;' effScores/run.time +# Completed: 11644 of 11644 jobs +# CPU time in finished jobs: 5938619s 98976.98m 1649.62h 68.73d 0.188 y +# IO & Wait Time: 73259s 1220.99m 20.35h 0.85d 0.002 y +# Average job time: 516s 8.61m 0.14h 0.01d +# Longest finished job: 2250s 37.50m 0.62h 0.03d +# Submission to last job: 9850s 164.17m 2.74h 0.11d + + sed -e 's/^/# /;' offTargets/run.time +# Completed: 36072 of 36072 jobs +# CPU time in finished jobs: 492565s 8209.42m 136.82h 5.70d 0.016 y +# IO & Wait Time: 
126854s 2114.23m 35.24h 1.47d 0.004 y +# Average job time: 17s 0.29m 0.00h 0.00d +# Longest finished job: 25s 0.42m 0.01h 0.00d +# Submission to last job: 1763s 29.38m 0.49h 0.02d + + # that made the table crispr10K and symlinks in /gbdb/danRer11/crispr10K/ + # when it should have been instead crisprAll, reset the links and reload + # the correct table: +mkdir -p /gbdb/danRer11/crisprAll/ +rm -f /gbdb/danRer11/crisprAll/crispr.bb +rm -f /gbdb/danRer11/crisprAll/crisprDetails.tab +ln -sf `pwd`/crispr.bb /gbdb/danRer11/crisprAll/crispr.bb +ln -sf `pwd`/crisprDetails.tab /gbdb/danRer11/crisprAll/crisprDetails.tab +hgBbiDbLink danRer11 crisprAllTargets /gbdb/danRer11/crisprAll/crispr.bb + + hgsql -e 'drop table crispr10K;' danRer11 + + grep -c . effScores.tab + # 116454428 + grep -c . specScores.tab + # 53058151 + + time (cut -f1-3 crispr.bed | bedSingleCover.pl stdin \ + | awk '{sum+=$3-$2}END{printf "%d bases\n", sum}') \ + > coverage.crispr.bed.txt 2>&1 + 1145886525 bases + # real 5m14.538s + + ave -col=2 ../../*.sizes | grep total + total 1679203469.000000 + + # 'featureBits' result: + echo "scale+=3; 100.0 * 1145886525 / 1679203469" | bc + 68.239 + +##############################################################################