180fdd5b8ed0b1d93cde304e58816ee64eb9f7f0 hiram Mon Aug 17 12:22:20 2020 -0700 have 2bit masked with custom repeat library refs #23367 diff --git src/hg/makeDb/doc/ambMex2/initialBuild.txt src/hg/makeDb/doc/ambMex2/initialBuild.txt index d5a42df..13ce20c 100644 --- src/hg/makeDb/doc/ambMex2/initialBuild.txt +++ src/hg/makeDb/doc/ambMex2/initialBuild.txt @@ -1,974 +1,1061 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the ambMex2 # Can use existing photograph (otherwise find one before starting here) ######################################################################### # Initial steps, find photograph (DONE - 2019-03-26 - Hiram) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/ambMex2 cd ~/kent/src/hg/makeDb/doc/ambMex2 sed -e 's/rouAeg1/ambMex2/g; s/RouAeg1/AmbMex2/g; s/DONE/TBD/g;' \ ../galGal6/initialBuild.txt > initialBuild.txt mkdir -p /hive/data/genomes/ambMex2/genbank cd /hive/data/genomes/ambMex2 # Can use existing photograph cp -p ../ambMex1/photoReference.txt ./ cat photoReference.txt photoCreditURL https://www.flickr.com/people/35871148@N04 photoCreditName Ruben Undheim/Flickr ## download from NCBI cd /hive/data/genomes/ambMex2/genbank time rsync --stats -L -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_other/Ambystoma_mexicanum/all_assembly_versions/GCA_002915635.2_ASM291563v2/ ./ # real 8m10.720s # this information is from the top of # ambMex2/genbank/GCA_002915635.2_ASM291563v2_assembly_report.txt # Assembly name: ASM291563v2 # Organism name: Ambystoma mexicanum (axolotl) # Infraspecific name: strain=DD151 # Sex: male # Taxid: 8296 # BioSample: SAMN06554622 # BioProject: PRJNA378970 # Submitter: Max Planck Society/University of Kentucky # Date: 2018-12-04 # Assembly type: haploid # Release type: major # Assembly level: Chromosome # Genome representation: full # WGS project: PGSH01 # Assembly method: MARVEL v. 2016-10-10; Joinmap v. 
4.1; AllMaps MAY-2018 # Expected final version: no # Genome coverage: 30.0x # Sequencing technology: PacBio # RefSeq category: Representative Genome # GenBank assembly accession: GCA_002915635.2 # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_002915645.2 Primary Assembly # check assembly size for later reference: faSize G*v2_genomic.fna.gz # 32396370977 bases (4029676509 N's 28366694468 real 28365740082 upper # 954386 lower) in 98070 sequences in 1 files # Total size: mean 330339.3 sd 20104120.1 min 1033 (PGSH01113832.1) # max 2030161756 (CM010939.1) median 40921 # %0.00 masked total, %0.00 masked real # real 6m32.968s ############################################################################# # establish config.ra file (TBD - Hiram - 2018-10-11) cd /hive/data/genomes/ambMex2 ~/kent/src/hg/utils/automation/prepConfig.pl ambMex2 vertebrate axolotl \ genbank/*_assembly_report.txt > ambMex2.config.ra # compare with previous version to see if it is sane: diff ambMex2.config.ra ../ambMex1/ambMex1.config.ra # verify it really does look sane cat ambMex2.config.ra # config parameters for makeGenomeDb.pl: db ambMex2 clade vertebrate # genomeCladePriority 70 scientificName Ambystoma mexicanum commonName Axolotl assemblyDate Dec. 
2018 assemblyLabel Max Planck Society/University of Kentucky assemblyShortLabel ASM291563v2 orderKey 1943 # no mito sequence needed mitoAcc none fastaFiles /hive/data/genomes/ambMex2/ucsc/*.fa.gz agpFiles /hive/data/genomes/ambMex2/ucsc/*.agp # qualFiles none dbDbSpeciesDir axolotl photoCreditURL https://www.flickr.com/people/35871148@N04 photoCreditName Ruben Undheim/Flickr ncbiGenomeId 381 ncbiAssemblyId 2130471 ncbiAssemblyName ASM291563v2 ncbiBioProject 378970 ncbiBioSample SAMN06554622 genBankAccessionID GCA_002915635.2 taxId 8296 ############################################################################# # setup UCSC named files (TBD - 2018-10-11 - Hiram) mkdir /hive/data/genomes/ambMex2/ucsc cd /hive/data/genomes/ambMex2/ucsc # check for duplicate sequences: time faToTwoBit -long -noMask ../genbank/G*v2_genomic.fna.gz genbank.2bit # real 7m9.731s time twoBitDup genbank.2bit # real 2m3.641s # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this refseq.2bit file # remove it later time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../genbank/G*v2_genomic.fna.gz \ ../genbank/*_assembly_structure/Primary_Assembly CM010927.1 chr1P CM010928.1 chr1Q CM010929.1 chr2P CM010930.1 chr2Q CM010931.1 chr3P CM010932.1 chr3Q CM010933.1 chr4P CM010934.1 chr4Q CM010935.1 chr5P CM010936.1 chr5Q CM010937.1 chr6P CM010938.1 chr6Q CM010939.1 chr7 CM010940.1 chr8 CM010941.1 chr9 CM010942.1 chr10 CM010943.1 chr11 CM010944.1 chr12 CM010945.1 chr13 CM010946.1 chr14 real 96m6.237s time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../genbank/*_assembly_structure/Primary_Assembly # processed 98050 sequences into chrUn.fa.gz # real 82m20.093s # there are no unlocalized sequences time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ ../genbank/*_assembly_structure/Primary_Assembly # can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf at 
/cluster/home/hiram/kent/src/hg/utils/automation/unlocalizedWithChroms.pl line 23. # using mitochondrions NC_005797.1 to be specified on conf.ra file # verify fasta and AGPs agree time faToTwoBit -long *.fa.gz test.2bit # time cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 # All AGP and FASTA entries agree - both files are valid # real 2m51.784s XXX # and no sequence lost from orginal: twoBitToFa test.2bit stdout | faSize stdin # 1065365425 bases (9784466 N's 1055580959 real 1055580959 upper 0 lower) # in 464 sequences in 1 files # Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) # max 197608386 (chr1) median 10066 # same numbers as above (except for upper/lower masking) # 1065365425 bases (9784466 N's 1055580959 real 838536335 upper # 217044624 lower) in 464 sequences in 1 files # Total size: mean 2296046.2 sd 14494999.8 min 87 (NW_020109844.1) # max 197608386 (NC_006088.5) median 10066 # no longer need these temporary 2bit files rm test.2bit refseq.2bit ############################################################################# # Initial database build (DONE - 2019-04-12 - Hiram) # run this in debug mode so the jkStuff/makeUnmasked2bit.csh # script can be fixed up to add -long to the faToTwoBit command time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -debug -stop=seq ambMex2.config.ra) > seq.log 2>&1 # then, running the procedure: chmod +x jkStuff/*.csh ./jkStuff/getMito.csh time (./jkStuff/makeUnmasked2bit.csh ) >> seq.log 2>&1 & # real 24m36.006s # verify sequence and AGP are OK: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=agp -stop=agp ambMex2.config.ra) > agp.log 2>&1 # real 0m46.829s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db ambMex2.config.ra) > db.log 2>&1 # real 154m47.941s # trouble with the trackDb make, new file required in trackDb,fix the script time 
(~/kent/src/hg/utils/automation/makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=trackDb ambMex2.config.ra) > trackDb.log 2>&1 # real 0m12.044s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add ambMex2 to trackDb/makefile # temporary symlink until masked sequence is available cd /hive/data/genomes/ambMex2 ln -s `pwd`/ambMex2.unmasked.2bit /gbdb/ambMex2/ambMex2.2bit ############################################################################## # cpgIslands on UNMASKED sequence (TBD - 2018-10-11 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/cpgIslandsUnmasked cd /hive/data/genomes/ambMex2/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/ambMex2/ambMex2.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku ambMex2) > do.log 2>&1 XXX - running - Fri Apr 12 23:24:42 PDT 2019 +XXX - something is too large: +MALLOC failure reqesting -2147483648 bytes - aborting + # real 2m11.881s cat fb.ambMex2.cpgIslandExtUnmasked.txt # 27399280 bases of 1055588482 (2.596%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2019-04-12 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/cytoBand cd /hive/data/genomes/ambMex2/bed/cytoBand makeCytoBandIdeo.csh ambMex2 ############################################################################# -# run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2019-04-12 - Hiram) +# run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2019-04-15 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/idKeys cd /hive/data/genomes/ambMex2/bed/idKeys time (doIdKeys.pl \ -twoBit=/hive/data/genomes/ambMex2/ambMex2.unmasked.2bit \ -buildDir=`pwd` ambMex2) > do.log 2>&1 & -XXX - running - Fri Apr 12 23:26:32 PDT 2019 - # real 0m47.105s + # real 29m20.505s cat ambMex2.keySignature.txt - # 7850e2d5dabb6134fdc9d7083f1a3a54 + # 
72abcdcc8a28b54cad2ff751c3494bed ############################################################################# -# gapOverlap (DONE - 2019-04-12 - Hiram) +# gapOverlap (DONE - 2019-04-15 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/gapOverlap cd /hive/data/genomes/ambMex2/bed/gapOverlap time (doGapOverlap.pl \ -twoBit=/hive/data/genomes/ambMex2/ambMex2.unmasked.2bit ambMex2 ) \ > do.log 2>&1 & -XXX - running - Fri Apr 12 23:26:32 PDT 2019 - # real 1m40.205s + # real 4m30.732s - # results are empty, there are none found. + # only a few: + wc -l bed.tab + # 64 bed.tab cat fb.ambMex2.gapOverlap.txt - # 97216 bases of 2615516299 (0.004%) in intersection + # 16776 bases of 32396387346 (0.000%) in intersection ############################################################################# # tandemDups (DONE - 2019-04-12 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/tandemDups cd /hive/data/genomes/ambMex2/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/ambMex2/ambMex2.unmasked.2bit ambMex2) \ > do.log 2>&1 & XXX - running - Fri Apr 12 23:26:32 PDT 2019 # real 97m29.383s cat fb.ambMex2.tandemDups.txt # 24887623 bases of 1065365425 (2.336%) in intersection bigBedInfo ambMex2.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 346,400 # primaryDataSize: 8,843,385 # primaryIndexSize: 38,860 # zoomLevels: 9 # chromCount: 407 # basesCovered: 114,644,428 # meanDepth (of bases covered): 21.207643 # minDepth: 1.000000 # maxDepth: 298.000000 # std of depth: 35.518221 ######################################################################### # ucscToINSDC and ucscToRefSeq table/track (TBD - 2018-10-11 - Hiram) # construct idKeys for the refseq sequence mkdir /hive/data/genomes/ambMex2/refseq/idKeys cd /hive/data/genomes/ambMex2/refseq/idKeys faToTwoBit ../G.*v2_genomic.fna.gz ambMex2.refSeq.2bit time (doIdKeys.pl -buildDir=`pwd` \ 
-twoBit=`pwd`/ambMex2.refSeq.2bit refseqAmbMex2) > do.log 2>&1 & # real 0m48.786s cat refseqAmbMex2.keySignature.txt # 7850e2d5dabb6134fdc9d7083f1a3a54 # and the genbank sequence needs keys too: mkdir /hive/data/genomes/ambMex2/refseq/idKeysGenbank cd /hive/data/genomes/ambMex2/refseq/idKeysGenbank faToTwoBit /hive/data/outside/ncbi/genomes/genbank/vertebrate_other/Gallus_gallus/all_assembly_versions/GCA_000002315.5_GRCg6a/GCA_000002315.5_GRCg6a_genomic.fna.gz ambMex2.genbank.2bit time (doIdKeys.pl -buildDir=`pwd` \ -twoBit=`pwd`/ambMex2.genbank.2bit genbankAmbMex2) > do.log 2>&1 & cat genbankAmbMex2.keySignature.txt # a20fdad3318d371fcb34fcc66bab3752 mkdir /hive/data/genomes/ambMex2/bed/chromAlias join -t$'\t' ../idKeys/ambMex2.idKeys.txt \ ../../refseq/idKeysGenbank/genbankAmbMex2.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.bed join -t$'\t' ../idKeys/ambMex2.idKeys.txt \ ../../refseq/idKeys/refseqAmbMex2.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.bed # should be same line counts throughout: wc -l * ../../chrom.sizes # 463 ucscToINSDC.bed # 464 ucscToRefSeq.bed # 464 ../../chrom.sizes # need to find the accession for the INSDC equivalent to chrM: egrep chrM * # ucscToRefSeq.bed:chrM 0 16775 NC_001323.1 # lookup that accession at NCBI Entrez: X52392.1 # and add to ucscToINSDC.bed: printf "chrM\t0\t16775\tX52392.1\n" >> ucscToINSDC.bed # verify: grep chrM * # ucsc.genbank.tab:chrM X52392.1 # ucsc.refseq.tab:chrM NC_001323.1 # ucscToINSDC.bed:chrM 0 16775 X52392.1 # ucscToRefSeq.bed:chrM 0 16775 NC_001323.1 export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 27 # use the $chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | 
hgLoadSqlTab ambMex2 ucscToINSDC stdin ucscToINSDC.bed # should be the same for ucscToRefSeq: export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 27 sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab ambMex2 ucscToRefSeq stdin ucscToRefSeq.bed # should be quiet for all OK checkTableCoords ambMex2 # should cover %100 entirely: featureBits -countGaps ambMex2 ucscToINSDC # 1065365425 bases of 1065365425 (100.000%) in intersection featureBits -countGaps ambMex2 ucscToRefSeq # 1065365425 bases of 1065365425 (100.000%) in intersection ######################################################################### # add chromAlias table (TBD - 2018-10-12 - ChrisL) mkdir /hive/data/genomes/ambMex2/bed/chromAlias cd /hive/data/genomes/ambMex2/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' ambMex2 \ | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' ambMex2 \ | sort -k1,1 > ucsc.genbank.tab ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16 join -t$'\t' ../idKeys/ambMex2.idKeys.txt \ ../../ens95/ensAmbMex2.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToEns.bed cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab wc -l *.bed # 2210 ucscToEns.bed # 2211 ucscToINSDC.bed # 2211 ucscToRefSeq.bed ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > ambMex2.chromAlias.tab for t in refseq genbank ensembl do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t ambMex2.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 464 =? 464 OK # checking genbank: 464 =? 464 OK # checking ensembl: 464 =? 
464 OK hgLoadSqlTab ambMex2 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ ambMex2.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (TBD - 2018-10-11 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/chicken/ambMex2 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" ambMex2 \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c 1519 AADN.1 124 AC.1 313 AC.2 328 AC.3 74 AC.4 20 AC.5 1 AC.6 1 NC_.1 # implies a rule: '[AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" ambMex2 | wc -l # 2380 hgsql -N -e "select frag from gold;" ambMex2 \ | egrep -e '[AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)?' | wc -l # 2380 hgsql -N -e "select frag from gold;" ambMex2 \ | egrep -v -e '[AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)?' | wc -l # 0 # hence, add to trackDb/chicken/ambMex2/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (DONE - 2018-04-12 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/repeatMasker cd /hive/data/genomes/ambMex2/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku ambMex2) > do.log 2>&1 -XXX - running - Fri Apr 12 23:27:57 PDT 2019 - # real 48m25.181s + # real 216m7.175s cat faSize.rmsk.txt -# 1065365425 bases (9784466 N's 1055580959 real 922186059 upper -# 133394900 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %12.52 masked total, %12.64 masked real +# 32396387346 bases (4029676509 N's 28366710837 real 28112571951 upper +# 254138886 lower) in 98071 sequences in 1 files +# Total size: mean 330336.1 sd 20104017.6 min 1033 (chrUn_PGSH01113832v1) +# max 2030161756 (chr7) median 40920 +# %0.78 masked total, %0.90 masked real egrep -i "versi|relea" do.log - # RepeatMasker version open-4.0.7 - # February 01 2017 (open-4-0-7) 1.331 version of RepeatMasker - # CC Dfam_Consensus RELEASE 20170127; * - # CC RepBase RELEASE 20170127; +# RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ +# February 01 2017 (open-4-0-8) 1.332 version of RepeatMasker +# CC Dfam_Consensus RELEASE 20181026; * +# CC RepBase RELEASE 20181026; +XXX - this standard run is useless, note the custom library used next procedure time featureBits -countGaps ambMex2 rmsk # 133395265 bases of 1065365425 (12.521%) in intersection # real 0m4.226s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' ambMex2 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 133395265.000000 # real 0m3.198s -########################################################################## -# running simple repeat (DONE - 2019-04-12 - Hiram) +############################################################################### +# running repeat masker (DONE - 2020-06-19 - 2020-08-15 - Hiram) + # using a custom library from Jeramiah Smith, developed with + # RepeatModeler + + mkdir /hive/data/genomes/ambMex2/bed/repeatModeler + cd /hive/data/genomes/ambMex2/bed/repeatModeler + + # note the file used for customLib, this took almost two months running + # time with little interference on the ku kluster + + doRepeatMasker.pl -buildDir=`pwd` -customLib=`pwd`/LTRs_all_repeats.fa \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -smallClusterHub=hgwdev ambMex2 + cat run.cluster/run.time +# Completed: 65638 of 65638 jobs +# CPU time in finished jobs: 4047318392s 67455306.53m 1124255.11h 46843.96d 128.340 y +# IO & Wait Time: 11101559s 185025.99m 3083.77h 128.49d 0.352 y +# Average job time: 61830s 1030.51m 17.18h 0.72d +# Longest finished job: 77503s 1291.72m 21.53h 0.90d +# Submission to last job: 4811964s 80199.40m 1336.66h 55.69d + + # continuing after the kluster run is complete: + doRepeatMasker.pl -buildDir=`pwd` -customLib=`pwd`/LTRs_all_repeats.fa \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -continue=cat -smallClusterHub=hgwdev ambMex2 + # real 329m25.992s + + # much better result with this custom library: + cat faSize.rmsk.txt +# 32396387346 bases (4029676509 N's 28366710837 real 10003444277 upper +# 18363266560 lower) in 98071 sequences in 1 files +# Total size: mean
330336.1 sd 20104017.6 min 1033 (chrUn_PGSH01113832v1) +# max 2030161756 (chr7) median 40920 +# %56.68 masked total, %64.74 masked real + + egrep -i "versi|relea" do.log +# RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ +# CC Dfam_Consensus RELEASE 20181026; * +# CC RepBase RELEASE 20181026; * + + time featureBits -countGaps ambMex2 rmsk + # 18368951822 bases of 32396387346 (56.701%) in intersection + # real 4m34.562s + + # why is it different than the faSize above ? + # because rmsk masks out some N's as well as bases, the faSize count above + # separates out the N's from the bases, it doesn't show lower case N's + + # faster way to get the same result on high contig count assemblies: + time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' ambMex2 \ + | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" + # total 18368951822.000000 + # real 2m8.428s + +############################################################################### +# running simple repeat (DONE - 2019-04-15 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/simpleRepeat cd /hive/data/genomes/ambMex2/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409=6 ambMex2) > do.log 2>&1 -XXX - running - Fri Apr 12 23:28:56 PDT 2019 - # real 58m3.288s + # real 30m12.201s cat fb.simpleRepeat - # 31110690 bases of 1055588482 (2.947%) in intersection + # 1399134851 bases of 32393621946 (4.319%) in intersection cd /hive/data/genomes/ambMex2 - # using the Window Masker result: + # if using the Window Masker result: cd /hive/data/genomes/ambMex2 twoBitMask bed/windowMasker/ambMex2.cleanWMSdust.2bit \ -add bed/simpleRepeat/trfMask.bed ambMex2.2bit # you can safely ignore the warning about fields >= 13 - # add to rmsk after it is done: -# twoBitMask ambMex2.rmsk.2bit \ -# -add bed/simpleRepeat/trfMask.bed ambMex2.2bit + # or using RepeatMasker result add to rmsk after it is 
done: + twoBitMask ambMex2.rmsk.2bit \ + -add bed/simpleRepeat/trfMask.bed ambMex2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa ambMex2.2bit stdout | faSize stdin > faSize.ambMex2.2bit.txt cat faSize.ambMex2.2bit.txt -# 1065365425 bases (9784466 N's 1055580959 real 829559086 upper -# 226021873 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %21.22 masked total, %21.41 masked real +# 32396387346 bases (4029676509 N's 28366710837 real 9998218507 upper +# 18368492330 lower) in 98071 sequences in 1 files +# Total size: mean 330336.1 sd 20104017.6 min 1033 (chrUn_PGSH01113832v1) +# max 2030161756 (chr7) median 40920 +# %56.70 masked total, %64.75 masked real rm /gbdb/ambMex2/ambMex2.2bit ln -s `pwd`/ambMex2.2bit /gbdb/ambMex2/ambMex2.2bit ######################################################################### -# CREATE MICROSAT TRACK (TBD - 2018-10-11 - Hiram) +# CREATE MICROSAT TRACK (DONE - 2020-08-17 - Hiram) ssh hgwdev mkdir /cluster/data/ambMex2/bed/microsat cd /cluster/data/ambMex2/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed ambMex2 microsat microsat.bed - # Read 1745 elements of size 4 from microsat.bed + # Read 56937 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2019-04-15 - Hiram) - + # Odd result here, WM masked all but 703 bases ? 
mkdir /hive/data/genomes/ambMex2/bed/windowMasker cd /hive/data/genomes/ambMex2/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev ambMex2) > do.log 2>&1 -XXX - running - Mon Apr 15 22:55:39 PDT 2019 - # real 26m58.753s + # real 1747m17.123s # Masking statistics cat faSize.ambMex2.cleanWMSdust.txt -# 1065365425 bases (9784466 N's 1055580959 real 830149186 upper -# 225431773 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %21.16 masked total, %21.36 masked real +# 32396387346 bases (4029676509 N's 28366710837 real 703 upper 28366710134 +# lower) in 98071 sequences in 1 files +# Total size: mean 330336.1 sd 20104017.6 min 1033 (chrUn_PGSH01113832v1) +# max 2030161756 (chr7) median 40920 +# %87.56 masked total, %100.00 masked real cat fb.ambMex2.rmsk.windowmaskerSdust.txt - # 86091413 bases of 1065365425 (8.081%) in intersection + # 18368939458 bases of 32396387346 (56.701%) in intersection ########################################################################## # cpgIslands - (TBD - 2018-10-11 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/cpgIslands cd /hive/data/genomes/ambMex2/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku ambMex2) > do.log 2>&1 # real 2m5.105s cat fb.ambMex2.cpgIslandExt.txt # 16395346 bases of 1055588482 (1.553%) in intersection ############################################################################## -# genscan - (TBD - 2018-10-11 - Hiram) +# genscan - (DONE - 2020-08-17 - Hiram) +XXX - waiting for ku to return after power fails - Mon Aug 17 12:11:48 PDT 2020 mkdir /hive/data/genomes/ambMex2/bed/genscan cd /hive/data/genomes/ambMex2/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku ambMex2) > do.log 2>&1 # real 88m34.900s cat fb.ambMex2.genscan.txt # 23911678 bases of 1055588482 (2.265%) 
in intersection cat fb.ambMex2.genscanSubopt.txt # 24521608 bases of 1055588482 (2.323%) in intersection ######################################################################### -# Create kluster run files (TBD - 2018-10-11 - Hiram) +# Create kluster run files (DONE - 2020-08-17 - Hiram) # numerator is ambMex2 gapless bases "real" as reported by: featureBits -noRandom -noHap ambMex2 gap - # 9758843 bases of 1040397755 (0.938%) in intersection + # 2765400 bases of 27505544706 (0.010%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: - calc \( 1040397755 / 2861349177 \) \* 1024 - # ( 1040397755 / 2861349177 ) * 1024 = 372.330406 + calc \( 27505544706 / 2861349177 \) \* 1024 + # ( 27505544706 / 2861349177 ) * 1024 = 9843.495511 - # ==> use -repMatch=350 according to size scaled down from 1024 for human. - # and rounded down to nearest 50 + # ==> use -repMatch=9000 according to size scaled up from 1024 for human. 
+ # and rounded down to nearest 1000 + # experiment with 9000, 8000, 7000 - using 7000 as it makes a + # reasonable number cd /hive/data/genomes/ambMex2 - blat ambMex2.2bit \ + time blat ambMex2.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/ambMex2.11.ooc \ - -repMatch=350 - # Wrote 18169 overused 11-mers to jkStuff/ambMex2.11.ooc - - # check non-bridged gaps to see what the typical size is: - hgsql -N \ - -e 'select * from gap where bridge="no" order by size;' ambMex2 \ - | sort -k7,7nr | ave -col=7 stdin - # minimum gap size is 10 and produces a reasonable number of lifts - gapToLift -verbose=2 -minGap=10 ambMex2 jkStuff/nonBridged.lft \ - -bedFile=jkStuff/nonBridged.bed - wc -l jkStuff/nonBri* - # 525 jkStuff/nonBridged.bed - # 525 jkStuff/nonBridged.lft + -repMatch=7000 + # real 4m11.198s + + # at repMatch 9000 + # Wrote 9042 overused 11-mers to jkStuff/ambMex2.11.ooc + # at repMatch 8000 + # Wrote 13163 overused 11-mers to jkStuff/ambMex2.11.ooc + # at repMatch 7000 + # Wrote 20332 overused 11-mers to jkStuff/ambMex2.11.ooc + + # there are no non-bridged gaps + hgsql -N -e 'select bridge from gap;' ambMex2 | sort | uniq -c + # 27654 yes + # survey gap sizes: + # all gaps are size 100 + hgsql -N -e 'select size from gap where bridge="yes" order by size;' \ + ambMex2 | ave stdin | sed -e 's/^/# /;' +# Q1 100.000000 +# median 100.000000 +# Q3 100.000000 +# average 100.000000 +# min 100.000000 +# max 100.000000 +# count 27654 +# total 2765400.000000 +# standard deviation 0.000000 + + # minimum gap size is 100: + gapToLift -verbose=2 -minGap=100 ambMex2 jkStuff/ambMex2.100baseGaps.lft \ + -allowBridged -bedFile=jkStuff/ambMex2.100baseGaps.bed + wc -l jkStuff/ambMex* + # 125725 jkStuff/ambMex2.100baseGaps.bed + # 125725 jkStuff/ambMex2.100baseGaps.lft + + # to see the gaps used: + bedInvert.pl chrom.sizes jkStuff/ambMex2.100baseGaps.bed | less + # and their sizes: + bedInvert.pl chrom.sizes jkStuff/ambMex2.100baseGaps.bed \ + | cut -f4 | sort -n | uniq -c 
| less + # 27654 100 ######################################################################## # lastz/chain/net swap human/hg38 (TBD - 2018-10-12 - Hiram) # original alignment cd /hive/data/genomes/hg38/bed/lastzAmbMex2.2018-10-12 cat fb.hg38.chainAmbMex2Link.txt # 154079940 bases of 3095998939 (4.977%) in intersection cat fb.hg38.chainSynAmbMex2Link.txt # 95877644 bases of 3095998939 (3.097%) in intersection cat fb.hg38.chainRBest.AmbMex2.txt # 106665747 bases of 3095998939 (3.445%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex2/bed/blastz.hg38.swap cd /hive/data/genomes/ambMex2/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzAmbMex2.2018-10-12/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 9m45.514s cat fb.ambMex2.chainHg38Link.txt # 120955955 bases of 1055588482 (11.459%) in intersection cat fb.ambMex2.chainSynHg38Link.txt # 92597630 bases of 1055588482 (8.772%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex2 hg38) > rbest.log 2>&1 & # real 139m24.408s cat fb.ambMex2.chainRBest.Hg38.txt # 106294585 bases of 1055588482 (10.070%) in intersection ######################################################################### # lastz/chain/net swap mouse/mm10 (TBD - 2018-10-12 - Hiram) # original alignment cd /hive/data/genomes/mm10/bed/lastzAmbMex2.2018-10-12 cat fb.mm10.chainAmbMex2Link.txt # 101151132 bases of 2652783500 (3.813%) in intersection cat fb.mm10.chainSynAmbMex2Link.txt # 70707720 bases of 2652783500 (2.665%) in intersection cat fb.mm10.chainRBest.AmbMex2.txt # 79649474 bases of 2652783500 (3.002%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex2/bed/blastz.mm10.swap cd /hive/data/genomes/ambMex2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAmbMex2.2018-10-12/DEF \ -swap 
-chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 6m41.043s cat fb.ambMex2.chainMm10Link.txt # 88539346 bases of 1055588482 (8.388%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex2 mm10) > rbest.log 2>&1 & # real 94m11.007s cat fb.ambMex2.chainRBest.Mm10.txt # 79474812 bases of 1055588482 (7.529%) in intersection ######################################################################### # GENBANK AUTO UPDATE (TBD - 2018-10-12 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: - # #organism mrnaCnt estCnt refSeqCnt - # Gallus gallus 30708 600485 6392 + # organism mrnaCnt estCnt refSeqCnt + # Ambystoma mexicanum 7749 43323 0 - # edit etc/genbank.conf to add ambMex2 just before galGal5 + # edit etc/genbank.conf to add ambMex2 just before ambMex2 -# ambMex2 (chicken/GCF_000002315.5_GRCg6a) +# ambMex2 (Axolotl - Ambystoma mexicanum) GCA_002915635.2 - 30Gb total ambMex2.serverGenome = /hive/data/genomes/ambMex2/ambMex2.2bit -ambMex2.clusterGenome = /hive/data/genomes/ambMex2/ambMex2.2bit ambMex2.ooc = /hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc -ambMex2.lift = /hive/data/genomes/ambMex2/jkStuff/nonBridged.lft +ambMex2.lift = /hive/data/genomes/ambMex2/jkStuff/ambMex2.100baseGaps.lft ambMex2.perChromTables = no -ambMex2.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} -ambMex2.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} -ambMex2.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} -ambMex2.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} -ambMex2.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} -ambMex2.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} -ambMex2.refseq.mrna.native.load = yes 
-ambMex2.refseq.mrna.xeno.load = yes -ambMex2.genbank.mrna.xeno.load = yes ambMex2.downloadDir = ambMex2 -# ambMex2.upstreamGeneTbl = refGene -# ambMex2.upstreamMaf = multiz7way /hive/data/genomes/galGal4/bed/multiz7way/species.lst +ambMex2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} +ambMex2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} +ambMex2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} +ambMex2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} +ambMex2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} +ambMex2.genbank.est.xeno.pslCDnaFilter = ${ordered.genbank.est.xeno.pslCDnaFilter} +# defaults yes: genbank.mrna.native.load genbank.mrna.native.loadDesc +# yes: genbank.est.native.load refseq.mrna.native.load +# yes: refseq.mrna.native.loadDesc refseq.mrna.xeno.load +# yes: refseq.mrna.xeno.loadDesc +# defaults no: genbank.mrna.xeno.load genbank.mrna.xeno.loadDesc +# no: genbank.est.native.loadDesc genbank.est.xeno.load +# no: genbank.est.xeno.loadDesc +# DO NOT NEED genbank.mrna.xeno except for human, mouse +# ambMex2.upstreamGeneTbl = ensGene +# ambMex2.upstreamMaf = multiz6way /hive/data/genomes/ambMex2/bed/multiz6way/species.list # verify the files specified exist before checking in the file: grep ^ambMex2 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og -# -rw-rw-r-- 1 313201328 Oct 11 15:51 /hive/data/genomes/ambMex2/ambMex2.2bit -# -rw-rw-r-- 1 313201328 Oct 11 15:51 /hive/data/genomes/ambMex2/ambMex2.2bit -# -rw-rw-r-- 1 72684 Oct 11 15:56 /hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc -# -rw-rw-r-- 1 29513 Oct 11 15:57 /hive/data/genomes/ambMex2/jkStuff/nonBridged.lft +-rw-rw-r-- 1 8271637678 Aug 17 10:51 /hive/data/genomes/ambMex2/ambMex2.2bit +-rw-rw-r-- 1 7002521 Aug 17 12:05 /hive/data/genomes/ambMex2/jkStuff/ambMex2.100baseGaps.lft +-rw-rw-r-- 1 81336 Aug 17 11:46 
/hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc - git commit -m "Added ambMex2; refs #22113" etc/genbank.conf + git commit -m "Added ambMex2; refs #23367" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add ambMex2 to: # etc/align.dbs etc/hgwdev.dbs - git add etc/align.dbs etc/hgwdev.dbs - git commit -m "Added ambMex2 - chicken refs #22113" etc/hgwdev.dbs + git commit -m "Added ambMex2 refs #23367" etc/hgwdev.dbs etc/align.dbs git push make etc-update - # wait a few days for genbank magic to take place, the tracks will - # appear + # Notify Chris this is ready to go 2020-08-17 ############################################################################# # augustus gene track (TBD - 2018-10-12 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/augustus cd /hive/data/genomes/ambMex2/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=chicken -dbHost=hgwdev \ -workhorse=hgwdev ambMex2) > do.log 2>&1 # real 48m48.597s cat fb.ambMex2.augustusGene.txt # 25827925 bases of 1055588482 (2.447%) in intersection ######################################################################### # ncbiRefSeq (TBD - 2018-10-12 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/ncbiRefSeq cd /hive/data/genomes/ambMex2/bed/ncbiRefSeq # running step wise just to be careful time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_other Gallus_gallus \ GCF_000002315.5_GRCg6a ambMex2) > download.log 2>&1 # real 1m19.029s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_other Gallus_gallus \ GCF_000002315.5_GRCg6a ambMex2) > process.log 2>&1 # real 2m6.030s time 
(~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -stop=load -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_other Gallus_gallus \ GCF_000002315.5_GRCg6a ambMex2) > load.log 2>&1 # real 0m22.312s cat fb.ncbiRefSeq.ambMex2.txt # 88641893 bases of 1055588482 (8.397%) in intersection # need to add: include ../../refSeqComposite.ra alpha # to the chicken/ambMex2/trackDb.ra to turn on the track in the browser # there was one gene that claimed to have a protein, but the # protein sequence was not included in the protein.faa file # discovered from joinerCheck # manual fix to blank out this one protein, to see the entry hgsql -e 'select * from ncbiRefSeqLink where protAcc="NP_989875.1";' ambMex2 hgsql -e 'update ncbiRefSeqLink set protAcc="" where protAcc="NP_989875.1";' ambMex2 # this makes the 'protein' link disappear from the gene details page # curious that this gene is marked as a non-coding gene anyway ? 
# gene: FET1 at chr4:63,102,774-63,105,516- featureBits -enrichment ambMex2 refGene ncbiRefSeq # refGene 1.374%, ncbiRefSeq 8.397%, both 1.370%, cover 99.73%, enrich 11.88x featureBits -enrichment ambMex2 ncbiRefSeq refGene # ncbiRefSeq 8.397%, refGene 1.374%, both 1.370%, cover 16.32%, enrich 11.88x featureBits -enrichment ambMex2 ncbiRefSeqCurated refGene # ncbiRefSeqCurated 1.368%, refGene 1.374%, both 1.364%, cover 99.71%, enrich 72.59x featureBits -enrichment ambMex2 refGene ncbiRefSeqCurated # refGene 1.374%, ncbiRefSeqCurated 1.368%, both 1.364%, cover 99.32%, enrich 72.59x ######################################################################### # LIFTOVER TO galGal5 (TBD - 2018-10-11 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ambMex2/bed/blat.galGal5.2018-10-11 cd /hive/data/genomes/ambMex2/bed/blat.galGal5.2018-10-11 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc \ ambMex2 galGal5 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc \ ambMex2 galGal5) > doLiftOverToGalGal5.log 2>&1 # real 156m30.215s # see if the liftOver menus function in the browser from ambMex2 to galGal5 ######################################################################### # LIFTOVER TO galGal4 (TBD - 2018-10-12 - Hiram) ssh hgwdev mkdir /hive/data/genomes/ambMex2/bed/blat.galGal4.2018-10-12 cd /hive/data/genomes/ambMex2/bed/blat.galGal4.2018-10-12 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc \ ambMex2 galGal4 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/ambMex2/jkStuff/ambMex2.11.ooc \ ambMex2 galGal4) > doLiftOverToGalGal4.log 2>&1 & # real 36m10.254s # see if the liftOver menus function in the 
browser from ambMex2 to galGal4 ######################################################################### # BLATSERVERS ENTRY (TBD - 2018-10-12 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ambMex2", "blat1a", "17892", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("ambMex2", "blat1a", "17893", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to MEPE gene (egg shell protein) ## (TBD - 2018-10-12 - Hiram) # as found from the galGal5 to ambMex2 liftOver ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr4:45667017-45672928" where name="ambMex2";' hgcentraltest ######################################################################### # crispr 10K shoulders (TBD - 2018-10-16 - Hiram) # working on this script, adding the indexFa step: time (~/kent/src/hg/utils/automation/doCrispr.pl \ -stop=indexFa -buildDir=`pwd` -smallClusterHub=ku ambMex2 ncbiRefSeq) \ > indexFa.log 2>&1 # real 23m26.694s time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=ranges -stop=guides -buildDir=`pwd` -smallClusterHub=ku \ ambMex2 ncbiRefSeq) > guides.log 2>&1 # real 2m50.758s # adding the /dev/shm/ setup rsync for the indexed Fa # performed manually to work out the procedure time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=specScores -stop=specScores -buildDir=`pwd` \ -smallClusterHub=ku ambMex2 ncbiRefSeq) > specScores.log # had about half of ku for about half this time: # Completed: 884922 of 884922 jobs # CPU time in finished jobs: 35872791s 597879.85m 9964.66h 415.19d 1.138 y # IO & Wait Time: 899261s 14987.69m 249.79h 10.41d 0.029 y # Average job time: 42s 0.69m 0.01h 0.00d # Longest finished job: 88s 1.47m 0.02h 0.00d # Submission to last job: 48045s 800.75m 13.35h 0.56d time find tmp/outGuides -type f | 
xargs cut -f3-6 > ../specScores.tab # real 236m17.220s wc -l specScores.tab # 66451712 specScores.tab time (~/kent/src/hg/utils/automation/doCrispr.pl \ -continue=effScores -stop=load \ -buildDir=`pwd` -smallClusterHub=ku ambMex2 ncbiRefSeq) \ > load.log # real 307m41.143s ######################################################################### # all.joiner update, downloads and in pushQ - (TBD - 2018-10-17 - Hiram) xyz cd $HOME/kent/src/hg/makeDb/schema # verify all the business is done for release ~/kent/src/hg/utils/automation/verifyBrowser.pl ambMex2 # fixup all.joiner until this is a clean output joinerCheck -database=ambMex2 -tableCoverage all.joiner joinerCheck -database=ambMex2 -times all.joiner joinerCheck -database=ambMex2 -keys all.joiner # when clean, check in: - git commit -m 'adding rules for ambMex2 refs #22113' all.joiner + git commit -m 'adding rules for ambMex2 refs #23367' all.joiner git push # run up a 'make alpha' in hg/hgTables to get this all.joiner file # into the hgwdev/genome-test system cd /hive/data/genomes/ambMex2 time (makeDownloads.pl ambMex2) > downloads.log 2>&1 # real 10m7.605s # now ready for pushQ entry mkdir /hive/data/genomes/ambMex2/pushQ cd /hive/data/genomes/ambMex2/pushQ time (makePushQSql.pl -redmineList ambMex2) > ambMex2.pushQ.sql 2> stderr.out # real 9m58.779s # remove the extra chainNet files from the listings: sed -i -e "/etNig1/d" redmine.ambMex2.file.list sed -i -e "/asAcu1/d" redmine.ambMex2.file.list sed -i -e "/etNig1/d" redmine.ambMex2.table.list sed -i -e "/onAlb1/d" redmine.ambMex2.table.list sed -i -e "/asAcu1/d" redmine.ambMex2.table.list sed -i -e "/Stickleback/d" redmine.ambMex2.releaseLog.txt sed -i -e "/Tetraodon/d" redmine.ambMex2.releaseLog.txt sed -i -e "/sparrow/d" redmine.ambMex2.releaseLog.txt # remove the tandemDups and gapOverlap from the file list: sed -i -e "/tandemDups/d" redmine.ambMex2.table.list sed -i -e "/Tandem Dups/d" redmine.ambMex2.releaseLog.txt sed -i -e "/gapOverlap/d" 
redmine.ambMex2.table.list sed -i -e "/Gap Overlaps/d" redmine.ambMex2.releaseLog.txt # real 7m21.629s # check for errors in stderr.out, some are OK, e.g.: # WARNING: hgwdev does not have /gbdb/ambMex2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/ambMex2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/ambMex2/bbi/quality.bw # WARNING: ambMex2 does not have seq # WARNING: ambMex2 does not have extFile # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: # /hive/data/genomes/ambMex2/pushQ/redmine.ambMex2.file.list # /hive/data/genomes/ambMex2/pushQ/redmine.ambMex2.releaseLog.txt # /hive/data/genomes/ambMex2/pushQ/redmine.ambMex2.table.list #########################################################################