980a640fde4cdcd5aa34f24de2f71e51e3d0fe61 hiram Fri Feb 25 10:50:52 2022 -0800 stalled on cpgIslands refs #23367 diff --git src/hg/makeDb/doc/ambMex2/initialBuild.txt src/hg/makeDb/doc/ambMex2/initialBuild.txt index 13ce20c..ca0a7d3 100644 --- src/hg/makeDb/doc/ambMex2/initialBuild.txt +++ src/hg/makeDb/doc/ambMex2/initialBuild.txt @@ -59,31 +59,31 @@ ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_002915645.2 Primary Assembly # check assembly size for later reference: faSize G*v2_genomic.fna.gz # 32396370977 bases (4029676509 N's 28366694468 real 28365740082 upper # 954386 lower) in 98070 sequences in 1 files # Total size: mean 330339.3 sd 20104120.1 min 1033 (PGSH01113832.1) # max 2030161756 (CM010939.1) median 40921 # %0.00 masked total, %0.00 masked real # real 6m32.968s ############################################################################# -# establish config.ra file (TBD - Hiram - 2018-10-11) +# establish config.ra file (DONE - Hiram - 2019-04-09) cd /hive/data/genomes/ambMex2 ~/kent/src/hg/utils/automation/prepConfig.pl ambMex2 vertebrate axolotl \ genbank/*_assembly_report.txt > ambMex2.config.ra # compare with previous version to see if it is sane: diff ambMex2.config.ra ../ambMex1/ambMex1.config.ra # verify it really does look sane cat ambMex2.config.ra # config parameters for makeGenomeDb.pl: db ambMex2 clade vertebrate # genomeCladePriority 70 scientificName Ambystoma mexicanum commonName Axolotl @@ -96,31 +96,31 @@ fastaFiles /hive/data/genomes/ambMex2/ucsc/*.fa.gz agpFiles /hive/data/genomes/ambMex2/ucsc/*.agp # qualFiles none dbDbSpeciesDir axolotl photoCreditURL https://www.flickr.com/people/35871148@N04 photoCreditName Ruben Undheim/Flickr ncbiGenomeId 381 ncbiAssemblyId 2130471 ncbiAssemblyName ASM291563v2 ncbiBioProject 378970 ncbiBioSample SAMN06554622 genBankAccessionID GCA_002915635.2 taxId 8296 ############################################################################# -# setup UCSC named files (TBD - 2018-10-11 - Hiram) +# setup UCSC named files (DONE - 2019-03-26 - Hiram) mkdir /hive/data/genomes/ambMex2/ucsc cd /hive/data/genomes/ambMex2/ucsc # check for duplicate sequences: time faToTwoBit -long -noMask ../genbank/G*v2_genomic.fna.gz genbank.2bit # real 7m9.731s time twoBitDup genbank.2bit # real 2m3.641s # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this refseq.2bit file # remove it later @@ -617,35 +617,37 @@ -dbHost=hgwdev ambMex2) > do.log 2>&1 # real 1747m17.123s # Masking statistics cat faSize.ambMex2.cleanWMSdust.txt # 32396387346 bases (4029676509 N's 28366710837 real 703 upper 28366710134 # lower) in 98071 sequences in 1 files # Total size: mean 330336.1 sd 20104017.6 min 1033 (chrUn_PGSH01113832v1) # max 2030161756 (chr7) median 40920 # %87.56 masked total, %100.00 masked real cat fb.ambMex2.rmsk.windowmaskerSdust.txt # 18368939458 bases of 32396387346 (56.701%) in intersection ########################################################################## -# cpgIslands - (TBD - 2018-10-11 - Hiram) +# cpgIslands - (WORKING - 2018-10-11 - Hiram) mkdir /hive/data/genomes/ambMex2/bed/cpgIslands cd /hive/data/genomes/ambMex2/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku ambMex2) > do.log 2>&1 +XXX - running last18 manually on hgwdev 2020-12-11 - Hiram +something is too large MALLOC failure reqesting -2147483648 bytes - aborting # real 2m5.105s cat fb.ambMex2.cpgIslandExt.txt # 16395346 bases of 1055588482 (1.553%) in intersection ############################################################################## # genscan - (DONE - 2020-08-17 - Hiram) XXX - waiting for ku to return after power fails - Mon Aug 17 12:11:48 PDT 2020 mkdir /hive/data/genomes/ambMex2/bed/genscan cd /hive/data/genomes/ambMex2/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku ambMex2) > do.log 2>&1 # real 88m34.900s cat fb.ambMex2.genscan.txt