8a39a5228cc9e6e7e3c702123de693b305af04a8 hiram Wed Mar 31 12:56:33 2021 -0700 genbank run done and browser complete refs #26682 diff --git src/hg/makeDb/doc/rn7/initialBuild.txt src/hg/makeDb/doc/rn7/initialBuild.txt index c917d54..599f016 100644 --- src/hg/makeDb/doc/rn7/initialBuild.txt +++ src/hg/makeDb/doc/rn7/initialBuild.txt @@ -394,74 +394,64 @@ time (doIdKeys.pl \ -twoBit=/hive/data/genomes/rn7/rn7.unmasked.2bit \ -buildDir=`pwd` rn7) > do.log 2>&1 & # real 0m37.959s cat rn7.keySignature.txt # 40260b0b04686933dabe31bb3dff9f3e ############################################################################# # gapOverlap (DONE - 2021-02-03 - Hiram) mkdir /hive/data/genomes/rn7/bed/gapOverlap cd /hive/data/genomes/rn7/bed/gapOverlap time (doGapOverlap.pl \ -twoBit=/hive/data/genomes/rn7/rn7.unmasked.2bit rn7 ) \ > do.log 2>&1 & -XXX - running - Wed Feb 3 16:27:56 PST 2021 - # real 0m54.302s + # real 1m49.039s - # there were not very many gaps, it only had to do one job and blat - # found nothing. 
- - # this result does not exist: cat fb.rn7.gapOverlap.txt - # 608 bases of 2728222451 (0.000%) in intersection - - # manually finish off since it quit in the load step - doGapOverlap.pl -continue=cleanup \ - -twoBit=/hive/data/genomes/rn7/rn7.unmasked.2bit rn7 + # 3284 bases of 2647915728 (0.000%) in intersection ############################################################################# # tandemDups (DONE - 2021-02-03 - Hiram) mkdir /hive/data/genomes/rn7/bed/tandemDups cd /hive/data/genomes/rn7/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/rn7/rn7.unmasked.2bit rn7) \ > do.log 2>&1 & -XXX - running - Wed Feb 3 16:27:56 PST 2021 - # real 193m21.761s + # real 415m56.286s cat fb.rn7.tandemDups.txt - # 80358205 bases of 2897824427 (2.773%) in intersection + # 92889085 bases of 2647915728 (3.508%) in intersection bigBedInfo rn7.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 -# itemCount: 1,402,773 -# primaryDataSize: 36,657,311 -# primaryIndexSize: 119,132 +# itemCount: 1,172,090 +# primaryDataSize: 30,363,941 +# primaryIndexSize: 86,972 # zoomLevels: 9 -# chromCount: 894 -# basesCovered: 1,457,658,879 -# meanDepth (of bases covered): 8.428920 +# chromCount: 150 +# basesCovered: 1,472,207,966 +# meanDepth (of bases covered): 6.716224 # minDepth: 1.000000 -# maxDepth: 344.000000 -# std of depth: 18.274027 +# maxDepth: 279.000000 +# std of depth: 12.806697 ######################################################################### # ucscToINSDC and ucscToRefSeq table/track (DONE - 2021-02-03 - Hiram) # construct idKeys for the refseq and genbank sequence mkdir /hive/data/genomes/rn7/refseq/idKeys cd /hive/data/genomes/rn7/refseq/idKeys faToTwoBit ../G*BN7.2_genomic.fna.gz rn7.refseq.2bit time (doIdKeys.pl -buildDir=`pwd` \ -twoBit=`pwd`/rn7.refseq.2bit refseqRn7) > do.log 2>&1 & # real 6m36.946s sed -e 's/^/ # /;' 
refseqRn7.keySignature.txt # 40260b0b04686933dabe31bb3dff9f3e mkdir /hive/data/genomes/rn7/genbank @@ -617,31 +607,30 @@ searchPriority 8 # verify searches work in the position box git commit -m 'adding search rule for gold/assembly track refs #26682' \ trackDb.ra ########################################################################## # running repeat masker (DONE - 2021-02-03 - Hiram) # using new repeat masker version 4.1.0 mkdir /hive/data/genomes/rn7/bed/repeatMasker cd /hive/data/genomes/rn7/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=hgwdev rn7) > do.log 2>&1 -XXX - running - Wed Feb 3 16:29:03 PST 2021 # real 397m24.001s cat faSize.rmsk.txt # 2647915728 bases (21334956 N's 2626580772 real 1457026804 upper # 1169553968 lower) in 176 sequences in 1 files # Total size: mean 15044975.7 sd 44833491.7 min 746 (chrUn_NW_023637828v1) # max 260522016 (chr1) median 44754 # %44.17 masked total, %44.53 masked real egrep -i "versi|relea" do.log # RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ # CC Dfam_Consensus RELEASE 20181026; * # CC RepBase RELEASE 20181026; sed -e 's/^/# /;' versionInfo.txt @@ -1054,43 +1043,77 @@ GCF_015227675.2_mRatBN7.2 rn7) > do.log 2>&1 & # real 6m52.120s cat fb.ncbiRefSeq.rn7.txt # 107770866 bases of 2626580772 (4.103%) in intersection # add: include ../../refSeqComposite.ra # to the rat/rn7/trackDb.ra to turn on the track in the browser joinerCheck says: rn7.ncbiRefSeqLink.protAcc - hits 74754 of 74755 (99.999%) Error: 1 of 74755 elements (0.001%) of rn7.ncbiRefSeqLink.protAcc are not in key ncbiRefSeqPepTable.name line 8640 of all.joiner Example miss: NP_536324.1 + # for some reason one of the proteins is missing from + # GCF_015227675.2_mRatBN7.2_protein.faa.gz + # however, it is in the GCF_015227675.2_mRatBN7.2_rna.gbff.gz file + # obtain it from that file, and create download/NP_536324.1.faa.gz + # then, add it to the table reload: + +export 
db=rn7 +export asmId=GCF_015227675.2_mRatBN7.2 + +zcat download/${asmId}_protein.faa.gz download/NP_536324.1.faa.gz \ + | sed -e 's/ .*//;' | faToTab -type=protein -keepAccSuffix stdin stdout \ + | sort | join -t$'\t' $db.ncbiRefSeqLink.protAcc.list - \ + > fixed.$db.ncbiRefSeqPepTable.tab + +hgLoadSqlTab $db ncbiRefSeqPepTable ~/kent/src/hg/lib/pepPred.sql \ + fixed.$db.ncbiRefSeqPepTable.tab - # XXX 2021-02-04 - ready for this after genbank runs +# Now both of these checks are quiet: +joinerCheck -keys \ + -identifier=ncbiRefSeqPepTable -database=$db \ + /cluster/home/hiram/kent/src/hg/makeDb/schema/all.joiner + +Checking keys on database rn7 + rn7.ncbiRefSeqLink.protAcc - hits 74755 of 74755 (100.000%) ok + +joinerCheck -keys \ + -identifier=ncbiRefSeq -database=$db \ + /cluster/home/hiram/kent/src/hg/makeDb/schema/all.joiner + +Checking keys on database rn7 + rn7.ncbiRefSeqLink.id - hits 99139 of 99139 (100.000%) ok + rn7.ncbiRefSeqCurated.name - hits 18403 of 18403 (100.000%) ok + rn7.ncbiRefSeqPredicted.name - hits 80736 of 80736 (100.000%) ok + rn7.ncbiRefSeqPsl.qName - hits 99145 of 99145 (100.000%) ok + rn7.ncbiRefSeqCds.id - hits 74741 of 74741 (100.000%) ok + rn7.seqNcbiRefSeq.acc - hits 99139 of 99139 (100.000%) ok featureBits -enrichment rn7 refGene ncbiRefSeq - # refGene 0.402%, ncbiRefSeq 3.148%, both 0.402%, cover 99.90%, enrich 31.73x + # refGene 1.522%, ncbiRefSeq 4.103%, both 1.520%, cover 99.87%, enrich 24.34x featureBits -enrichment rn7 ncbiRefSeq refGene - # ncbiRefSeq 3.148%, refGene 0.402%, both 0.402%, cover 12.76%, enrich 31.73x + # ncbiRefSeq 4.103%, refGene 1.522%, both 1.520%, cover 37.05%, enrich 24.34x featureBits -enrichment rn7 ncbiRefSeqCurated refGene - # ncbiRefSeqCurated 0.401%, refGene 0.402%, both 0.400%, cover 99.66%, enrich 247.79x + # ncbiRefSeqCurated 1.529%, refGene 1.522%, both 1.514%, cover 99.03%, enrich 65.07x featureBits -enrichment rn7 refGene ncbiRefSeqCurated - # refGene 0.402%, ncbiRefSeqCurated 0.401%, both 
0.400%, cover 99.33%, enrich 247.79x + # refGene 1.522%, ncbiRefSeqCurated 1.529%, both 1.514%, cover 99.49%, enrich 65.07x ############################################################################## # LIFTOVER TO rn6 (DONE - 2021-02-04 - Hiram) ssh hgwdev mkdir /hive/data/genomes/rn7/bed/blat.rn6.2021-02-04 cd /hive/data/genomes/rn7/bed/blat.rn6.2021-02-04 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -query2Bit=/hive/data/genomes/rn6/rn6.2bit \ -querySizes=/hive/data/genomes/rn6/chrom.sizes \ -ooc=/hive/data/genomes/rn7/jkStuff/rn7.11.ooc \ rn7 rn6 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -query2Bit=/hive/data/genomes/rn6/rn6.2bit \ @@ -1110,31 +1133,31 @@ VALUES ("rn7", "blat1b", "17910", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("rn7", "blat1b", "17911", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################## ## reset default position to same as rn6 via blat of the DNA from rn6 ## (DONE - 2021-02-04 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr1:79348972-79379997" where name="rn7";' hgcentraltest ############################################################################## -# crispr whole genome (TBD - 2020-09-04 -> 2020-09-10 - Hiram) +# crispr whole genome (DONE - 2021-02-04 -> 2021-02-11 - Hiram) mkdir /hive/data/genomes/rn7/bed/crisprAll cd /hive/data/genomes/rn7/bed/crisprAll # need to have augustus genes done. 
This will not work with genscan # the large shoulder argument will cause the entire genome to be scanned # this takes a while for a new genome to get the bwa indexing done time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \ rn7 augustusGene -shoulder=250000000 -tableName=crisprAll \ -fileServer=hgwdev \ -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev) >> ranges.log 2>&1 # real 64m40.351s time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ @@ -1220,143 +1243,135 @@ # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 274,741,534 # primaryDataSize: 12,314,717,925 # primaryIndexSize: 17,246,852 # zoomLevels: 10 # chromCount: 31 # basesCovered: 2,157,342,037 # meanDepth (of bases covered): 2.929093 # minDepth: 1.000000 # maxDepth: 33.000000 # std of depth: 1.944611 ######################################################################### -# all.joiner update, downloads and in pushQ - (WORKING - 2019-11-20 - Hiram) +# all.joiner update, downloads and in pushQ - (DONE - 2021-03-31 - Hiram) # had incorrect orderKey specified in beginning rn6.config.ra # correct to 18019 by looking at the output of: # hgsql -e 'select name,organism,orderKey from dbDb order by orderKey;' \ # hgcentraltest | less # oryCun1 Rabbit 18010 # regenRn1 Rat 18020 # regenRn0 Rat 18021 # rn6 Rat 18031 # rn7 Rat 18032 # rn5 Rat 18032 # rn4 Rat 18033 # rn3 Rat 18034 # rn2 Rat 18035 # tauEry1 Red crested turaco 18360 hgsql -e 'update dbDb set orderKey=18019 where name="rn7";' hgcentraltest cd $HOME/kent/src/hg/makeDb/schema # verify all the business is done for release ~/kent/src/hg/utils/automation/verifyBrowser.pl rn7 -# 66 tables in database rn7 - Dog, Canis lupus familiaris -# verified 55 tables in database rn7, 11 extra tables, 14 optional tables +# 79 tables in database rn7 - Rat, Rattus norvegicus +# verified 77 tables in database rn7, 2 extra tables, 30 optional tables +# NCBI RefSeq genes 10 optional tables # 
chainNetRBestHg38 3 optional tables # chainNetRBestMm10 3 optional tables +# chainNetRBestMm39 3 optional tables # chainNetSynHg38 3 optional tables # chainNetSynMm10 3 optional tables +# chainNetSynMm39 3 optional tables # gapOverlap 1 optional tables # tandemDups 1 optional tables -# 1 chainCanFam3 - extra table -# 2 chainCanFam3Link - extra table -# 3 chainRBestCanFam3 - extra table -# 4 chainRBestCanFam3Link - extra table -# . . . etc . . . -# 8 crisprAllTargets - extra table -# 9 netCanFam3 - extra table -# 10 netRBestCanFam3 - extra table -# 11 netSynCanFam3 - extra table -# 13 genbank tables found -# verified 28 required tables, 1 missing tables -# 1 ucscToRefSeq - missing table +# 1 crisprAllRanges - extra table +# 2 crisprAllTargets - extra table +# 15 genbank tables found +# verified 32 required tables, 0 missing tables # hg38 chainNet to rn7 found 3 required tables # mm10 chainNet to rn7 found 3 required tables +# mm39 chainNet to rn7 found 3 required tables # hg38 chainNet RBest and syntenic to rn7 found 6 optional tables -# mm10 chainNet RBest and syntenic to rn7 found 3 optional tables +# mm10 chainNet RBest and syntenic to rn7 found 6 optional tables +# mm39 chainNet RBest and syntenic to rn7 found 6 optional tables # liftOver to previous versions: 1, from previous versions: 1 +# blatServers: rn7 blat1b 17910 1 0 0 rn7 blat1b 17911 0 1 0 # fixup all.joiner until this is a clean output joinerCheck -database=rn7 -tableCoverage all.joiner joinerCheck -database=rn7 -times all.joiner joinerCheck -database=rn7 -keys all.joiner # when clean, check in: git commit -m 'adding rules for rn7 refs #26682' all.joiner git push # run up a 'make alpha' in hg/hgTables to get this all.joiner file # into the hgwdev/genome-test system cd /hive/data/genomes/rn7 time (makeDownloads.pl rn7) > downloads.log 2>&1 - # real 20m11.930s + # real 19m6.278s # now ready for pushQ entry mkdir /hive/data/genomes/rn7/pushQ cd /hive/data/genomes/rn7/pushQ time 
($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList rn7) > rn7.pushQ.sql 2> stderr.out - # real 13m21.313s + # real 18m30.674s # remove the tandemDups and gapOverlap from the file list: sed -i -e "/tandemDups/d" redmine.rn7.table.list sed -i -e "/Tandem Dups/d" redmine.rn7.releaseLog.txt sed -i -e "/gapOverlap/d" redmine.rn7.table.list sed -i -e "/Gap Overlaps/d" redmine.rn7.releaseLog.txt - # remove the multiz7way tables: - sed -i -e "/multiz7way/d" redmine.rn7.table.list - - # edit the file list and expand the wildcards: .../calJac*/... + # edit the file list and expand the directory wildcards, to find them: + grep "\*\/" redmine.rn7.file.list + # there were two, expand them via 'ls' # check for errors in stderr.out, some are OK, e.g.: -# redmine.rn7.releaseLog.txt WARNING: rn7 does not have seq -WARNING: hgwdev does not have phyloPng-generated /usr/local/apache/htdocs/images/phylo/rn7_7way.gif (or png) for multiz7way. +WARNING: rn7 does not have extFile WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of supporting and genbank tables) which tracks to assign these tables to: chainRBestHg38 chainRBestHg38Link - chainRBestMacFas5 - chainRBestMacFas5Link ... etc crisprAllRanges gbLoaded netRBestHg38 - netRBestMacFas5 netRBestMm10 netRBestMm39 netSynHg38 - netSynMacFas5 netSynMm10 netSynMm39 # verify the file list does correctly match to files cat redmine.rn7.file.list | while read L do eval ls $L > /dev/null done # should be silent, missing files will show as errors # verify database tables, how many to expect: wc -l redmine.rn7.table.list - # 70 redmine.rn7.table.list + # 65 redmine.rn7.table.list # how many actual: awk -F'.' 
'{printf "hgsql -N %s -e '"'"'show table status like \"%s\";'"'"'\n", $1, $2}' redmine.rn7.table.list | sh | wc -l - # 70 + # 65 # would be a smaller number actual if some were missing # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: # /hive/data/genomes/rn7/pushQ/redmine.rn7.file.list # /hive/data/genomes/rn7/pushQ/redmine.rn7.releaseLog.txt # /hive/data/genomes/rn7/pushQ/redmine.rn7.table.list #########################################################################