2986dc8f9e9b8e1d276f8dbc981309fba29d9016 hiram Mon Sep 14 16:39:05 2020 -0700 completed marked QA Ready refs #25917 diff --git src/hg/makeDb/doc/canFam5/initialBuild.txt src/hg/makeDb/doc/canFam5/initialBuild.txt index 6c82b4d..1e848ae 100644 --- src/hg/makeDb/doc/canFam5/initialBuild.txt +++ src/hg/makeDb/doc/canFam5/initialBuild.txt @@ -75,33 +75,33 @@ ## GCA_005444745.1 Primary Assembly ## GCA_005444775.1 non-nuclear # check assembly size for later reference: faSize G*1_genomic.fna.gz # 2343218756 bases (6087522 N's 2337131234 real 1588083192 upper # 749048042 lower) in 794 sequences in 1 files # Total size: mean 2951157.1 sd 13874454.0 min 1091 (REHQ01000052.1) # max 122894117 (CM016569.1) median 13386 # %31.97 masked total, %32.05 masked real # Survey types of gaps: -zcat *gaps.txt.gz | cut -f5 | sort | uniq -c - 1 gap_type - 999 within_scaffold +zgrep -v "^#" *gaps.txt.gz | cut -f5,6 | sort | uniq -c +# 274 within_scaffold align_genus +# 725 within_scaffold paired-ends # And total size in gaps: zcat *gaps.txt.gz | grep -v "^#" | awk '{print $3-$2+1}' | ave stdin \ | sed -e 's/^/# /;' # Q1 100.000000 # median 5000.000000 # Q3 5000.000000 # average 6093.603604 # min 19.000000 # max 144464.000000 # count 999 # total 6087510.000000 # standard deviation 11823.465922 ############################################################################# @@ -393,31 +393,31 @@ # real 1m49.489s # there only only nine: wc -l bed.tab # 9 bed.tab cut -f2- bed.tab chr1 41008264 41010364 chr1:41008265-41010364 1000 + 41008264 41010364 0 2 1000,1000 0,1100 chr17 58049274 58051374 chr17:58049275-58051374 1000 + 58049274 58051374 0 2 1000,1000 0,1100 ... etc ... 
chrX 45160089 45162189 chrX:45160090-45162189 1000 + 45160089 45162189 0 2 1000,1000 0,1100 cat fb.canFam5.gapOverlap.txt # 16158 bases of 2482000080 (0.001%) in intersection ############################################################################# -# tandemDups (TBD - 2020-03-31 - Hiram) +# tandemDups (DONE - 2020-07-17 - Hiram) mkdir /hive/data/genomes/canFam5/bed/tandemDups cd /hive/data/genomes/canFam5/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/canFam5/canFam5.unmasked.2bit canFam5) \ > do.log 2>&1 & # real 96m40.950s cat fb.canFam5.tandemDups.txt # 38911424 bases of 2343218756 (1.661%) in intersection bigBedInfo canFam5.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes @@ -445,83 +445,91 @@ -twoBit=`pwd`/canFam5.genbank.2bit genbankCanFam5) > do.log 2>&1 & # real 1m30.193s cat genbankCanFam5.keySignature.txt # 20a742890810f31eac281ae06bc3d170 mkdir /hive/data/genomes/canFam5/bed/chromAlias cd /hive/data/genomes/canFam5/bed/chromAlias join -t$'\t' ../idKeys/canFam5.idKeys.txt \ ../../genbank/idKeys/genbankCanFam5.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.bed -XXX - # should be same line counts throughout: wc -l * ../../chrom.sizes - # 2198 ucscToINSDC.bed - # 2198 ../../chrom.sizes + # 794 ucscToINSDC.bed + # 794 ../../chrom.sizes export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize - # 23 + # 20 # use the $chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab canFam5 ucscToINSDC stdin ucscToINSDC.bed # should be quiet for all OK checkTableCoords canFam5 # should cover %100 entirely: featureBits -countGaps canFam5 ucscToINSDC - # 2482000080 bases of 2482000080 (100.000%) in intersection + # 2343218756 bases of 2343218756 
(100.000%) in intersection ######################################################################### -# add chromAlias table (TBD - 2020-05-20 - Hiram) +# add chromAlias table (DONE - 2020-07-29 - Hiram) mkdir /hive/data/genomes/canFam5/bed/chromAlias cd /hive/data/genomes/canFam5/bed/chromAlias - hgsql -N -e 'select chrom,name from ucscToRefSeq;' canFam5 \ - | sort -k1,1 > ucsc.refseq.tab hgsql -N -e 'select chrom,name from ucscToINSDC;' canFam5 \ | sort -k1,1 > ucsc.genbank.tab + grep -v "^#" ../../genbank/G*1_assembly_report.txt \ + | awk '{printf "%s\t%s\n", $5,$1}' | sort > insdc.assembly.txt + awk '{printf "%s\t%s\n", $4,$1}' ucscToINSDC.bed | sort > insdc.ucsc.txt + join insdc.assembly.txt insdc.ucsc.txt | awk '$2 != $3' \ + | awk '{printf "%s\t%s\n", $3,$2}' | sort > ucsc.assembly.tab + + wc -l *.tab ../../chrom.sizes + # 754 ucsc.assembly.tab + # 794 ucsc.genbank.tab + # 794 ../../chrom.sizes - wc -l *.tab - # 2198 ucsc.genbank.tab + # assembly counts are smaller since equivalence has been eliminated ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > canFam5.chromAlias.tab -for t in genbank +for t in genbank assembly do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t canFam5.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done -# checking genbank: 2198 =? 2198 OK +# checking genbank: 794 =? 794 OK +# checking assembly: 754 =? 
754 OK # verify chrM is here properly: grep chrM canFam5.chromAlias.tab # CM022001.1 chrM genbank + # that genbank identifier does not yet have a RefSeq identifier + # otherwise would add a refseq.tab file for chrM hgLoadSqlTab canFam5 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ canFam5.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2020-07-17 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/dog/canFam5 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" canFam5 \ | sed -e 's/[0-9_.]\+//;' | sort | uniq -c 1037 CM 758 REHQ # implies a rule: '[CR][ME][HQ0-9]+(\.[0-9_]+)?' @@ -686,44 +694,45 @@ ############################################################################## # genscan - (DONE - 2020-07-28 - Hiram) mkdir /hive/data/genomes/canFam5/bed/genscan cd /hive/data/genomes/canFam5/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku canFam5) > do.log 2>&1 # real 43m47.630s # four jobs failed, running manually on hgwdev: ./runGsBig2M.csh chr22 000 gtf/000/chr22.gtf pep/000/chr22.pep subopt/000/chr22.bed & ./runGsBig2M.csh chr15 000 gtf/000/chr15.gtf pep/000/chr15.pep subopt/000/chr15.bed & ./runGsBig2M.csh chr20 000 gtf/000/chr20.gtf pep/000/chr20.pep subopt/000/chr20.bed & ./runGsBig2M.csh chr3 000 gtf/000/chr3.gtf pep/000/chr3.pep subopt/000/chr3.bed wait -XXX - running - Wed Jul 29 12:20:47 PDT 2020 + # real 23m28.061s + # continuing: time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -continue=makeBed -bigClusterHub=ku canFam5) > makeBed.log 2>&1 - # real 0m45.365s + # real 0m54.356s cat fb.canFam5.genscan.txt - # 57650331 bases of 2481941580 (2.323%) in intersection + # 55250288 bases of 2337131234 (2.364%) in intersection cat fb.canFam5.genscanSubopt.txt - # 50129491 bases of 2481941580 (2.020%) in intersection + # 48016592 bases of 2337131234 (2.055%) in intersection 
######################################################################### -# Create kluster run files (TBD - 2020-04-02 - Hiram) +# Create kluster run files (DONE - 2020-07-28 - Hiram) # numerator is canFam5 gapless bases "real" as reported by: featureBits -noRandom -noHap canFam5 gap # 6036826 bases of 2320309602 (0.260%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2320309602 / 2861349177 \) \* 1024 # ( 2320309602 / 2861349177 ) * 1024 = 830.376471 # ==> use -repMatch=800 according to size scaled down from 1024 for human. # and rounded down to nearest 50 @@ -732,205 +741,246 @@ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/canFam5.11.ooc \ -repMatch=800 # Wrote 28510 overused 11-mers to jkStuff/canFam5.11.ooc # real 0m20.727s # canFam4 at repMatch=800: # Wrote 34718 overused 11-mers to jkStuff/canFam4.11.ooc # canFam3 at repMatch=900: # Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc # real 1m11.629s # there are no non-bridged gaps hgsql -N \ -e 'select * from gap where bridge="no" order by size;' canFam5 + hgsql -N -e 'select size from gap where bridge="no" order by size;' \ + canFam5 | sort | uniq -c | sort -k2,2n | sed -e 's/^/# /;' # survey gap sizes: hgsql -N -e 'select size from gap where bridge="yes" order by size;' \ canFam5 | ave stdin | sed -e 's/^/# /;' # Q1 100.000000 # median 5000.000000 # Q3 5000.000000 # average 6081.440559 # min 4.000000 # max 144464.000000 # count 1001 # total 6087522.000000 # standard deviation 11814.767347 + # and survey the bridged gaps over 5,000 bases: + hgsql -N -e 'select size from gap where bridge="yes" and size > 4999;' \ + canFam5 | sort | uniq -c | sort -k2,2n | sed -e 's/^/# /;' + # using ordinary gaps to make a lift file + # minimum gap size at 5000 produces a reasonable number of lifts + gapToLift -allowBridged -verbose=2 
-minGap=5000 canFam5 \ + jkStuff/canFam5.5Kgaps.lft -bedFile=jkStuff/canFam5.5Kgaps.bed + wc -l jkStuff/ambMex* # minimum gap size at 10000 produces a reasonable number of lifts gapToLift -verbose=2 -minGap=10000 canFam5 jkStuff/canFam5.10Kgaps.lft \ -bedFile=jkStuff/canFam5.10Kgaps.bed wc -l jkStuff/*10K* # 794 jkStuff/canFam5.10Kgaps.bed # 794 jkStuff/canFam5.10Kgaps.lft + # to see the gaps used: + bedInvert.pl chrom.sizes jkStuff/canFam5.5Kgaps.bed | less + # and their sizes: + bedInvert.pl chrom.sizes jkStuff/canFam5.5Kgaps.bed \ + | cut -f4 | sort -n | uniq -c | less + ######################################################################## -# lastz/chain/net swap human/hg38 (TBD - 2020-04-10 - Hiram) +# lastz/chain/net swap human/hg38 (DONE - 2020-07-29 - Hiram) # original alignment - cd /hive/data/genomes/hg38/bed/lastzCanFam5.2020-04-02 + cd /hive/data/genomes/hg38/bed/lastzCanFam5.2020-07-29 cat fb.hg38.chainCanFam5Link.txt - # 1549397508 bases of 3110768607 (49.808%) in intersection + # 1545648756 bases of 3110768607 (49.687%) in intersection cat fb.hg38.chainSynCanFam5Link.txt - # 1488468205 bases of 3110768607 (47.849%) in intersection - - time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ - hg38 canFam5) > rbest.log 2>&1 & - # real 310m32.196s - + # 1484758745 bases of 3110768607 (47.730%) in intersection cat fb.hg38.chainRBest.CanFam5.txt - # 1425406620 bases of 3110768607 (45.822%) in intersection + # 1422619513 bases of 3110768607 (45.732%) in intersection # and for the swap: mkdir /hive/data/genomes/canFam5/bed/blastz.hg38.swap cd /hive/data/genomes/canFam5/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzCanFam5.2020-04-02/DEF \ + /hive/data/genomes/hg38/bed/lastzCanFam5.2020-07-29/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 - # real 99m10.990s + # real 78m37.078s cat 
fb.canFam5.chainHg38Link.txt - # 1493209286 bases of 2481941580 (60.163%) in intersection + # 1460025525 bases of 2337131234 (62.471%) in intersection cat fb.canFam5.chainSynHg38Link.txt - # 1448164376 bases of 2481941580 (58.348%) in intersection + # 1423305734 bases of 2337131234 (60.900%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ canFam5 hg38) > rbest.log 2>&1 & - # real 257m59.713s + # real 255m9.076s cat fb.canFam5.chainRBest.Hg38.txt - # 1425296830 bases of 2481941580 (57.427%) in intersection + # 1422612399 bases of 2337131234 (60.870%) in intersection -########################################################################### -# lastz/chain/net swap mouse/mm10 (TBD - 2020-04-20 - Hiram) +############################################################################ +# lastz/chain/net swap mouse/mm10 (DONE - 2020-07-29 - Hiram) # original alignment + cd /hive/data/genomes/mm10/bed/lastzCanFam5.2020-07-29 + cat fb.mm10.chainCanFam5Link.txt - # 777883731 bases of 2652783500 (29.323%) in intersection + # 776486006 bases of 2652783500 (29.271%) in intersection cat fb.mm10.chainSynCanFam5Link.txt - # 736602602 bases of 2652783500 (27.767%) in intersection - - time (doRecipBest.pl -load -workhorse=hgwdev mm10 canFam5 \ - -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & - # real 219m16.168s - + # 735561772 bases of 2652783500 (27.728%) in intersection cat fb.mm10.chainRBest.CanFam5.txt - # 741307883 bases of 2652783500 (27.945%) in intersection + # 740117947 bases of 2652783500 (27.900%) in intersection mkdir /hive/data/genomes/canFam5/bed/blastz.mm10.swap cd /hive/data/genomes/canFam5/bed/blastz.mm10.swap + time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/mm10/bed/lastzCanFam5.2020-04-02/DEF \ + /hive/data/genomes/mm10/bed/lastzCanFam5.2020-07-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & - # real 50m20.639s + 
# real 44m9.935s cat fb.canFam5.chainMm10Link.txt - # 772902855 bases of 2481941580 (31.141%) in intersection + # 759821061 bases of 2337131234 (32.511%) in intersection cat fb.canFam5.chainSynMm10Link.txt - # 737924732 bases of 2481941580 (29.732%) in intersection + # 731350605 bases of 2337131234 (31.293%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev canFam5 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & - # real 173m38.016s + # real 162m30.634s cat fb.canFam5.chainRBest.Mm10.txt - # 740357755 bases of 2481941580 (29.830%) in intersection + # 739177732 bases of 2337131234 (31.628%) in intersection + +############################################################################ +# lastz/chain/net swap mouse/mm39 (DONE - 2020-08-17 - Hiram) + + # original alignment + cd /hive/data/genomes/mm39/bed/lastzCanFam5.2020-08-17 + cat fb.mm39.chainCanFam5Link.txt + # 778327929 bases of 2654624157 (29.320%) in intersection + cat fb.mm39.chainSynCanFam5Link.txt + # 735515331 bases of 2654624157 (27.707%) in intersection + cat fb.mm39.chainRBest.CanFam5.txt + # 740738480 bases of 2654624157 (27.904%) in intersection + + mkdir /hive/data/genomes/canFam5/bed/blastz.mm39.swap + cd /hive/data/genomes/canFam5/bed/blastz.mm39.swap + time (doBlastzChainNet.pl -verbose=2 \ + /hive/data/genomes/mm39/bed/lastzCanFam5.2020-08-17/DEF \ + -swap -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & + # real 44m12.732s + + cat fb.canFam5.chainMm39Link.txt + # 762233776 bases of 2337131234 (32.614%) in intersection + cat fb.canFam5.chainSynMm39Link.txt + # 731337903 bases of 2337131234 (31.292%) in intersection + + time (doRecipBest.pl -load -workhorse=hgwdev canFam5 mm39 \ + -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & + # real 174m14.398s + + cat fb.canFam5.chainRBest.Mm39.txt + # 739648625 bases of 2337131234 (31.648%) in intersection 
############################################################################## -# GENBANK AUTO UPDATE (TBD - 2020-04-09 - Hiram) +# GENBANK AUTO UPDATE (DONE - 2020-07-29 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # organism mrnaCnt estCnt refSeqCnt # Canis latrans 2 0 0 # Canis lupus 36 0 0 - # Canis lupus familiaris 3351 382644 1718 + # Canis lupus familiaris 3358 382639 1721 # Canis lupus laniger 2 0 0 # Canis lupus lupus 2 0 0 # Canis mesomelas 1 0 0 # Canis sp. 45 0 0 # the latrans is the Coyota, the mesomelas # is the Black-backed jackal from Africa and the langier is the Tibetan wolf # lupus lupus is the Eurasian wolf - # edit etc/genbank.conf to add canFam5 just after canFam3 + # edit etc/genbank.conf to add canFam5 just after canFam4 -# canFam5 (German shepard - GCA_011100685.1 - UU_Cfam_GSD_1.0) +# canFam5 (Great Dane - GCA_005444595.1 - UMICH_Zoey_3.1) canFam5.serverGenome = /hive/data/genomes/canFam5/canFam5.2bit canFam5.ooc = /hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc -canFam5.lift = /hive/data/genomes/canFam5/jkStuff/canFam5.nonBridged.lft +canFam5.lift = /hive/data/genomes/canFam5/jkStuff/canFam5.10Kgaps.lft canFam5.align.unplacedChroms = chrUn_* canFam5.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} canFam5.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} canFam5.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} canFam5.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} canFam5.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} canFam5.refseq.mrna.native.load = yes canFam5.refseq.mrna.xeno.load = yes # DO NOT NEED genbank.mrna.xeno except for human, mouse canFam5.genbank.mrna.xeno.load = yes canFam5.downloadDir = canFam5 canFam5.upstreamGeneTbl = refGene canFam5.perChromTables = no # verify the files specified exist before 
checking in the file: grep ^canFam5 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og -# -rw-rw-r-- 1 651703337 Apr 2 08:57 /hive/data/genomes/canFam5/canFam5.2bit -# -rw-rw-r-- 1 138880 Apr 2 09:51 /hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc -# -rw-rw-r-- 1 139818 Apr 2 09:56 /hive/data/genomes/canFam5/jkStuff/canFam5.nonBridged.lft +# -rw-rw-r-- 1 615551503 Jul 28 09:03 /hive/data/genomes/canFam5/canFam5.2bit +# -rw-rw-r-- 1 114048 Jul 28 09:17 /hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc +# -rw-rw-r-- 1 65851 Jul 31 12:34 /hive/data/genomes/canFam5/jkStuff/canFam5.5Kgaps.lft git commit -m "Added canFam5 dog; refs #25917" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add canFam5 to: # etc/hgwdev.dbs etc/align.dbs git commit -m "Added canFam5 - dog refs #25917" etc/hgwdev.dbs etc/align.dbs git push make etc-update - # wait a few days for genbank magic to take place, the tracks will - # appear + # Notify Chris Lee this is ready to go. Magic will happen. 
############################################################################# -# augustus gene track (TBD - 2020-04-10 - Hiram) +# augustus gene track (DONE - 2020-07-29 - Hiram) mkdir /hive/data/genomes/canFam5/bed/augustus cd /hive/data/genomes/canFam5/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev \ -workhorse=hgwdev canFam5) > do.log 2>&1 - # real 74m39.734s + # real 189m35.455s cat fb.canFam5.augustusGene.txt - # 49999966 bases of 2481941580 (2.015%) in intersection + # 48256052 bases of 2337131234 (2.065%) in intersection ######################################################################### # ncbiRefSeq (TBD - 2019-11-20 - Hiram) ### XXX ### Not available on GCA/genbank assemblies mkdir /hive/data/genomes/canFam5/bed/ncbiRefSeq cd /hive/data/genomes/canFam5/bed/ncbiRefSeq # running step wise just to be careful time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Gorilla_gorilla \ GCA_008122165.1_Kamilah_GGO_v0 canFam5) > download.log 2>&1 # real 1m37.523s @@ -990,190 +1040,186 @@ mkdir /hive/data/genomes/canFam5/bed/blat.canFam3.2020-07-28 cd /hive/data/genomes/canFam5/bed/blat.canFam3.2020-07-28 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc \ canFam5 canFam3 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam5/jkStuff/canFam5.11.ooc \ canFam5 canFam3) > doLiftOverToCanFam3.log 2>&1 # real 278m52.252s # see if the liftOver menus function in the browser from canFam5 to canFam3 ######################################################################### -# BLATSERVERS ENTRY (TBD - 2020-04-02 - Hiram) +# BLATSERVERS ENTRY (DONE - 2020-07-31 - Hiram) # After getting a blat server 
assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES ("canFam5", "blat1b", "17904", "1", "0"); \ + VALUES ("canFam5", "blat1b", "17906", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ - VALUES ("canFam5", "blat1b", "17905", "0", "1");' \ + VALUES ("canFam5", "blat1b", "17907", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ -## reset default position to gene: CDH2 upon recommendation from Kerstin -## (TBD - 2020-06-22 - Hiram) +## reset default position to gene: ACE2 as found by blat of human protein +## (DONE - 2020-07-31 - Hiram) ssh hgwdev - hgsql -e 'update dbDb set defaultPos="chr7:60683331-61003907" + hgsql -e 'update dbDb set defaultPos="chrX:11818981-11859716" where name="canFam5";' hgcentraltest ############################################################################## -# crispr whole genome (TBD - 2020-04-09 - Hiram) +# crispr whole genome (DONE - 2020-09-08 - Hiram) mkdir /hive/data/genomes/canFam5/bed/crisprAll cd /hive/data/genomes/canFam5/bed/crisprAll # the large shoulder argument will cause the entire genome to be scanned # this takes a while for a new genome to get the bwa indexing done time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \ - canFam5 genscan -shoulder=250000000 -tableName=crisprAll \ + canFam5 augustusGene -shoulder=250000000 -tableName=crisprAll \ -fileServer=hgwdev \ -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev) > ranges.log 2>&1 - # real 1m16.539s + # real 58m27.340s time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ - -continue=guides -stop=specScores canFam5 genscan \ + -continue=guides -stop=load canFam5 augustusGene \ -shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \ -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ - -workhorse=hgwdev) > specScores.log 
2>&1 - # real 6558m26.295s + -workhorse=hgwdev) > load.log 2>&1 + # real 6831m11.040s cat guides/run.time | sed -e 's/^/# /;' # Completed: 100 of 100 jobs -# CPU time in finished jobs: 11979s 199.66m 3.33h 0.14d 0.000 y -# IO & Wait Time: 251s 4.18m 0.07h 0.00d 0.000 y -# Average job time: 122s 2.04m 0.03h 0.00d -# Longest finished job: 289s 4.82m 0.08h 0.00d -# Submission to last job: 303s 5.05m 0.08h 0.00d +# CPU time in finished jobs: 17641s 294.01m 4.90h 0.20d 0.001 y +# IO & Wait Time: 1178s 19.64m 0.33h 0.01d 0.000 y +# Average job time: 188s 3.14m 0.05h 0.00d +# Longest finished job: 356s 5.93m 0.10h 0.00d +# Submission to last job: 362s 6.03m 0.10h 0.00d cat specScores/run.time | sed -e 's/^/# /;' -# Completed: 3096565 of 3096565 jobs -# CPU time in finished jobs: 263946983s 4399116.38m 73318.61h 3054.94d 8.370 y -# IO & Wait Time: 17766691s 296111.52m 4935.19h 205.63d 0.563 y -# Average job time: 91s 1.52m 0.03h 0.00d -# Longest finished job: 851s 14.18m 0.24h 0.01d -# Submission to last job: 324649s 5410.82m 90.18h 3.76d +# Completed: 3079567 of 3079567 jobs +# CPU time in finished jobs: 249034274s 4150571.23m 69176.19h 2882.34d 7.897 y +# IO & Wait Time: 6571097s 109518.28m 1825.30h 76.05d 0.208 y +# Average job time: 83s 1.38m 0.02h 0.00d +# Longest finished job: 338s 5.63m 0.09h 0.00d +# Submission to last job: 288453s 4807.55m 80.13h 3.34d -# # Number of specScores: 233102255 - - ### remember to get back to hgwdev to run this - time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ - -continue=effScores -stop=load canFam5 genscan \ - -shoulder=250000000 -tableName=crisprAll -fileServer=hgwdev \ - -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ - -workhorse=hgwdev) > load.log 2>&1 - # real 932m13.229s + grep "Number of" load.log | grep Scores | grep "^#" +# Number of specScores: 231816384 +# Number of effScores: 252358865 cat effScores/run.time | sed -e 's/^/# /;' -# Completed: 25662 of 25662 jobs -# CPU time in finished jobs: 
12763858s 212730.96m 3545.52h 147.73d 0.405 y -# IO & Wait Time: 144123s 2402.05m 40.03h 1.67d 0.005 y -# Average job time: 503s 8.38m 0.14h 0.01d -# Longest finished job: 4091s 68.18m 1.14h 0.05d -# Submission to last job: 15067s 251.12m 4.19h 0.17d +# Completed: 25231 of 25231 jobs +# CPU time in finished jobs: 12713218s 211886.96m 3531.45h 147.14d 0.403 y +# IO & Wait Time: 150199s 2503.32m 41.72h 1.74d 0.005 y +# Average job time: 510s 8.50m 0.14h 0.01d +# Longest finished job: 6617s 110.28m 1.84h 0.08d +# Submission to last job: 14126s 235.43m 3.92h 0.16d cat offTargets/run.time | sed -e 's/^/# /;' -# Completed: 154829 of 154829 jobs -# CPU time in finished jobs: 1805712s 30095.20m 501.59h 20.90d 0.057 y -# IO & Wait Time: 3128264s 52137.73m 868.96h 36.21d 0.099 y -# Average job time: 32s 0.53m 0.01h 0.00d -# Longest finished job: 273s 4.55m 0.08h 0.00d -# Submission to last job: 5337s 88.95m 1.48h 0.06d +# Completed: 153979 of 153979 jobs +# CPU time in finished jobs: 1739935s 28998.91m 483.32h 20.14d 0.055 y +# IO & Wait Time: 2672538s 44542.31m 742.37h 30.93d 0.085 y +# Average job time: 29s 0.48m 0.01h 0.00d +# Longest finished job: 53s 0.88m 0.01h 0.00d +# Submission to last job: 4617s 76.95m 1.28h 0.05d ######################################################################### # all.joiner update, downloads and in pushQ - (WORKING - 2019-11-20 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # verify all the business is done for release ~/kent/src/hg/utils/automation/verifyBrowser.pl canFam5 -# 66 tables in database canFam5 - Dog, Canis lupus familiaris -# verified 55 tables in database canFam5, 11 extra tables, 14 optional tables +# 71 tables in database canFam5 - Dog, Canis lupus familiaris +# verified 60 tables in database canFam5, 11 extra tables, 19 optional tables +# Ensembl genes 5 optional tables # chainNetRBestHg38 3 optional tables # chainNetRBestMm10 3 optional tables # chainNetSynHg38 3 optional tables # chainNetSynMm10 3 optional tables # 
gapOverlap 1 optional tables # tandemDups 1 optional tables -# 1 chainCanFam3 - extra table -# 2 chainCanFam3Link - extra table -# 3 chainRBestCanFam3 - extra table -# 4 chainRBestCanFam3Link - extra table +# 1 chainMm39 - extra table +# 2 chainMm39Link - extra table +# 3 chainRBestMm39 - extra table +# 4 chainRBestMm39Link - extra table # . . . etc . . . # 8 crisprAllTargets - extra table -# 9 netCanFam3 - extra table -# 10 netRBestCanFam3 - extra table -# 11 netSynCanFam3 - extra table +# 9 netMm39 - extra table +# 10 netRBestMm39 - extra table +# 11 netSynMm39 - extra table # 13 genbank tables found # verified 28 required tables, 1 missing tables # 1 ucscToRefSeq - missing table # hg38 chainNet to canFam5 found 3 required tables # mm10 chainNet to canFam5 found 3 required tables # hg38 chainNet RBest and syntenic to canFam5 found 6 optional tables # mm10 chainNet RBest and syntenic to canFam5 found 3 optional tables -# liftOver to previous versions: 1, from previous versions: 1 +# liftOver to previous versions: 2, from previous versions: 2 +# blatServers: canFam5 blat1b 17907 0 1 canFam5 blat1b 17906 1 0 # fixup all.joiner until this is a clean output joinerCheck -database=canFam5 -tableCoverage all.joiner joinerCheck -database=canFam5 -times all.joiner joinerCheck -database=canFam5 -keys all.joiner # when clean, check in: git commit -m 'adding rules for canFam5 refs #25917' all.joiner git push # run up a 'make alpha' in hg/hgTables to get this all.joiner file # into the hgwdev/genome-test system cd /hive/data/genomes/canFam5 time (makeDownloads.pl canFam5) > downloads.log 2>&1 - # real 16m11.233s + # real 15m31.624s # now ready for pushQ entry mkdir /hive/data/genomes/canFam5/pushQ cd /hive/data/genomes/canFam5/pushQ time ($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList canFam5) > canFam5.pushQ.sql 2> stderr.out - # real 15m2.385s + # real 11m11.758s # remove the tandemDups and gapOverlap from the file list: sed -i -e "/tandemDups/d" 
redmine.canFam5.table.list sed -i -e "/Tandem Dups/d" redmine.canFam5.releaseLog.txt sed -i -e "/gapOverlap/d" redmine.canFam5.table.list sed -i -e "/Gap Overlaps/d" redmine.canFam5.releaseLog.txt # check for errors in stderr.out, some are OK, e.g.: # WARNING: canFam5 does not have ucscToRefSeq # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqVersion.txt # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.bb # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.ix # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/ncbiRefSeqOther.ixx # WARNING: hgwdev does not have /gbdb/canFam5/ncbiRefSeq/seqNcbiRefSeq.rna.fa # WARNING: canFam5 does not have seq # WARNING: canFam5 does not have extFile # verify the file list does correctly match to files cat redmine.canFam5.file.list | while read L do eval ls $L > /dev/null done # should be silent, missing files will show as errors # verify database tables, how many to expect: wc -l redmine.canFam5.table.list - # 52 redmine.canFam5.table.list + # 57 redmine.canFam5.table.list # how many actual: awk -F'.' '{printf "hgsql -N %s -e '"'"'show table status like \"%s\";'"'"'\n", $1, $2}' redmine.canFam5.table.list | sh | wc -l - # 52 + # 57 # would be a smaller number actual if some were missing # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: # /hive/data/genomes/canFam5/pushQ/redmine.canFam5.file.list # /hive/data/genomes/canFam5/pushQ/redmine.canFam5.releaseLog.txt # /hive/data/genomes/canFam5/pushQ/redmine.canFam5.table.list #########################################################################