d8751b5445ffe3667e72aeb5041b724a1877a392 hiram Mon Sep 9 10:00:16 2019 -0700 initial build is sufficient for Mark.s testing refs #22271 diff --git src/hg/makeDb/doc/GRCm38B/initialBuild.txt src/hg/makeDb/doc/GRCm38B/initialBuild.txt index 252b925..23c7517 100644 --- src/hg/makeDb/doc/GRCm38B/initialBuild.txt +++ src/hg/makeDb/doc/GRCm38B/initialBuild.txt @@ -280,187 +280,148 @@ -twoBit=/hive/data/genomes/GRCm38B/GRCm38B.unmasked.2bit GRCm38B ) \ > do.log 2>&1 & # real 1m3.608s # there are only 3 items cut -f2-5 bed.tab chr15 8995648 9001806 chr15:8995649-9001806 chr6 47665147 47715755 chr6:47665148-47715755 chrX_GPS_017319198v1_random 240296 243009 chrX_GPS_017319198v1_random:240297-243009 cat fb.GRCm38B.gapOverlap.txt # 1720 bases of 2729890658 (0.000%) in intersection ############################################################################# -# tandemDups (TBD - 2018-10-12 - Hiram) +# tandemDups (DONE - 2018-10-12 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/tandemDups cd /hive/data/genomes/GRCm38B/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/GRCm38B/GRCm38B.unmasked.2bit GRCm38B) \ > do.log 2>&1 & -XXX - running - Thu Feb 28 14:39:26 PST 2019 # real 97m29.383s + # one job took too much memory, run on hgwdev: + cd /hive/data/genomes/GRCm38B/bed/tandemDups/pairedEnds + time (./runOne 29 20000 chrY tmp/chrY.bed.gz) > lastOne.log 2>&1 + # real 28m40.324s + + # then continuing: + time (~/kent/src/hg/utils/automation/doTandemDup.pl -continue=collapsePairedEnds \ + -twoBit=/hive/data/genomes/GRCm38B/GRCm38B.unmasked.2bit GRCm38B) \ + > collapsePairedEnds.log 2>&1 & + # real 3m24.436s + cat fb.GRCm38B.tandemDups.txt - # 24887623 bases of 1065365425 (2.336%) in intersection + # 66541045 bases of 2729890658 (2.437%) in intersection bigBedInfo GRCm38B.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 -# itemCount: 346,400 -# 
primaryDataSize: 8,843,385 -# primaryIndexSize: 38,860 +# itemCount: 857,692 +# primaryDataSize: 22,480,849 +# primaryIndexSize: 62,968 # zoomLevels: 9 -# chromCount: 407 -# basesCovered: 114,644,428 -# meanDepth (of bases covered): 21.207643 +# chromCount: 61 +# basesCovered: 1,408,486,330 +# meanDepth (of bases covered): 5.076515 # minDepth: 1.000000 -# maxDepth: 298.000000 -# std of depth: 35.518221 +# maxDepth: 240.000000 +# std of depth: 8.787741 ######################################################################### -# ucscToINSDC and ucscToRefSeq table/track (TBD - 2019-02-28 - Hiram) - # construct idKeys for the refseq sequence - mkdir /hive/data/genomes/GRCm38B/refseq/idKeys - cd /hive/data/genomes/GRCm38B/refseq/idKeys - faToTwoBit ../GCF_000002315.5_GRCg6a_genomic.fna.gz GRCm38B.refSeq.2bit - - time (doIdKeys.pl -buildDir=`pwd` \ - -twoBit=`pwd`/GRCm38B.refSeq.2bit refseqGRCm38B) > do.log 2>&1 & - # real 0m48.786s - - cat refseqGRCm38B.keySignature.txt - # 7850e2d5dabb6134fdc9d7083f1a3a54 - - # and the genbank sequence needs keys too: - mkdir /hive/data/genomes/GRCm38B/refseq/idKeysGenbank - cd /hive/data/genomes/GRCm38B/refseq/idKeysGenbank - faToTwoBit /hive/data/outside/ncbi/genomes/genbank/vertebrate_other/Gallus_gallus/all_assembly_versions/GCA_000002315.5_GRCg6a/GCA_000002315.5_GRCg6a_genomic.fna.gz GRCm38B.genbank.2bit - - time (doIdKeys.pl -buildDir=`pwd` \ - -twoBit=`pwd`/GRCm38B.genbank.2bit genbankGRCm38B) > do.log 2>&1 & - - cat genbankGRCm38B.keySignature.txt - # a20fdad3318d371fcb34fcc66bab3752 +# ucscToSanger and ucscToRefSeq table/track (DONE - 2019-03-01 - Hiram) + # using the name correspondence listings established before - mkdir /hive/data/genomes/GRCm38B/bed/chromAlias - - join -t$'\t' ../idKeys/GRCm38B.idKeys.txt \ - ../../refseq/idKeysGenbank/genbankGRCm38B.idKeys.txt | cut -f2- \ - | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ - | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ - | sort -k1,1 -k2,2n > 
ucscToINSDC.bed + mkdir /hive/data/genomes/GRCm38B/bed/ucscToINSDC + cd /hive/data/genomes/GRCm38B/bed/ucscToINSDC - join -t$'\t' ../idKeys/GRCm38B.idKeys.txt \ - ../../refseq/idKeys/refseqGRCm38B.idKeys.txt | cut -f2- \ - | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ - | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ +( join -2 2 -t$'\t' <(awk -F$'\t' 'BEGIN{OFS="\t"}{print $1,$3}' ../../ucsc/sequence.name.Xref.tab | sort) \ + <( awk -F$'\t' 'BEGIN{OFS="\t"}{printf "%s\t%s\n", $2, $1}' ../../ucsc/seqName.to.UCSC.name.txt | sort -k2) | awk '{printf "%s\t%s\n", $3,$2}' \ + | sort | join -t$'\t' <(sort ../../chrom.sizes) - \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' ; \ +printf "chrM\t0\t16299\tNC_005089.1\n" ) \ | sort -k1,1 -k2,2n > ucscToRefSeq.bed - # should be same line counts throughout: - wc -l * ../../chrom.sizes - # 463 ucscToINSDC.bed - # 464 ucscToRefSeq.bed - # 464 ../../chrom.sizes - - # need to find the accession for the INSDC equivalent to chrM: - egrep chrM * -# ucscToRefSeq.bed:chrM 0 16775 NC_001323.1 - # lookup that accession at NCBI Entrez: X52392.1 - # and add to ucscToINSDC.bed: - printf "chrM\t0\t16775\tX52392.1\n" >> ucscToINSDC.bed - # verify: - grep chrM * -# ucsc.genbank.tab:chrM X52392.1 -# ucsc.refseq.tab:chrM NC_001323.1 -# ucscToINSDC.bed:chrM 0 16775 X52392.1 -# ucscToRefSeq.bed:chrM 0 16775 NC_001323.1 - - export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` - echo $chrSize - # 27 - # use the $chrSize in this sed - sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ - | hgLoadSqlTab GRCm38B ucscToINSDC stdin ucscToINSDC.bed - # should be the same for ucscToRefSeq: +(awk -F$'\t' 'BEGIN{OFS="\t"}{printf "%s\t%s\n", $2, $1}' ../../ucsc/seqName.to.UCSC.name.txt \ + | sort | join -t$'\t' <(sort ../../chrom.sizes) - \ + | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' ; \ +printf "chrM\t0\t16299\tMT\n" ) \ + | sort -k1,1 -k2,2n > ucscToSanger.bed + export chrSize=`cut -f1 
ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 27 +# use the $chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ - | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab GRCm38B ucscToRefSeq stdin ucscToRefSeq.bed +# same $chrSize in this sed + +sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ + | hgLoadSqlTab GRCm38B ucscToSanger stdin ucscToSanger.bed + # should be quiet for all OK checkTableCoords GRCm38B # should cover 100% entirely: - featureBits -countGaps GRCm38B ucscToINSDC - # 1065365425 bases of 1065365425 (100.000%) in intersection + featureBits -countGaps GRCm38B ucscToSanger + # 2729890658 bases of 2729890658 (100.000%) in intersection featureBits -countGaps GRCm38B ucscToRefSeq - # 1065365425 bases of 1065365425 (100.000%) in intersection + # 2729890658 bases of 2729890658 (100.000%) in intersection ######################################################################### -# add chromAlias table (TBD - 2018-10-12 - ChrisL) +# add chromAlias table (DONE - 2019-03-01 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/chromAlias cd /hive/data/genomes/GRCm38B/bed/chromAlias hgsql -N -e 'select chrom,name from ucscToRefSeq;' GRCm38B \ | sort -k1,1 > ucsc.refseq.tab - hgsql -N -e 'select chrom,name from ucscToINSDC;' GRCm38B \ - | sort -k1,1 > ucsc.genbank.tab - - ### Adding Ensembl alias with v95 release, after idKeys made: 2019-01-16 - join -t$'\t' ../idKeys/GRCm38B.idKeys.txt \ - ../../ens95/ensGRCm38B.idKeys.txt | cut -f2- \ - | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ - | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ - | sort -k1,1 -k2,2n > ucscToEns.bed - cut -f1,4 ucscToEns.bed | sort > ucsc.ensembl.tab - wc -l *.bed -# 2210 ucscToEns.bed -# 2211 ucscToINSDC.bed -# 2211 ucscToRefSeq.bed + hgsql -N -e 'select chrom,name from ucscToSanger;' GRCm38B \ + | sort -k1,1 > ucsc.sanger.tab + + wc -l *.tab +# 64 ucsc.refseq.tab +# 64 ucsc.sanger.tab 
~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > GRCm38B.chromAlias.tab -for t in refseq genbank ensembl +for t in refseq sanger do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t GRCm38B.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done -# checking refseq: 464 =? 464 OK -# checking genbank: 464 =? 464 OK -# checking ensembl: 464 =? 464 OK +# checking refseq: 64 =? 64 OK +# checking sanger: 64 =? 64 OK hgLoadSqlTab GRCm38B chromAlias ~/kent/src/hg/lib/chromAlias.sql \ GRCm38B.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (TBD - 2019-02-28 - Hiram) - cd ~/kent/src/hg/makeDb/trackDb/chicken/GRCm38B + cd ~/kent/src/hg/makeDb/trackDb/mouse/GRCm38B # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" GRCm38B \ | sed -e 's/[0-9][0-9]*//;' | sort | uniq -c 1519 AADN.1 124 AC.1 313 AC.2 328 AC.3 74 AC.4 20 AC.5 1 AC.6 1 NC_.1 # implies a rule: '[AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)?' # verify this rule will find them all and eliminate them all: @@ -479,323 +440,361 @@ searchTable gold shortCircuit 1 termRegex [AN][AC][D0-9_][N0-9][0-9]+(\.[0-9]+)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box ########################################################################## # running repeat masker (DONE - 2019-02-28 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/repeatMasker cd /hive/data/genomes/GRCm38B/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku GRCm38B) > do.log 2>&1 -XXX - running Thu Feb 28 14:41:37 PST 2019 # real 48m25.181s cat faSize.rmsk.txt -# 1065365425 bases (9784466 N's 1055580959 real 922186059 upper -# 133394900 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %12.52 masked total, %12.64 masked real +# 2729890658 bases (74605238 N's 2655285420 real 1470398475 upper +# 1184886945 lower) in 64 sequences in 1 files +# Total size: mean 42654541.5 sd 64150638.3 min 1976 +# (chr4_GPS_017319191v1_random) max 195284574 (chr1) median 191905 +# %43.40 masked total, %44.62 masked real egrep -i "versi|relea" do.log - # RepeatMasker version open-4.0.7 - # February 01 2017 (open-4-0-7) 1.331 version of RepeatMasker - # CC Dfam_Consensus RELEASE 20170127; * - # CC RepBase RELEASE 20170127; +# February 01 2017 (open-4-0-8) 1.332 version of RepeatMasker +# CC Dfam_Consensus RELEASE 20181026; * +# CC RepBase RELEASE 20181026; time featureBits -countGaps GRCm38B rmsk - # 133395265 bases of 1065365425 (12.521%) in intersection - # real 0m4.226s + # 1184890066 bases of 2729890658 (43.404%) in intersection + # real 0m23.932s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' GRCm38B \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" - # total 133395265.000000 - # real 0m3.198s + # total 1184890066.000000 + # real 0m22.132s ########################################################################## # running simple repeat (DONE - 2019-02-28 - Hiram) # The '-trf409 4' is a bit smaller than human which is 6 mkdir /hive/data/genomes/GRCm38B/bed/simpleRepeat cd /hive/data/genomes/GRCm38B/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409=4 GRCm38B) > do.log 2>&1 -XXX - running - Thu Feb 28 14:42:10 PST 2019 # real 58m3.288s cat fb.simpleRepeat # 31110690 bases of 1055588482 (2.947%) in intersection cd /hive/data/genomes/GRCm38B # using the Window Masker result: cd /hive/data/genomes/GRCm38B - twoBitMask bed/windowMasker/GRCm38B.cleanWMSdust.2bit \ - -add bed/simpleRepeat/trfMask.bed GRCm38B.2bit +# twoBitMask bed/windowMasker/GRCm38B.cleanWMSdust.2bit \ +# -add bed/simpleRepeat/trfMask.bed GRCm38B.2bit # you can safely ignore the warning about fields >= 13 # add to rmsk after it is done: -# twoBitMask GRCm38B.rmsk.2bit \ -# -add bed/simpleRepeat/trfMask.bed GRCm38B.2bit + twoBitMask GRCm38B.rmsk.2bit \ + -add bed/simpleRepeat/trfMask.bed GRCm38B.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa GRCm38B.2bit stdout | faSize stdin > faSize.GRCm38B.2bit.txt - cat faSize.GRCm38B.2bit.txt -# 1065365425 bases (9784466 N's 1055580959 real 829559086 upper -# 226021873 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %21.22 masked total, %21.41 masked real + 
cat faSize.GRCm38B.2bit.txt | fold -s -w 72 | sed -e 's/^/# /;' +# 2729890658 bases (74605238 N's 2655285420 real 1468299458 upper +# 1186985962 lower) in 64 sequences in 1 files +# Total size: mean 42654541.5 sd 64150638.3 min 1976 +# (chr4_GPS_017319191v1_random) max 195284574 (chr1) median 191905 +# %43.48 masked total, %44.70 masked real rm /gbdb/GRCm38B/GRCm38B.2bit ln -s `pwd`/GRCm38B.2bit /gbdb/GRCm38B/GRCm38B.2bit ######################################################################### -# CREATE MICROSAT TRACK (TBD - 2019-02-28 - Hiram) +# CREATE MICROSAT TRACK (DONE - 2019-03-01 - Hiram) ssh hgwdev mkdir /cluster/data/GRCm38B/bed/microsat cd /cluster/data/GRCm38B/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed GRCm38B microsat microsat.bed - # Read 1745 elements of size 4 from microsat.bed + # Read 197377 elements of size 4 from microsat.bed ########################################################################## -## WINDOWMASKER (TBD - 2019-02-28 - Hiram) +## WINDOWMASKER (DONE - 2019-03-01 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/windowMasker cd /hive/data/genomes/GRCm38B/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev GRCm38B) > do.log 2>&1 # real 26m58.753s # Masking statistics cat faSize.GRCm38B.cleanWMSdust.txt -# 1065365425 bases (9784466 N's 1055580959 real 830149186 upper -# 225431773 lower) in 464 sequences in 1 files -# Total size: mean 2296046.2 sd 14494999.8 min 87 (chrUn_NW_020109844v1) -# max 197608386 (chr1) median 10066 -# %21.16 masked total, %21.36 masked real +# 2729890658 bases (74605238 N's 2655285420 real 1670953600 upper +# 984331820 lower) in 64 sequences in 1 files +# Total size: mean 42654541.5 sd 64150638.3 min 1976 +# (chr4_GPS_017319191v1_random) max 195284574 (chr1) median 191905 +# %36.06 masked total, %37.07 masked real cat 
fb.GRCm38B.rmsk.windowmaskerSdust.txt - # 86091413 bases of 1065365425 (8.081%) in intersection + # 751642088 bases of 2729890658 (27.534%) in intersection ########################################################################## -# cpgIslands - (TBD - 2019-02-28 - Hiram) +# cpgIslands - (DONE - 2019-03-01 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/cpgIslands cd /hive/data/genomes/GRCm38B/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku GRCm38B) > do.log 2>&1 +XXX - running - Fri Mar 1 11:03:58 PST 2019 # real 2m5.105s cat fb.GRCm38B.cpgIslandExt.txt # 16395346 bases of 1055588482 (1.553%) in intersection ############################################################################## -# genscan - (TBD - 2019-02-28 - Hiram) +# genscan - (DONE - 2019-03-01 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/genscan cd /hive/data/genomes/GRCm38B/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku GRCm38B) > do.log 2>&1 +XXX - running - Fri Mar 1 11:05:11 PST 2019 # real 88m34.900s cat fb.GRCm38B.genscan.txt # 23911678 bases of 1055588482 (2.265%) in intersection cat fb.GRCm38B.genscanSubopt.txt # 24521608 bases of 1055588482 (2.323%) in intersection ######################################################################### -# Create kluster run files (TBD - 2019-02-28 - Hiram) +# Create kluster run files (DONE - 2019-03-01 - Hiram) # numerator is GRCm38B gapless bases "real" as reported by: featureBits -noRandom -noHap GRCm38B gap - # 9758843 bases of 1040397755 (0.938%) in intersection + # 74582633 bases of 2650898416 (2.813%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: - calc \( 1040397755 / 2861349177 \) \* 1024 - # ( 1040397755 / 2861349177 ) * 1024 = 372.330406 + calc \( 2650898416 / 
2861349177 \) \* 1024 + # ( 2650898416 / 2861349177 ) * 1024 = 948.685326 - # ==> use -repMatch=350 according to size scaled down from 1024 for human. + # ==> use -repMatch=900 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/GRCm38B blat GRCm38B.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/GRCm38B.11.ooc \ - -repMatch=350 - # Wrote 18169 overused 11-mers to jkStuff/GRCm38B.11.ooc + -repMatch=900 + # Wrote 31833 overused 11-mers to jkStuff/GRCm38B.11.ooc + + # mm10 had at repMatch=900 + # Wrote 31822 overused 11-mers to jkStuff/mm10.11.ooc # check non-bridged gaps to see what the typical size is: hgsql -N \ -e 'select * from gap where bridge="no" order by size;' GRCm38B \ | sort -k7,7nr | ave -col=7 stdin - # minimum gap size is 10 and produces a reasonable number of lifts - gapToLift -verbose=2 -minGap=10 GRCm38B jkStuff/nonBridged.lft \ - -bedFile=jkStuff/nonBridged.bed - wc -l jkStuff/nonBri* - # 525 jkStuff/nonBridged.bed - # 525 jkStuff/nonBridged.lft +# Q1 50000.000000 +# median 50000.000000 +# Q3 100000.000000 +# average 422673.295455 +# min 30000.000000 +# max 3100000.000000 +# count 176 +# total 74390500.000000 +# standard deviation 936342.502856 + + # minimum gap size is 10000 and produces a reasonable number of lifts + gapToLift -verbose=2 -minGap=10000 GRCm38B jkStuff/GRCm38B.nonBridged.lft \ + -bedFile=jkStuff/GRCm38B.nonBridged.bed + wc -l jkStuff/GRCm38B.nonBri* + # 198 jkStuff/GRCm38B.nonBridged.bed + # 198 jkStuff/GRCm38B.nonBridged.lft ######################################################################## -# lastz/chain/net swap human/hg38 (TBD - 2018-10-12 - Hiram) +# lastz/chain/net swap human/hg38 (DONE - 2019-03-01 - Hiram) # original alignment - cd /hive/data/genomes/hg38/bed/lastzGRCm38B.2018-10-12 + + cd /hive/data/genomes/hg38/bed/lastzGRCm38B.2019-03-01 cat fb.hg38.chainGRCm38BLink.txt - # 154079940 bases of 3095998939 (4.977%) in intersection + # 967404497 bases of 
3095998939 (31.247%) in intersection cat fb.hg38.chainSynGRCm38BLink.txt - # 95877644 bases of 3095998939 (3.097%) in intersection + # 913717211 bases of 3095998939 (29.513%) in intersection cat fb.hg38.chainRBest.GRCm38B.txt - # 106665747 bases of 3095998939 (3.445%) in intersection + # 891970056 bases of 3095998939 (28.810%) in intersection # and for the swap: mkdir /hive/data/genomes/GRCm38B/bed/blastz.hg38.swap cd /hive/data/genomes/GRCm38B/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/hg38/bed/lastzGRCm38B.2018-10-12/DEF \ - -swap -chainMinScore=5000 -chainLinearGap=loose \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > swap.log 2>&1 - # real 9m45.514s + /hive/data/genomes/hg38/bed/lastzGRCm38B.2019-03-01/DEF \ + -swap -chainMinScore=3000 -chainLinearGap=medium \ + -workhorse=hgwdev -smallClusterHub=hgwdev-101 -bigClusterHub=ku \ + -noDbNameCheck -syntenicNet) > swap.log 2>&1 + # real 60m31.849s cat fb.GRCm38B.chainHg38Link.txt - # 120955955 bases of 1055588482 (11.459%) in intersection - + # 941205213 bases of 2655285420 (35.446%) in intersection cat fb.GRCm38B.chainSynHg38Link.txt - # 92597630 bases of 1055588482 (8.772%) in intersection + # 891450770 bases of 2655285420 (33.573%) in intersection - time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` GRCm38B hg38) > rbest.log 2>&1 & - # real 139m24.408s + time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` GRCm38B hg38) \ + > rbest.log 2>&1 & + # real 331m35.578s cat fb.GRCm38B.chainRBest.Hg38.txt - # 106294585 bases of 1055588482 (10.070%) in intersection + # 893587236 bases of 2655285420 (33.653%) in intersection ######################################################################### -# lastz/chain/net swap mouse/mm10 (TBD - 2018-10-12 - Hiram) +# LIFTOVER TO mm10 (DONE - 2019-03-01 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/GRCm38B/bed/blat.mm10.2019-03-01 + cd /hive/data/genomes/GRCm38B/bed/blat.mm10.2019-03-01 + # 
something has broken since the /scratch/data/ links have disappeared, + # need to fully specify from(target) and to(query) sequences: + doSameSpeciesLiftOver.pl -verbose=2 \ + -fileServer=hgwdev \ + -query2Bit=/hive/data/genomes/GRCm38B/GRCm38B.2bit \ + -querySizes=/hive/data/genomes/GRCm38B/chrom.sizes \ + -target2Bit=/hive/data/genomes/mm10/mm10.2bit \ + -targetSizes=/hive/data/genomes/mm10/chrom.sizes \ + -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ + GRCm38B mm10 - # original alignment - cd /hive/data/genomes/mm10/bed/lastzGRCm38B.2018-10-12 - cat fb.mm10.chainGRCm38BLink.txt - # 101151132 bases of 2652783500 (3.813%) in intersection - cat fb.mm10.chainSynGRCm38BLink.txt - # 70707720 bases of 2652783500 (2.665%) in intersection - cat fb.mm10.chainRBest.GRCm38B.txt - # 79649474 bases of 2652783500 (3.002%) in intersection + time (doSameSpeciesLiftOver.pl -verbose=2 \ + -fileServer=hgwdev \ + -query2Bit=/hive/data/genomes/GRCm38B/GRCm38B.2bit \ + -querySizes=/hive/data/genomes/GRCm38B/chrom.sizes \ + -target2Bit=/hive/data/genomes/mm10/mm10.2bit \ + -targetSizes=/hive/data/genomes/mm10/chrom.sizes \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ + GRCm38B mm10) > doLiftOverToMm10.log 2>&1 + # real 410m27.012s - # and for the swap: - mkdir /hive/data/genomes/GRCm38B/bed/blastz.mm10.swap - cd /hive/data/genomes/GRCm38B/bed/blastz.mm10.swap + # see if the liftOver menus function in the browser from GRCm38B to mm10 - time (doBlastzChainNet.pl -verbose=2 \ - /hive/data/genomes/mm10/bed/lastzGRCm38B.2018-10-12/DEF \ - -swap -chainMinScore=5000 -chainLinearGap=loose \ - -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ - -syntenicNet) > swap.log 2>&1 - # real 6m41.043s + # load up as a track for viewing: + mkdir /hive/data/genomes/GRCm38B/bed/liftOverPsl + cd /hive/data/genomes/GRCm38B/bed/liftOverPsl + ln -s 
../blat.mm10.2019-03-01/GRCm38BToMm10.over.chain.gz . + hgLoadChain GRCm38B chainMm10 *.gz + # Loading 176 chains into GRCm38B.chainMm10 - cat fb.GRCm38B.chainMm10Link.txt - # 88539346 bases of 1055588482 (8.388%) in intersection + featureBits GRCm38B chainMm10 + # 2660658288 bases of 2655285420 (100.202%) in intersection - time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` GRCm38B mm10) > rbest.log 2>&1 & - # real 94m11.007s + # let's see what it looks like as a PSL + chainToPsl GRCm38BToMm10.over.chain.gz \ + /hive/data/genomes/mm10/chrom.sizes \ + /hive/data/genomes/GRCm38B/chrom.sizes \ + /hive/data/genomes/mm10/mm10.2bit \ + /hive/data/genomes/GRCm38B/GRCm38B.2bit \ + GRCm38BToMm10.over.chain.psl - cat fb.GRCm38B.chainRBest.Mm10.txt - # 79474812 bases of 1055588482 (7.529%) in intersection + pslSwap GRCm38BToMm10.over.chain.psl Mm10ToGRCm38B.over.chain.psl + + hgLoadPsl GRCm38B -table=pslChainMm10 Mm10ToGRCm38B.over.chain.psl ######################################################################### -# GENBANK AUTO UPDATE (TBD - 2018-10-12 - Hiram) +# GENBANK AUTO UPDATE (DONE - 2019-03-01 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # #organism mrnaCnt estCnt refSeqCnt - # Gallus gallus 30708 600485 6392 + # Mus musculus 581265 4871046 35622 # edit etc/genbank.conf to add GRCm38B just before galGal5 # GRCm38B (chicken/GCF_000002315.5_GRCg6a) GRCm38B.serverGenome = /hive/data/genomes/GRCm38B/GRCm38B.2bit GRCm38B.clusterGenome = /hive/data/genomes/GRCm38B/GRCm38B.2bit GRCm38B.ooc = /hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc GRCm38B.lift = /hive/data/genomes/GRCm38B/jkStuff/nonBridged.lft GRCm38B.perChromTables = no GRCm38B.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} GRCm38B.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} GRCm38B.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} 
GRCm38B.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} GRCm38B.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} GRCm38B.genbank.est.xeno.pslCDnaFilter = ${finished.genbank.est.xeno.pslCDnaFilter} GRCm38B.refseq.mrna.native.load = yes GRCm38B.refseq.mrna.xeno.load = yes GRCm38B.genbank.mrna.xeno.load = yes GRCm38B.downloadDir = GRCm38B # GRCm38B.upstreamGeneTbl = refGene # GRCm38B.upstreamMaf = multiz7way /hive/data/genomes/galGal4/bed/multiz7way/species.lst # verify the files specified exist before checking in the file: grep ^GRCm38B etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og -# -rw-rw-r-- 1 313201328 Oct 11 15:51 /hive/data/genomes/GRCm38B/GRCm38B.2bit -# -rw-rw-r-- 1 313201328 Oct 11 15:51 /hive/data/genomes/GRCm38B/GRCm38B.2bit -# -rw-rw-r-- 1 72684 Oct 11 15:56 /hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc -# -rw-rw-r-- 1 29513 Oct 11 15:57 /hive/data/genomes/GRCm38B/jkStuff/nonBridged.lft +# -rw-rw-r-- 1 714715484 Mar 1 11:01 /hive/data/genomes/GRCm38B/GRCm38B.2bit +# -rw-rw-r-- 1 127340 Mar 1 11:07 /hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc +# -rw-rw-r-- 1 9030 Mar 1 11:19 /hive/data/genomes/GRCm38B/jkStuff/GRCm38B.nonBridged.lft - git commit -m "Added GRCm38B; refs #22113" etc/genbank.conf + git commit -m "Added GRCm38B; refs #22271" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update + # adding to src/lib/gbGenome.c + # {"GRCm", mmNames}, + git commit -m 'Adding GRCm pointing to mmNames refs #22271' src/lib/gbGenome.c + git push + make install-server + # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add GRCm38B to: - # etc/align.dbs etc/hgwdev.dbs - git add etc/align.dbs etc/hgwdev.dbs - git commit -m "Added GRCm38B - chicken refs #22113" etc/hgwdev.dbs + # etc/hgwdev.dbs + git add etc/hgwdev.dbs + git commit -m "Added GRCm38B - mouse refs #22271" etc/hgwdev.dbs git push make etc-update # wait a few 
days for genbank magic to take place, the tracks will # appear ############################################################################# -# augustus gene track (TBD - 2018-10-12 - Hiram) +# augustus gene track (DONE - 2019-03-01 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/augustus cd /hive/data/genomes/GRCm38B/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ - -species=chicken -dbHost=hgwdev \ + -species=human -dbHost=hgwdev \ -workhorse=hgwdev GRCm38B) > do.log 2>&1 - # real 48m48.597s + # real 104m16.007s cat fb.GRCm38B.augustusGene.txt - # 25827925 bases of 1055588482 (2.447%) in intersection + # 49274221 bases of 2655285420 (1.856%) in intersection ######################################################################### # ncbiRefSeq (TBD - 2018-10-12 - Hiram) mkdir /hive/data/genomes/GRCm38B/bed/ncbiRefSeq cd /hive/data/genomes/GRCm38B/bed/ncbiRefSeq # running step wise just to be careful time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_other Gallus_gallus \ GCF_000002315.5_GRCg6a GRCm38B) > download.log 2>&1 # real 1m19.029s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ @@ -827,67 +826,34 @@ # this makes the 'protein' link disappear from the gene details page # curious that this gene is marked as a non-coding gene anyway ? 
# gene: FET1 at chr4:63,102,774-63,105,516- featureBits -enrichment GRCm38B refGene ncbiRefSeq # refGene 1.374%, ncbiRefSeq 8.397%, both 1.370%, cover 99.73%, enrich 11.88x featureBits -enrichment GRCm38B ncbiRefSeq refGene # ncbiRefSeq 8.397%, refGene 1.374%, both 1.370%, cover 16.32%, enrich 11.88x featureBits -enrichment GRCm38B ncbiRefSeqCurated refGene # ncbiRefSeqCurated 1.368%, refGene 1.374%, both 1.364%, cover 99.71%, enrich 72.59x featureBits -enrichment GRCm38B refGene ncbiRefSeqCurated # refGene 1.374%, ncbiRefSeqCurated 1.368%, both 1.364%, cover 99.32%, enrich 72.59x ######################################################################### -# LIFTOVER TO galGal5 (TBD - 2019-02-28 - Hiram) - ssh hgwdev - mkdir /hive/data/genomes/GRCm38B/bed/blat.galGal5.2019-02-28 - cd /hive/data/genomes/GRCm38B/bed/blat.galGal5.2019-02-28 - doSameSpeciesLiftOver.pl -verbose=2 \ - -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ - GRCm38B galGal5 - time (doSameSpeciesLiftOver.pl -verbose=2 \ - -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ - GRCm38B galGal5) > doLiftOverToGalGal5.log 2>&1 - # real 156m30.215s - - # see if the liftOver menus function in the browser from GRCm38B to galGal5 - -######################################################################### -# LIFTOVER TO galGal4 (TBD - 2018-10-12 - Hiram) - ssh hgwdev - mkdir /hive/data/genomes/GRCm38B/bed/blat.galGal4.2018-10-12 - cd /hive/data/genomes/GRCm38B/bed/blat.galGal4.2018-10-12 - doSameSpeciesLiftOver.pl -verbose=2 \ - -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ - GRCm38B galGal4 - time (doSameSpeciesLiftOver.pl -verbose=2 \ - -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ - -ooc=/hive/data/genomes/GRCm38B/jkStuff/GRCm38B.11.ooc \ - GRCm38B galGal4) > doLiftOverToGalGal4.log 2>&1 & - # 
real 36m10.254s - - # see if the liftOver menus function in the browser from GRCm38B to galGal5 - -######################################################################### # BLATSERVERS ENTRY (TBD - 2018-10-12 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev +XXX - requested - Fri Mar 1 17:36:11 PST 2019 hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("GRCm38B", "blat1a", "17892", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("GRCm38B", "blat1a", "17893", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ ## reset default position to MEPE gene (egg shell protein) ## (TBD - 2018-10-12 - Hiram) # as found from the galGal5 to GRCm38B liftOver ssh hgwdev hgsql -e 'update dbDb set defaultPos="chr4:45667017-45672928"