97cc4f47f34fca7bebb8bc8733d54a57a8d98839 hiram Fri Dec 9 14:25:51 2022 -0800 liftOver canFam6 to canFam2 refs #30374 diff --git src/hg/makeDb/doc/canFam6/initialBuild.txt src/hg/makeDb/doc/canFam6/initialBuild.txt index 2a3b381..1b54cd9 100644 --- src/hg/makeDb/doc/canFam6/initialBuild.txt +++ src/hg/makeDb/doc/canFam6/initialBuild.txt @@ -1,1314 +1,1339 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the canFam6 # GCF_000002285.5_Dog10K_Boxer_Tasha # Can use existing photograph (otherwise find one before starting here) ######################################################################### # Initial steps, reuse existing photograph (DONE - 2021-05-10 - Hiram) # To start this initialBuild.txt document, from a previous assembly document: mkdir ~/kent/src/hg/makeDb/doc/canFam6 cd ~/kent/src/hg/makeDb/doc/canFam6 sed -e 's/Fam5/Fam6/g; s/DONE/TBD/g;' \ ../canFam5/initialBuild.txt > initialBuild.txt cd /hive/data/genomes/canFam6 # This is the same individual as canFam3 cp -p ../canFam3/photoReference.txt . cat photoReference.txt ## download from NCBI mkdir /hive/data/genomes/canFam6/refseq cd /hive/data/genomes/canFam6/refseq time rsync -L -a -P --stats \ rsync://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/002/285/GCF_000002285.5_Dog10K_Boxer_Tasha/ ./ # sent 4,942 bytes received 4,079,561,723 bytes 48,278,895.44 bytes/sec # total size is 4,078,547,567 speedup is 1.00 # real 1m23.798s # this information is from the top of # canFam6/refseq/*_assembly_report.txt # (aka: canFam6/refseq/GCF_000002285.5_Dog10K_Boxer_Tasha_assembly_report.txt) # Assembly name: Dog10K_Boxer_Tasha # Organism name: Canis lupus familiaris (dog) # Infraspecific name: breed=boxer # Sex: female # Taxid: 9615 # BioSample: SAMN02953603 # BioProject: PRJNA13179 # Submitter: Dog Genome Sequencing Consortium # Date: 2020-10-06 # Assembly type: haploid # Release type: major # Assembly level: Chromosome # Genome representation: full # WGS project: AAEX04 # Assembly method: Canu v. 1.8 # Expected final version: yes # Genome coverage: 100.0x # Sequencing technology: PacBio Sequel # RefSeq category: Representative Genome # GenBank assembly accession: GCA_000002285.4 # RefSeq assembly accession: GCF_000002285.5 # RefSeq assembly and GenBank assemblies identical: no # ## Assembly-Units: ## GenBank Unit Accession RefSeq Unit Accession Assembly-Unit name ## GCA_000000145.4 GCF_000000145.4 Primary Assembly ## GCA_013123455.1 GCF_000184225.1 non-nuclear # check assembly size for later reference: time faSize G*a_genomic.fna.gz # 2312802198 bases (58852 N's 2312743346 real 1566802244 upper # 745941102 lower) in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (NW_023329752.1) # max 122014068 (NC_006583.4) median 7637 # %32.25 masked total, %32.25 masked real # real 0m31.588s # Survey types of gaps: zgrep -v "^#" *gaps.txt.gz | cut -f5,6 | sort | uniq -c # 1015 within_scaffold paired-ends # And total size in gaps: zcat *gaps.txt.gz | grep -v "^#" | awk '{print $3-$2+1}' | ave stdin \ | sed -e 's/^/# /;' # Q1 25.000000 # median 25.000000 # Q3 100.000000 # average 57.969458 # min 10.000000 # max 100.000000 # count 1015 # total 58839.000000 # standard deviation 37.178047 ############################################################################# # establish config.ra file (DONE - 2021-05-10 - Hiram) cd /hive/data/genomes/canFam6 ~/kent/src/hg/utils/automation/prepConfig.pl canFam6 mammal dog \ refseq/*_assembly_report.txt > canFam6.config.ra # compare with previous version to see if it is sane: diff canFam6.config.ra ../canFam3/canFam3.config.ra # verify it really does look sane cat canFam6.config.ra # config parameters for makeGenomeDb.pl: db canFam6 clade mammal scientificName Canis lupus familiaris commonName Dog assemblyDate Oct. 2020 assemblyLabel Dog Genome Sequencing Consortium assemblyShortLabel Dog10K_Boxer_Tasha orderKey 4660 # mitochondrial sequence included in refseq release # mitoAcc NC_002008.4 mitoAcc none fastaFiles /hive/data/genomes/canFam6/ucsc/*.fa.gz agpFiles /hive/data/genomes/canFam6/ucsc/*.agp # qualFiles none dbDbSpeciesDir dog photoCreditURL http://www.genome.gov/dmd/img.cfm?node=Photos/Animals/Dog&id=79106 photoCreditName NHGRI press photos ncbiGenomeId 85 ncbiAssemblyId 8227741 ncbiAssemblyName Dog10K_Boxer_Tasha ncbiBioProject 13179 ncbiBioSample SAMN02953603 genBankAccessionID GCF_000002285.5 taxId 9615 ############################################################################# # setup UCSC named files (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/ucsc cd /hive/data/genomes/canFam6/ucsc # check for duplicate sequences: time faToTwoBit -noMask ../refseq/G*a_genomic.fna.gz refseq.2bit # real 0m32.374s twoBitDup refseq.2bit # no output is a good result, otherwise, would have to eliminate duplicates # the scripts creating the fasta here will be using this refseq.2bit file # remove it later # compare gaps with what the gaps.gz file reported: twoBitInfo -nBed refseq.2bit refseq.gap.bed awk '{print $3-$2}' *.gap.bed | ave stdin | sed -e 's/^/# /;' # Q1 25.000000 # median 25.000000 # Q3 100.000000 # average 57.249027 # min 1.000000 # max 100.000000 # count 1028 # total 58852.000000 # standard deviation 37.486982 # comparing with above, there are 13 bases here that are not # counted in the NCBI gaps file. See what the AGP says later on here. time ~/kent/src/hg/utils/automation/ucscCompositeAgp.pl \ ../refseq/G*a_genomic.fna.gz \ ../refseq/*_assembly_structure/Primary_Assembly NC_006583.4 chr1 NC_006584.4 chr2 NC_006585.4 chr3 NC_006586.4 chr4 NC_006587.4 chr5 NC_006588.4 chr6 NC_006589.4 chr7 NC_006590.4 chr8 NC_006591.4 chr9 NC_006592.4 chr10 NC_006593.4 chr11 NC_006594.4 chr12 NC_006595.4 chr13 NC_006596.4 chr14 NC_006597.4 chr15 NC_006598.4 chr16 NC_006599.4 chr17 NC_006600.4 chr18 NC_006601.4 chr19 NC_006602.4 chr20 NC_006603.4 chr21 NC_006604.4 chr22 NC_006605.4 chr23 NC_006606.4 chr24 NC_006607.4 chr25 NC_006608.4 chr26 NC_006609.4 chr27 NC_006610.4 chr28 NC_006611.4 chr29 NC_006612.4 chr30 NC_006613.4 chr31 NC_006614.4 chr32 NC_006615.4 chr33 NC_006616.4 chr34 NC_006617.4 chr35 NC_006618.4 chr36 NC_006619.4 chr37 NC_006620.4 chr38 NC_006621.4 chrX real 9m50.617s time ~/kent/src/hg/utils/automation/unplacedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # processed 107 sequences into chrUn.fa.gz # real 0m1.551s # there are no unlocalized in this assembly time ~/kent/src/hg/utils/automation/unlocalizedWithChroms.pl \ ../refseq/*_assembly_structure/Primary_Assembly # bash syntax here mitoAcc=`grep "^# mitoAcc" ../canFam6.config.ra | awk '{print $NF}'` printf "# mitoAcc %s\n" "$mitoAcc" # mitoAcc NC_002008.4 zcat \ ../refseq/*_assembly_structure/non-nuclear/assem*/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e "s/^$mitoAcc/chrM/;" > chrM.agp cat chrM.agp # chrM 1 16727 1 O NC_002008.4 1 16727 + printf ">chrM\n" > chrM.fa twoBitToFa -noMask refseq.2bit:$mitoAcc stdout | grep -v "^>" >> chrM.fa gzip chrM.fa faSize chrM.fa.gz # 16727 bases (0 N's 16727 real 16727 upper 0 lower) in 1 sequences in 1 files # verify fasta and AGPs agree time faToTwoBit *.fa.gz test.2bit # real 0m44.947s cat *.agp | checkAgpAndFa stdin test.2bit 2>&1 | tail -4 # All AGP and FASTA entries agree - both files are valid # and no sequence lost from orginal: twoBitToFa test.2bit stdout | faSize stdin # 2312802198 bases (58852 N's 2312743346 real 2312743346 upper 0 lower) # in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (chrUn_NW_023329752v1) # max 122014068 (chr1) median 7637 # same numbers as above (except for upper/lower masking) # 2312802198 bases (58852 N's 2312743346 real 1566802244 upper # 745941102 lower) in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (NW_023329752.1) # max 122014068 (NC_006583.4) median 7637 # Verify these AGP files define all the gaps: zgrep -w scaffold *.agp | awk '{print $3-$2+1}' | ave stdin # No numerical data column 1 of stdin # XXX It doesn't define any gaps at all, and they all the same: zgrep -v "^#" ../refseq/G*a_genomic_gaps.txt.gz \ | awk '{print $5}' | sort | uniq -c # 1015 within_scaffold # Need to make up gaps with a fake AGP # Would like to use the 'component' identifier to make a fake AGP zgrep -h -v "^#" \ ../refseq/G*ure/Primary_Assembly/assembled_chromosomes/AGP/*.comp.agp.gz \ | awk -F$'\t' '{printf "s/%s/%s/;\n", $1, $6}' > ncbiComponent.sed zgrep -h -v "^#" \ ../refseq/G*ure/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | awk -F$'\t' '{printf "s/%s/%s/;\n", $1, $6}' >> ncbiComponent.sed # using that sed file, convert the fasta names to component names # this works because there is only one component name per sequence, # wouldn't work if the AGP files were multiple lines twoBitToFa refseq.2bit stdout | sed -f ncbiComponent.sed \ | hgFakeAgp -minContigGap=1 -minScaffoldGap=200000 -singleContigs \ stdin fake.agp # convert those names back to UCSC chrom names zgrep -h -v "^#" chr*.agp | awk '{printf "s/%s/%s/;\n", $6,$1}' \ | sort > ncbi.ucsc.sed sed -f ncbi.ucsc.sed fake.agp > canFam6.fake.agp # should have only UCSC names in column 1 cut -f1 canFam6.fake.agp | sort | uniq -c | sort -rn | head | sed -e 's/^/#/;' # 247 chr32 # 193 chr34 # 161 chr1 # 157 chr6 # 155 chr3 # 119 chrX # 93 chr16 # 75 chr37 # 69 chr22 # 55 chr11 cut -f1 canFam6.fake.agp | sort | uniq -c | sort -rn | tail | sed -e 's/^/#/;' # 1 chrUn_NW_023329675v1 # 1 chrUn_NW_023329674v1 # 1 chrUn_NW_023329673v1 # 1 chrUn_NW_023329672v1 # 1 chrUn_NW_023329671v1 # 1 chrUn_NW_023329670v1 # 1 chrUn_NW_023329669v1 # 1 chrUn_NW_023329668v1 # 1 chrUn_NW_023329667v1 # 1 chrM # verify this AGP file functions correctly: checkAgpAndFa canFam6.fake.agp test.2bit 2>&1 | tail -4 # no longer need these temporary 2bit files rm test.2bit refseq.2bit refseq.2bit refseq.gap.bed fake.agp # Reset the AGP specification in canFam6.config.ra agpFiles /hive/data/genomes/canFam6/ucsc/canFam6.fake.agp ############################################################################# # Initial database build (DONE - 2021-05-12 - Hiram) # verify sequence and AGP are OK: cd /hive/data/genomes/canFam6 time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=agp canFam6.config.ra) > agp.log 2>&1 # real 1m56.533s # then finish it off: time (makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db canFam6.config.ra) > db.log 2>&1 # real 12m7.732s # check in the trackDb files created in TemporaryTrackDbCheckout/ # and add canFam6 to trackDb/makefile refs #27546 # fixing up the images reference to canFam6.jpg # temporary symlink until masked sequence is available cd /hive/data/genomes/canFam6 ln -s `pwd`/canFam6.unmasked.2bit /gbdb/canFam6/canFam6.2bit ############################################################################# # verify gap table vs NCBI gap file (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/gap cd /hive/data/genomes/canFam6/bed/gap zgrep -v "^#" ../../refseq/G*_gaps.txt.gz \ | awk '{printf "%s\t%d\t%d\t%s_%s\n", $1,$2-1,$3,$5,$6}' \ | sort -k1,1 -k2,2n > refseq.gap.bed # type survey: cut -f4 *.bed | sort | uniq -c # 1015 within_scaffold_paired-ends # how much defined by NCBI: awk '{print $3-$2}' *.bed | ave stdin | grep -w total # total 58839.000000 # how much in the gap table: hgsql -e 'select * from gap;' canFam6 | awk '{print $4-$3}' \ | ave stdin | grep -w total # total 58852.000000 # an extra 13 marked in the UCSC AGP file ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/cpgIslandsUnmasked cd /hive/data/genomes/canFam6/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/canFam6/canFam6.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku canFam6) > do.log 2>&1 # real 3m30.591s cat fb.canFam6.cpgIslandExtUnmasked.txt # 56535294 bases of 2481941580 (2.278%) in intersection ############################################################################# # cytoBandIdeo - (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/cytoBand cd /hive/data/genomes/canFam6/bed/cytoBand makeCytoBandIdeo.csh canFam6 ############################################################################# # run up idKeys files for chromAlias/ncbiRefSeq (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/idKeys cd /hive/data/genomes/canFam6/bed/idKeys time (doIdKeys.pl \ -twoBit=/hive/data/genomes/canFam6/canFam6.unmasked.2bit \ -buildDir=`pwd` canFam6) > do.log 2>&1 & # real 0m32.977s cat canFam6.keySignature.txt # efdede185d06fdca97de518811962150 ############################################################################# # gapOverlap (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/gapOverlap cd /hive/data/genomes/canFam6/bed/gapOverlap time (doGapOverlap.pl \ -twoBit=/hive/data/genomes/canFam6/canFam6.unmasked.2bit canFam6 ) \ > do.log 2>&1 & # real 19m39.531s # there are none, nothing to load # cat fb.canFam6.gapOverlap.txt # xxx bases of yyy (0.001%) in intersection ############################################################################# # tandemDups (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/tandemDups cd /hive/data/genomes/canFam6/bed/tandemDups time (~/kent/src/hg/utils/automation/doTandemDup.pl \ -twoBit=/hive/data/genomes/canFam6/canFam6.unmasked.2bit canFam6) \ > do.log 2>&1 & # real 172m17.400s cat fb.canFam6.tandemDups.txt # 38991000 bases of 2312802198 (1.686%) in intersection bigBedInfo canFam6.tandemDups.bb | sed -e 's/^/# /;' # version: 4 # fieldCount: 13 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 0 # itemCount: 657,594 # primaryDataSize: 17,541,691 # primaryIndexSize: 53,796 # zoomLevels: 8 # chromCount: 84 # basesCovered: 1,396,340,438 # meanDepth (of bases covered): 4.667856 # minDepth: 1.000000 # maxDepth: 251.000000 # std of depth: 8.938384 ######################################################################### # ucscToINSDC and ucscToRefSeq table/track (DONE - 2021-05-13 - Hiram) # construct idKeys for the genbank sequence mkdir /hive/data/genomes/canFam6/refseq/idKeys cd /hive/data/genomes/canFam6/refseq/idKeys faToTwoBit ../GCF_*a_genomic.fna.gz canFam6.refseq.2bit time (doIdKeys.pl -buildDir=`pwd` \ -twoBit=`pwd`/canFam6.refseq.2bit refseqCanFam6) > do.log 2>&1 & # real 0m33.413s cat refseqCanFam6.keySignature.txt # efdede185d06fdca97de518811962150 mkdir -p /hive/data/genomes/canFam6/genbank/idKeys cd /hive/data/genomes/canFam6/genbank/idKeys ln -s /hive/data/outside/ncbi/genomes/GCA/000/002/285/GCA_000002285.4_Dog10K_Boxer_Tasha/GCA_000002285.4_Dog10K_Boxer_Tasha_genomic.fna.gz . faToTwoBit GCA*.fna.gz canFam6.genbank.2bit time (doIdKeys.pl -buildDir=`pwd` \ -twoBit=`pwd`/canFam6.genbank.2bit genbankCanFam6) > do.log 2>&1 & # real 0m32.596s cat genbankCanFam6.keySignature.txt # ec6e932a9b4aa21e9603895931320f13 mkdir /hive/data/genomes/canFam6/bed/chromAlias cd /hive/data/genomes/canFam6/bed/chromAlias join -t$'\t' ../idKeys/canFam6.idKeys.txt \ ../../genbank/idKeys/genbankCanFam6.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToINSDC.bed join -t$'\t' ../idKeys/canFam6.idKeys.txt \ ../../refseq/idKeys/refseqCanFam6.idKeys.txt | cut -f2- \ | sort -k1,1 | join -t$'\t' <(sort -k1,1 ../../chrom.sizes) - \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' \ | sort -k1,1 -k2,2n > ucscToRefSeq.bed # lookup the genbank accession for chrM NC_002008.4 -> U96639.2 # add it to the INSDC names: printf "chrM\t0\t16727\tU96639.2\n" >> ucscToINSDC.bed # should be same line counts throughout: wc -l * ../../chrom.sizes # 147 ucscToINSDC.bed # 147 ucscToRefSeq.bed # 147 ../../chrom.sizes export chrSize=`cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize # 20 # use the $chrSize in this sed sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab canFam6 ucscToINSDC stdin ucscToINSDC.bed export chrSize=`cut -f1 ucscToRefSeq.bed | awk '{print length($0)}' | sort -n | tail -1` echo $chrSize sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' \ | hgLoadSqlTab canFam6 ucscToRefSeq stdin ucscToRefSeq.bed # should be quiet for all OK checkTableCoords canFam6 ucscToINSDC checkTableCoords canFam6 ucscToRefSeq # should cover %100 entirely: featureBits -countGaps canFam6 ucscToINSDC # 2312802198 bases of 2312802198 (100.000%) in intersection featureBits -countGaps canFam6 ucscToRefSeq # 2312802198 bases of 2312802198 (100.000%) in intersection ######################################################################### # add chromAlias table (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/chromAlias cd /hive/data/genomes/canFam6/bed/chromAlias grep -v "^#" ../../refseq/G*_assembly_report.txt \ | awk '{printf "%s\t%s\n", $5,$1}' | sort > ncbi.genbank.txt grep -v "^#" ../../refseq/G*_assembly_report.txt \ | awk '{printf "%s\t%s\n", $7,$1}' | sort > ncbi.refseq.txt hgsql -N -e 'select chrom,name from ucscToINSDC;' canFam6 \ | sort -k1,1 > ucsc.genbank.tab hgsql -N -e 'select chrom,name from ucscToRefSeq;' canFam6 \ | sort -k1,1 > ucsc.refseq.tab # the awk removes lines where the UCSC name is identical to the NCBI name join -t$'\t' -1 2 <(sort -k2,2 ucsc.refseq.tab) ncbi.refseq.txt \ | cut -f2-3 | awk '$1 != $2' | sort > ucsc.assembly.tab wc -l * ../../chrom.sizes # 147 ncbi.genbank.txt # 147 ncbi.refseq.txt # 147 ucsc.assembly.tab # 147 ucsc.genbank.tab # 147 ucsc.refseq.tab # 147 ../../chrom.sizes # assembly counts are smaller since equivalence has been eliminated ~/kent/src/hg/utils/automation/chromAlias.pl ucsc.*.tab \ > canFam6.chromAlias.tab for t in refseq genbank assembly do c0=`cat ucsc.$t.tab | wc -l` c1=`grep $t canFam6.chromAlias.tab | wc -l` ok="OK" if [ "$c0" -ne "$c1" ]; then ok="ERROR" fi printf "# checking $t: $c0 =? $c1 $ok\n" done # checking refseq: 147 =? 147 OK # checking genbank: 147 =? 147 OK # checking assembly: 147 =? 147 OK # verify chrM is here properly: grep chrM canFam6.chromAlias.tab # MT chrM assembly # NC_002008.4 chrM refseq # U96639.2 chrM genbank hgLoadSqlTab canFam6 chromAlias ~/kent/src/hg/lib/chromAlias.sql \ canFam6.chromAlias.tab ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2021-05-13 - Hiram) cd ~/kent/src/hg/makeDb/trackDb/dog/canFam6 # preview prefixes and suffixes: hgsql -N -e "select frag from gold;" canFam6 \ | sed -e 's/[0-9_.]\+//;' | sort | uniq -c 1174 AAEX 1 NC # implies a rule: '[AN][AC][EX0-9_]+(\.[0-9_]+)?' # verify this rule will find them all and eliminate them all: hgsql -N -e "select frag from gold;" canFam6 | wc -l # 1175 hgsql -N -e "select frag from gold;" canFam6 \ | egrep -e '[AN][AC][EX0-9_]+(\.[0-9_]+)?' | wc -l # 1175 hgsql -N -e "select frag from gold;" canFam6 \ | egrep -v -e '[AN][AC][EX0-9_]+(\.[0-9_]+)?' | wc -l # 0 # hence, add to trackDb/rhesus/canFam6/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][AC][EX0-9_]+(\.[0-9_]+)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # verify searches work in the position box git commit -m 'adding search rule for gold/assembly track refs #27546' \ trackDb.ra ########################################################################## # running repeat masker (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/repeatMasker cd /hive/data/genomes/canFam6/bed/repeatMasker time (doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku canFam6) > do.log 2>&1 # real 294m16.121s cat faSize.rmsk.txt # 2312802198 bases (58852 N's 2312743346 real 1340725791 upper # 972017555 lower) in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (chrUn_NW_023329752v1) # max 122014068 (chr1) median 7637 # %42.03 masked total, %42.03 masked real egrep -i "versi|relea" do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ # CC Dfam_Consensus RELEASE 20181026; * # CC RepBase RELEASE 20181026; sed -e 's/^/# /;' versionInfo.txt # The repeat files provided for this assembly were generated using RepeatMasker. # Smit, AFA, Hubley, R & Green, P., # RepeatMasker Open-4.0. # 1996-2010 . # # VERSION: # RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ # Search Engine: Crossmatch [ 1.090518 ] # Master RepeatMasker Database: /hive/data/staging/data/RepeatMasker181121/Libraries/RepeatMaskerLib.embl ( Complete Database: dc20181026-rb20181026 ) # # # RepeatMasker version development-$Id: RepeatMasker,v 1.332 2017/04/17 19:01:11 rhubley Exp $ # CC Dfam_Consensus RELEASE 20181026; * # CC RepBase RELEASE 20181026; * # # RepeatMasker engine: -engine crossmatch -s # # RepeatMasker library options: -species 'Canis lupus familiaris' # # PARAMETERS: # /hive/data/staging/data/RepeatMasker/RepeatMasker -engine crossmatch -s -align -species 'Canis lupus familiaris' time featureBits -countGaps canFam6 rmsk # 972021595 bases of 2312802198 (42.028%) in intersection # real 0m34.798s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's # faster way to get the same result on high contig count assemblies: time hgsql -N -e 'select genoName,genoStart,genoEnd from rmsk;' canFam6 \ | bedSingleCover.pl stdin | ave -col=4 stdin | grep "^total" # total 972021595.000000 # real 0m21.176s ########################################################################## # running simple repeat (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/simpleRepeat cd /hive/data/genomes/canFam6/bed/simpleRepeat time (doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ -trf409=6 canFam6) > do.log 2>&1 # real 73m21.349s cat fb.simpleRepeat # 47306497 bases of 2312743346 (2.045%) in intersection cd /hive/data/genomes/canFam6 # if using the Window Masker result: cd /hive/data/genomes/canFam6 # twoBitMask bed/windowMasker/canFam6.cleanWMSdust.2bit \ # -add bed/simpleRepeat/trfMask.bed canFam6.2bit # you can safely ignore the warning about fields >= 13 # add to rmsk after it is done: twoBitMask canFam6.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed canFam6.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa canFam6.2bit stdout | faSize stdin > faSize.canFam6.2bit.txt cat faSize.canFam6.2bit.txt # 2312802198 bases (58852 N's 2312743346 real 1338850957 upper # 973892389 lower) in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (chrUn_NW_023329752v1) # max 122014068 (chr1) median 7637 # %42.11 masked total, %42.11 masked real rm /gbdb/canFam6/canFam6.2bit ln -s `pwd`/canFam6.2bit /gbdb/canFam6/canFam6.2bit ######################################################################### # CREATE MICROSAT TRACK (DONE - 2021-05-12 - Hiram) ssh hgwdev mkdir /cluster/data/canFam6/bed/microsat cd /cluster/data/canFam6/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed canFam6 microsat microsat.bed # Read 56108 elements of size 4 from microsat.bed ########################################################################## ## WINDOWMASKER (DONE - 2021-05-12 - Hiram) mkdir /hive/data/genomes/canFam6/bed/windowMasker cd /hive/data/genomes/canFam6/bed/windowMasker time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev canFam6) > do.log 2>&1 # real 89m33.579s # Masking statistics cat faSize.canFam6.cleanWMSdust.txt # 2312802198 bases (58852 N's 2312743346 real 1546969439 upper # 765773907 lower) in 147 sequences in 1 files # Total size: mean 15733348.3 sd 28607349.9 min 551 (chrUn_NW_023329752v1) # max 122014068 (chr1) median 7637 # %33.11 masked total, %33.11 masked real featureBits -countGaps canFam6 rmsk windowmaskerSdust \ 2> fb.canFam6.rmsk.windowmaskerSdust.txt cat fb.canFam6.rmsk.windowmaskerSdust.txt # 517620731 bases of 2312802198 (22.381%) in intersection time (doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -continue=cleanup -dbHost=hgwdev canFam6) > cleanup.log 2>&1 ########################################################################## # cpgIslands - (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/cpgIslands cd /hive/data/genomes/canFam6/bed/cpgIslands time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku canFam6) > do.log 2>&1 # real 2m52.170s cat fb.canFam6.cpgIslandExt.txt # 44591675 bases of 2312743346 (1.928%) in intersection ############################################################################## # genscan - (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/genscan cd /hive/data/genomes/canFam6/bed/genscan time (doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku canFam6) > do.log 2>&1 # real 48m34.968s sed -e 's/^/ # /;' fb.canFam6.genscan.txt # 54159469 bases of 2312743346 (2.342%) in intersection sed -e 's/^/ # /;' fb.canFam6.genscanSubopt.txt # 46763127 bases of 2312743346 (2.022%) in intersection ######################################################################### # Create kluster run files (DONE - 2021-05-13 - Hiram) # numerator is canFam6 gapless bases "real" as reported by: featureBits -noRandom -noHap canFam6 gap # 58852 bases of 2310615395 (0.003%) in intersection # ^^^ # denominator is hg19 gapless bases as reported by: # featureBits -noRandom -noHap hg19 gap # 234344806 bases of 2861349177 (8.190%) in intersection # 1024 is threshold used for human -repMatch: calc \( 2310615395 / 2861349177 \) \* 1024 # ( 2310615395 / 2861349177 ) * 1024 = 826.907175 # ==> use -repMatch=800 according to size scaled down from 1024 for human. # and rounded down to nearest 50 cd /hive/data/genomes/canFam6 time blat canFam6.2bit \ /dev/null /dev/null -tileSize=11 -makeOoc=jkStuff/canFam6.11.ooc \ -repMatch=800 # Wrote 28074 overused 11-mers to jkStuff/canFam6.11.ooc # real 0m24.061s # canFam5 at repMatch=800: # Wrote 28510 overused 11-mers to jkStuff/canFam5.11.ooc # canFam4 at repMatch=800: # Wrote 34718 overused 11-mers to jkStuff/canFam4.11.ooc # canFam3 at repMatch=900: # Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc # real 1m11.629s # there are no non-bridged gaps hgsql -N \ -e 'select * from gap where bridge="no" order by size;' canFam6 hgsql -N -e 'select size from gap where bridge="no" order by size;' \ canFam6 | sort | uniq -c | sort -k2,2n | sed -e 's/^/# /;' # survey gap sizes: hgsql -N -e 'select size from gap where bridge="yes" order by size;' \ canFam6 | ave stdin | sed -e 's/^/# /;' # Q1 25.000000 # median 25.000000 # Q3 100.000000 # average 57.249027 # min 1.000000 # max 100.000000 # count 1028 # total 58852.000000 # standard deviation 37.486982 # and survey these gaps: hgsql -N -e \ 'select size from gap where bridge="yes" order by size;' canFam6 \ | sort | uniq -c | sed -e 's/^/# /;' # 13 1 # 4 10 # 444 100 # 1 19 # 556 25 # 1 31 # 1 34 # 1 38 # 1 40 # 1 50 # 1 54 # 1 56 # 1 57 # 2 60 # using ordinary gaps to make a lift file # with minimum gap size at 100 gapToLift -allowBridged -verbose=2 -minGap=100 canFam6 \ jkStuff/canFam6.gaps.lft -bedFile=jkStuff/canFam6.gaps.bed wc -l jkStuff/*gaps* # 591 jkStuff/canFam6.gaps.bed # 591 jkStuff/canFam6.gaps.lft # to see the gaps used: bedInvert.pl chrom.sizes jkStuff/canFam6.gaps.bed | less # and their sizes: bedInvert.pl chrom.sizes jkStuff/canFam6.gaps.bed \ | cut -f4 | sort -n | uniq -c | less # 444 100 ######################################################################## # lastz/chain/net swap human/hg38 (TBD - 2020-07-29 - Hiram) # original alignment cd /hive/data/genomes/hg38/bed/lastzCanFam6.2020-07-29 cat fb.hg38.chainCanFam6Link.txt # 1545648756 bases of 3110768607 (49.687%) in intersection cat fb.hg38.chainSynCanFam6Link.txt # 1484758745 bases of 3110768607 (47.730%) in intersection cat fb.hg38.chainRBest.CanFam6.txt # 1422619513 bases of 3110768607 (45.732%) in intersection # and for the swap: mkdir /hive/data/genomes/canFam6/bed/blastz.hg38.swap cd /hive/data/genomes/canFam6/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzCanFam6.2020-07-29/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 78m37.078s cat fb.canFam6.chainHg38Link.txt # 1460025525 bases of 2337131234 (62.471%) in intersection cat fb.canFam6.chainSynHg38Link.txt # 1423305734 bases of 2337131234 (60.900%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ canFam6 hg38) > rbest.log 2>&1 & # real 255m9.076s cat fb.canFam6.chainRBest.Hg38.txt # 1422612399 bases of 2337131234 (60.870%) in intersection ############################################################################ # lastz/chain/net swap mouse/mm10 (TBD - 2020-07-29 - Hiram) # original alignment cd /hive/data/genomes/mm10/bed/lastzCanFam6.2020-07-29 cat fb.mm10.chainCanFam6Link.txt # 776486006 bases of 2652783500 (29.271%) in intersection cat fb.mm10.chainSynCanFam6Link.txt # 735561772 bases of 2652783500 (27.728%) in intersection cat fb.mm10.chainRBest.CanFam6.txt # 740117947 bases of 2652783500 (27.900%) in intersection mkdir /hive/data/genomes/canFam6/bed/blastz.mm10.swap cd /hive/data/genomes/canFam6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCanFam6.2020-07-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 44m9.935s cat fb.canFam6.chainMm10Link.txt # 759821061 bases of 2337131234 (32.511%) in intersection cat fb.canFam6.chainSynMm10Link.txt # 731350605 bases of 2337131234 (31.293%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev canFam6 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 162m30.634s cat fb.canFam6.chainRBest.Mm10.txt # 739177732 bases of 2337131234 (31.628%) in intersection ############################################################################ # lastz/chain/net swap mouse/mm39 (TBD - 2020-08-17 - Hiram) # original alignment cd /hive/data/genomes/mm39/bed/lastzCanFam6.2020-08-17 cat fb.mm39.chainCanFam6Link.txt # 778327929 bases of 2654624157 (29.320%) in intersection cat fb.mm39.chainSynCanFam6Link.txt # 735515331 bases of 2654624157 (27.707%) in intersection cat fb.mm39.chainRBest.CanFam6.txt # 740738480 bases of 2654624157 (27.904%) in intersection mkdir /hive/data/genomes/canFam6/bed/blastz.mm39.swap cd /hive/data/genomes/canFam6/bed/blastz.mm39.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm39/bed/lastzCanFam6.2020-08-17/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 44m12.732s cat fb.canFam6.chainMm39Link.txt # 762233776 bases of 2337131234 (32.614%) in intersection cat fb.canFam6.chainSynMm39Link.txt # 731337903 bases of 2337131234 (31.292%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev canFam6 mm39 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 174m14.398s cat fb.canFam6.chainRBest.Mm39.txt # 739648625 bases of 2337131234 (31.648%) in intersection ############################################################################## # GENBANK AUTO UPDATE (DONE - 2021-05-13 - Hiram) ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # /cluster/data/genbank/data/organism.lst shows: # organism mrnaCnt estCnt refSeqCnt # Canis latrans 2 0 0 # Canis lupus 36 0 0 # Canis lupus familiaris 3371 382652 1723 # Canis lupus laniger 2 0 0 # Canis lupus lupus 2 0 0 # Canis mesomelas 1 0 0 # Canis sp. 45 0 0 # the latrans is the Coyota, the mesomelas # is the Black-backed jackal from Africa and the langier is the Tibetan wolf # lupus lupus is the Eurasian wolf # edit etc/genbank.conf to add canFam6 just after canFam5 # canFam6 (GCF_000002285.5 Dog10K Boxer Tasha) canFam6.serverGenome = /hive/data/genomes/canFam6/canFam6.2bit canFam6.ooc = /hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc canFam6.lift = /hive/data/genomes/canFam6/jkStuff/canFam6.gaps.lft canFam6.align.unplacedChroms = chrUn_* canFam6.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} canFam6.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} canFam6.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} canFam6.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} canFam6.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} canFam6.refseq.mrna.native.load = yes canFam6.refseq.mrna.xeno.load = yes # DO NOT NEED genbank.mrna.xeno except for human, mouse canFam6.genbank.mrna.xeno.load = yes canFam6.downloadDir = canFam6 canFam6.upstreamGeneTbl = refGene canFam6.perChromTables = no # verify the files specified exist before checking in the file: grep ^canFam6 etc/genbank.conf | grep hive | awk '{print $NF}' | xargs ls -og # -rw-rw-r-- 1 607528981 May 12 22:06 /hive/data/genomes/canFam6/canFam6.2bit # -rw-rw-r-- 1 112304 May 13 10:40 /hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc # -rw-rw-r-- 1 24204 May 13 10:48 /hive/data/genomes/canFam6/jkStuff/canFam6.gaps.lft git commit -m "Added canFam6 dog; refs #27546" etc/genbank.conf git push # update /cluster/data/genbank/: make etc-update # enable daily alignment and update of hgwdev cd ~/kent/src/hg/makeDb/genbank git pull # add canFam6 to: # etc/hgwdev.dbs etc/align.dbs git commit -m "Added canFam6 - dog refs #27546" etc/hgwdev.dbs etc/align.dbs git push make etc-update # Notify Chris Lee this is ready to go. Magic will happen. ############################################################################# # augustus gene track (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/augustus cd /hive/data/genomes/canFam6/bed/augustus time (doAugustus.pl -buildDir=`pwd` -bigClusterHub=ku \ -species=human -dbHost=hgwdev \ -workhorse=hgwdev canFam6) > do.log 2>&1 # real 114m7.669s sed -e 's/^/ # /;' fb.canFam6.augustusGene.txt # 47489256 bases of 2312743346 (2.053%) in intersection ######################################################################### # ncbiRefSeq (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/ncbiRefSeq cd /hive/data/genomes/canFam6/bed/ncbiRefSeq # running step wise just to be careful time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ GCF_000002285.5_Dog10K_Boxer_Tasha canFam6) > download.log 2>&1 # real 2m5.429s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=process -bigClusterHub=ku -dbHost=hgwdev \ -stop=process -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ GCF_000002285.5_Dog10K_Boxer_Tasha canFam6) > process.log 2>&1 # real 3m31.265s time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -continue=load -bigClusterHub=ku -dbHost=hgwdev \ -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ GCF_000002285.5_Dog10K_Boxer_Tasha canFam6) > load.log 2>&1 # real 0m47.905s cat fb.ncbiRefSeq.canFam6.txt # 88916188 bases of 2312743346 (3.845%) in intersection # add: include ../../refSeqComposite.ra alpha # to the dog/canFam6/trackDb.ra to turn on the track in the browser featureBits -enrichment canFam6 refGene ncbiRefSeq # refGene 0.138%, ncbiRefSeq 3.845%, both 0.136%, cover 99.15%, enrich 25.79x featureBits -enrichment canFam6 ncbiRefSeq refGene # ncbiRefSeq 3.845%, refGene 0.138%, both 0.136%, cover 3.55%, enrich 25.79x featureBits -enrichment canFam6 ncbiRefSeqCurated refGene # ncbiRefSeqCurated 0.152%, refGene 0.138%, both 0.133%, cover 87.51%, enrich 636.15x featureBits -enrichment canFam6 refGene ncbiRefSeqCurated # refGene 0.138%, ncbiRefSeqCurated 0.152%, both 0.133%, cover 96.65%, enrich 636.15x ######################################################################### # LIFTOVER TO canFam5 (DONE - 2021-05-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam6/bed/blat.canFam5.2021-05-13 cd /hive/data/genomes/canFam6/bed/blat.canFam5.2021-05-13 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam5 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam5) > doLiftOverToCanFam5.log 2>&1 # real 179m37.613s # see if the liftOver menus function in the browser from canFam6 to canFam5 ######################################################################### # LIFTOVER TO canFam4 (DONE - 2021-05-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam6/bed/blat.canFam4.2021-05-13 cd /hive/data/genomes/canFam6/bed/blat.canFam4.2021-05-13 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam4 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam4) > doLiftOverToCanFam4.log 2>&1 # real 960m7.335s # see if the liftOver menus function in the browser from canFam6 to canFam4 ######################################################################### # LIFTOVER TO canFam3 (DONE - 2021-05-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam6/bed/blat.canFam3.2021-05-13 cd /hive/data/genomes/canFam6/bed/blat.canFam3.2021-05-13 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam3 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ canFam6 canFam3) > doLiftOverToCanFam3.log 2>&1 # real 2695m19.425s # see if the liftOver menus function in the browser from canFam6 to canFam3 ######################################################################### # BLATSERVERS ENTRY (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/dynamicBlat cd /hive/data/genomes/canFam6/dynamicBlat time gfServer -trans index canFam6.trans.gfidx ../canFam6.2bit & # real 5m27.906s time gfServer -stepSize=5 index canFam6.untrans.gfidx ../canFam6.2bit # real 3m3.944s rsync -a -P ../canFam6.2bit qateam@dynablat-01:/scratch/hubs/canFam6/ rsync -a -P canFam6.untrans.gfidx qateam@dynablat-01:/scratch/hubs/canFam6/ rsync -a -P canFam6.trans.gfidx qateam@dynablat-01:/scratch/hubs/canFam6/ hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr, dynamic) \ VALUES ("canFam6", "dynablat-01", "4040", "1", "0", "1"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr, dynamic) \ VALUES ("canFam6", "dynablat-01", "4040", "0", "1", "1");' \ hgcentraltest # test it with some sequence, for example ACE2: >hg38_ACE2 MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQ NMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQQNGSSVLSEDKSKRLNTIL NTMSTIYSTGKVCNPDNPQECLLLEPGLNEIMANSLDYNERLWAWESWRSEVGKQLRPLY EEYVVLKNEMARANHYEDYGDYWRGDYEVNGVDGYDYSRGQLIEDVEHTFEEIKPLYEHL HAYVRAKLMNAYPSYISPIGCLPAHLLGDMWGRFWTNLYSLTVPFGQKPNIDVTDAMVDQ AWDAQRIFKEAEKFFVSVGLPNMTQGFWENSMLTDPGNVQKAVCHPTAWDLGKGDFRILM CTKVTMDDFLTAHHEMGHIQYDMAYAAQPFLLRNGANEGFHEAVGEIMSLSAATPKHLKS IGLLSPDFQEDNETEINFLLKQALTIVGTLPFTYMLEKWRWMVFKGEIPKDQWMKKWWEM KREIVGVVEPVPHDETYCDPASLFHVSNDYSFIRYYTRTLYQFQFQEALCQAAKHEGPLH KCDISNSTEAGQKLFNMLRLGKSEPWTLALENVVGAKNMNVRPLLNYFEPLFTWLKDQNK NSFVGWSTDWSPYADQSIKVRISLKSALGDKAYEWNDNEMYLFRSSVAYAMRQYFLKVKN QMILFGEEDVRVANLKPRISFNFFVTAPKNVSDIIPRTEVEKAIRMSRSRINDAFRLNDN SLEFLGIQPTLGPPNQPPVSIWLIVFGVVMGVIVVGIVILIFTGIRDRKKKNKARSGENP YASIDISKGENNPGFQNTDDVQTSF ############################################################################ ## reset default position to gene: ACE2 as found by blat of human protein ## (DONE - 2021-05-13 - Hiram) ssh hgwdev hgsql -e 'update dbDb set defaultPos="chrX:11706333-11735291" where name="canFam6";' hgcentraltest ############################################################################## # crispr whole genome (DONE - 2021-05-13 - Hiram) mkdir /hive/data/genomes/canFam6/bed/crisprAll cd /hive/data/genomes/canFam6/bed/crisprAll # the large shoulder argument will cause the entire genome to be scanned # this takes a while for a new genome to get the bwa indexing done time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 -stop=ranges \ canFam6 -tableName=crisprAll \ -fileServer=hgwdev \ -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev) > indexFa.log 2>&1 # real 58m27.340s # that command failed, finished the final command manually, it needed # the gene table on the python command time (~/kent/src/hg/utils/automation/doCrispr.pl -verbose=2 \ -continue=ranges canFam6 -tableName=crisprAll \ -fileServer=hgwdev \ -buildDir=`pwd` -smallClusterHub=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev) > ranges.log 2>&1 # real 8554m11.613s cat guides/run.time | sed -e 's/^/# /;' # Completed: 100 of 100 jobs # CPU time in finished jobs: 10613s 176.88m 2.95h 0.12d 0.000 y # IO & Wait Time: 318s 5.30m 0.09h 0.00d 0.000 y # Average job time: 109s 1.82m 0.03h 0.00d # Longest finished job: 269s 4.48m 0.07h 0.00d # Submission to last job: 272s 4.53m 0.08h 0.00d cat specScores/run.time | sed -e 's/^/# /;' # Completed: 3079460 of 3079460 jobs # CPU time in finished jobs: 246730604s 4112176.74m 68536.28h 2855.68d 7.824 y # IO & Wait Time: 5517084s 91951.39m 1532.52h 63.86d 0.175 y # Average job time: 82s 1.37m 0.02h 0.00d # Longest finished job: 160s 2.67m 0.04h 0.00d # Submission to last job: 276754s 4612.57m 76.88h 3.20d grep "Number of" ranges.log | grep Scores | grep "^#" # Number of specScores: 231915228 # Number of effScores: 253445228 cat effScores/run.time | sed -e 's/^/# /;' # Completed: 25340 of 25340 jobs # CPU time in finished jobs: 13149467s 219157.78m 3652.63h 152.19d 0.417 y # IO & Wait Time: 145678s 2427.97m 40.47h 1.69d 0.005 y # Average job time: 525s 8.74m 0.15h 0.01d # Longest finished job: 97897s 1631.62m 27.19h 1.13d # Submission to last job: 110740s 1845.67m 30.76h 1.28d cat offTargets/run.time | sed -e 's/^/# /;' # Completed: 153974 of 153974 jobs # CPU time in finished jobs: 1850764s 30846.07m 514.10h 21.42d 0.059 y # IO & Wait Time: 1927940s 32132.33m 535.54h 22.31d 0.061 y # Average job time: 25s 0.41m 0.01h 0.00d # Longest finished job: 64s 1.07m 0.02h 0.00d # Submission to last job: 3954s 65.90m 1.10h 0.05d ######################################################################### # all.joiner update, downloads and in pushQ - (WORKING - 2019-11-20 - Hiram) cd $HOME/kent/src/hg/makeDb/schema # verify all the business is done for release ~/kent/src/hg/utils/automation/verifyBrowser.pl canFam6 # 71 tables in database canFam6 - Dog, Canis lupus familiaris # verified 60 tables in database canFam6, 11 extra tables, 19 optional tables # Ensembl genes 5 optional tables # chainNetRBestHg38 3 optional tables # chainNetRBestMm10 3 optional tables # chainNetSynHg38 3 optional tables # chainNetSynMm10 3 optional tables # gapOverlap 1 optional tables # tandemDups 1 optional tables # 1 chainMm39 - extra table # 2 chainMm39Link - extra table # 3 chainRBestMm39 - extra table # 4 chainRBestMm39Link - extra table # . . . etc . . . # 8 crisprAllTargets - extra table # 9 netMm39 - extra table # 10 netRBestMm39 - extra table # 11 netSynMm39 - extra table # 13 genbank tables found # verified 28 required tables, 1 missing tables # 1 ucscToRefSeq - missing table # hg38 chainNet to canFam6 found 3 required tables # mm10 chainNet to canFam6 found 3 required tables # hg38 chainNet RBest and syntenic to canFam6 found 6 optional tables # mm10 chainNet RBest and syntenic to canFam6 found 3 optional tables # liftOver to previous versions: 2, from previous versions: 2 # blatServers: canFam6 blat1b 17907 0 1 canFam6 blat1b 17906 1 0 # fixup all.joiner until this is a clean output joinerCheck -database=canFam6 -tableCoverage all.joiner joinerCheck -database=canFam6 -times all.joiner joinerCheck -database=canFam6 -keys all.joiner # when clean, check in: git commit -m 'adding rules for canFam6 refs #27546' all.joiner git push # run up a 'make alpha' in hg/hgTables to get this all.joiner file # into the hgwdev/genome-test system cd /hive/data/genomes/canFam6 time (makeDownloads.pl canFam6) > downloads.log 2>&1 # real 16m39.750s # now ready for pushQ entry mkdir /hive/data/genomes/canFam6/pushQ cd /hive/data/genomes/canFam6/pushQ time ($HOME/kent/src/hg/utils/automation/makePushQSql.pl -redmineList canFam6) > canFam6.pushQ.sql 2> stderr.out XXX - running - Mon May 24 11:47:51 PDT 2021 # real 15m12.083s # remove the tandemDups and gapOverlap from the file list: sed -i -e "/tandemDups/d" redmine.canFam6.table.list sed -i -e "/Tandem Dups/d" redmine.canFam6.releaseLog.txt sed -i -e "/gapOverlap/d" redmine.canFam6.table.list sed -i -e "/Gap Overlaps/d" redmine.canFam6.releaseLog.txt # check for errors in stderr.out, some are OK, e.g.: # WARNING: canFam6 does not have seq # WARNING: canFam6 does not have extFile # verify the file list does correctly match to files cat redmine.canFam6.file.list | while read L do eval ls $L > /dev/null done # should be silent, missing files will show as errors # verify database tables, how many to expect: wc -l redmine.canFam6.table.list # 63 redmine.canFam6.table.list # how many actual: awk -F'.' '{printf "hgsql -N %s -e '"'"'show table status like \"%s\";'"'"'\n", $1, $2}' redmine.canFam6.table.list | sh | wc -l # 63 # would be a smaller number actual if some were missing # add the path names to the listing files in the redmine issue # in the three appropriate entry boxes: # /hive/data/genomes/canFam6/pushQ/redmine.canFam6.file.list # /hive/data/genomes/canFam6/pushQ/redmine.canFam6.releaseLog.txt # /hive/data/genomes/canFam6/pushQ/redmine.canFam6.table.list -######################################################################### +############################################################################## +# LIFTOVER TO canFam2 (DONE - 2022-12-07 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/canFam6/bed/blat.canFam2.2022-12-07 + cd /hive/data/genomes/canFam6/bed/blat.canFam2.2022-12-07 + doSameSpeciesLiftOver.pl -verbose=2 \ + -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/canFam6/canFam6.2bit \ + -targetSizes=/hive/data/genomes/canFam6/chrom.sizes \ + -query2Bit=/hive/data/genomes/canFam2/canFam2.2bit \ + -querySizes=/hive/data/genomes/canFam2/chrom.sizes \ + -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ + canFam6 canFam2 + time (doSameSpeciesLiftOver.pl -verbose=2 \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + -target2Bit=/hive/data/genomes/canFam6/canFam6.2bit \ + -targetSizes=/hive/data/genomes/canFam6/chrom.sizes \ + -query2Bit=/hive/data/genomes/canFam2/canFam2.2bit \ + -querySizes=/hive/data/genomes/canFam2/chrom.sizes \ + -ooc=/hive/data/genomes/canFam6/jkStuff/canFam6.11.ooc \ + canFam6 canFam2) > doLiftOverToCanFam2.log 2>&1 + # real 2179m29.114s + + # see if the liftOver menus function in the browser from canFam6 to canFam2 + +##############################################################################