3fd174ec070cdd1cd53b276e135c76c869859a4b gperez2 Mon Jun 12 00:19:18 2023 -0700 Making FANTOM5 hub into native track, refs #21605 diff --git src/hg/makeDb/doc/canFam3.txt src/hg/makeDb/doc/canFam3.txt index 6bc195b..0465f34 100644 --- src/hg/makeDb/doc/canFam3.txt +++ src/hg/makeDb/doc/canFam3.txt @@ -1,2053 +1,2074 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the canFam3 # Canis lupus familiaris genome: Nov. 2011 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00 # 7X coverage via a variety of methods # http://www.ncbi.nlm.nih.gov/genome/85 # http://www.ncbi.nlm.nih.gov/bioproject/13179 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX00 # http://www.ncbi.nlm.nih.gov/genome/assembly/317138/ # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAEX03 ############################################################################# # Fetch sequence from genbank (DONE - 2012-01-04 - Hiram) mkdir -p /hive/data/genomes/canFam3/genbank cd /hive/data/genomes/canFam3/genbank wget --timestamping -r --cut-dirs=6 --level=0 -nH -x \ --no-remove-listing -np \ "ftp://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Canis_lupus/CanFam3.1/*" # Downloaded: 293 files, 1.8G in 24m 51s (1.25 MB/s) # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 # lower) in 3267 sequences in 40 files # %0.00 masked total, %0.00 masked real ############################################################################# # process into UCSC naming scheme (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/ucsc cd /hive/data/genomes/canFam3/ucsc cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = ) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = ) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">unplaced.agp") or die "can not write to unplaced.agp"; while (my $line = ) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">unplaced.fa") or die "can not write to unplaced.fa"; while (my $line = ) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1\|.*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl ./toUcsc.pl ./unplaced.pl gzip *.fa *.agp # verify nothing lost in the translation, should be the same as above # except for the name translations faSize *.fa # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 lower) # in 3267 sequences in 40 files # %0.00 masked total, %0.00 masked real # 2410960148 bases (18261639 N's 2392698509 real 2392698509 upper 0 ############################################################################# # Initial browser build (DONE - 2012-01-05 - Hiram) cd /hive/data/genomes/canFam3 cat << '_EOF_' > canFam3.config.ra # Config parameters for makeGenomeDb.pl: db canFam3 clade vertebrate genomeCladePriority 20 scientificName Canis lupus familiaris commonName Dog assemblyDate Sep. 2011 assemblyLabel Broad CanFam3.1 (GCA_000002285.2) assemblyShortLabel Broad CanFam3.1 orderKey 226 mitoAcc NC_002008 fastaFiles /hive/data/genomes/canFam3/ucsc/*.fa.gz agpFiles /hive/data/genomes/canFam3/ucsc/*.agp.gz dbDbSpeciesDir dog taxId 9615 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp canFam3.config.ra > agp.log 2>&1 # less than two minutes # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db canFam3.config.ra > db.log 2>&1 # real 18m17.938s # verify the end of db.log indicates success ############################################################################# # running repeat masker (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/repeatMasker cd /hive/data/genomes/canFam3/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=memk canFam3 > do.log 2>&1 & # real 366m34.586s cat faSize.rmsk.txt # 2410976875 bases (18261639 N's 2392715236 real 1364929603 upper # 1027785633 lower) in 3268 sequences in 1 files # %42.63 masked total, %42.95 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker featureBits -countGaps canFam3 rmsk # 1027963174 bases of 2410976875 (42.637%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/simpleRepeat cd /hive/data/genomes/canFam3/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=memk \ canFam3 > do.log 2>&1 & # real 98m15.932s cat fb.simpleRepeat # 65887025 bases of 2402709449 (2.742%) in intersection # adding RepeatMasker to get completed masked sequence cd /hive/data/genomes/canFam3 twoBitMask canFam3.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed canFam3.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.2bit.txt cat faSize.canFam3.2bit.txt # 2410976875 bases (18261639 N's 2392715236 real 1363595724 upper # 1029119512 lower) in 3268 sequences in 1 files # %42.68 masked total, %43.01 masked real # reset the symlink: rm /gbdb/canFam3/canFam3.2bit ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit # what would it be like to include WindowMasker: zcat bed/windowMasker/cleanWMask.bed.gz \ | twoBitMask -type=.bed -add canFam3.2bit stdin canFam3.wm.trf.rmsk.2bit twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1119889172 upper # 1272826064 lower) in 3268 sequences in 1 files # %52.79 masked total, %53.20 masked real # WM would add 243 million bases masked: 1272826064-1029119512 = 243706552 ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-01-05 - Hiram) mkdir /hive/data/genomes/canFam3/bed/gap cd /hive/data/genomes/canFam3/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../canFam3.unmasked.2bit > findMotif.txt 2>&1 # real 0m35.232s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits canFam3 -not gap -bed=notGap.bed # 2402709449 bases of 2402709449 (100.000%) in intersection # real 0m18.120s time featureBits canFam3 allGaps.bed notGap.bed -bed=new.gaps.bed # 9994213 bases of 2402709449 (0.416%) in intersection # real 0m32.175s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1 # 94 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" canFam3 | sort -n | tail -1`; chomp $ix; open (FH,") { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 19473 featureBits -countGaps canFam3 other.bed # 9994213 bases of 2410976875 (0.415%) in intersection # verify no overlap: time featureBits -countGaps canFam3 gap other.bed # 0 bases of 2410976875 (0.000%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad canFam3 otherGap other.bed # Loaded 19473 elements of size 8 # real 0m29.669s # verify no errors before adding to the table: gapToLift -minGap=1 canFam3 nonBridged.before.lift \ -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 0 Jan 6 08:17 before.gapToLift.txt # starting with this many: hgsql -e "select count(*) from gap;" canFam3 # 4403 hgsql canFam3 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" canFam3 # 23876 # == 4403 + 19473 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 canFam3 nonBridged.lift -bedFile=nonBridged.bed # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c # 80 no # 23796 yes ########################################################################## ## WINDOWMASKER (DONE - 2011-09-08 - Hiram) mkdir /hive/data/genomes/canFam3/bed/windowMasker cd /hive/data/genomes/canFam3/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev canFam3 > do.log 2>&1 & # real 135m51.510s # Masking statistics twoBitToFa canFam3.wmsk.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1597393388 upper # 795321848 lower) in 3268 sequences in 1 files # %32.99 masked total, %33.24 masked real twoBitToFa canFam3.wmsk.sdust.2bit stdout | faSize stdin # 2410976875 bases (18261639 N's 2392715236 real 1582594665 upper # 810120571 lower) in 3268 sequences in 1 files # %33.60 masked total, %33.86 masked real hgLoadBed canFam3 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 13647111 elements of size 3 featureBits -countGaps canFam3 windowmaskerSdust # 828382210 bases of 2410976875 (34.359%) in intersection # eliminate the gaps from the masking featureBits canFam3 -not gap -bed=notGap.bed # 2392715236 bases of 2392715236 (100.000%) in intersection time nice -n +19 featureBits canFam3 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 810120571 bases of 2392715236 (33.858%) in intersection # real 2m1.844s # reload track to get it clean hgLoadBed canFam3 windowmaskerSdust cleanWMask.bed.gz # Loaded 13644959 elements of size 4 time featureBits -countGaps canFam3 windowmaskerSdust # 810120571 bases of 2410976875 (33.601%) in intersection # real 1m19.909s # how much overlap with repeat masker: featureBits -countGaps canFam3 rmsk windowmaskerSdust # 565424408 bases of 2410976875 (23.452%) in intersection # mask sequence with this clean result zcat cleanWMask.bed.gz \ | twoBitMask ../../canFam3.unmasked.2bit stdin \ -type=.bed canFam3.cleanWMSdust.2bit twoBitToFa canFam3.cleanWMSdust.2bit stdout | faSize stdin \ > canFam3.cleanWMSdust.faSize.txt cat canFam3.cleanWMSdust.faSize.txt # 2410976875 bases (18261639 N's 2392715236 real 1582594665 upper # 810120571 lower) in 3268 sequences in 1 files # %33.60 masked total, %33.86 masked real ######################################################################### # NOT - MASK SEQUENCE WITH WM+TRF (NOT - 2012-01-06 - Hiram) # not masking this genome with WM+TRF since RepeatMasker has # masked enough cd /hive/data/genomes/canFam3 twoBitMask -add bed/windowMasker/canFam3.cleanWMSdust.2bit \ bed/simpleRepeat/trfMask.bed canFam3.2bit # safe to ignore the warnings about BED file with >=13 fields twoBitToFa canFam3.2bit stdout | faSize stdin > faSize.canFam3.txt cat faSize.canFam3.txt # create symlink to gbdb ssh hgwdev rm /gbdb/canFam3/canFam3.2bit ln -s `pwd`/canFam3.2bit /gbdb/canFam3/canFam3.2bit # what happens with all masks: twoBitMask -add canFam3.2bit bed/repeatMasker/canFam3.sorted.fa.out \ canFam3.wm.trf.rmsk.2bit twoBitToFa canFam3.wm.trf.rmsk.2bit stdout | faSize stdin ######################################################################## # cpgIslands - (DONE - 2011-04-23 - Hiram) mkdir /hive/data/genomes/canFam3/bed/cpgIslands cd /hive/data/genomes/canFam3/bed/cpgIslands time doCpgIslands.pl canFam3 > do.log 2>&1 # real 5m8.626s cat fb.canFam3.cpgIslandExt.txt # 39980778 bases of 2392715236 (1.671%) in intersection ######################################################################### # genscan - (DONE - 2011-04-25 - Hiram) mkdir /hive/data/genomes/canFam3/bed/genscan cd /hive/data/genomes/canFam3/bed/genscan time doGenscan.pl canFam3 > do.log 2>&1 # real 56m58.015s cat fb.canFam3.genscan.txt # 56089579 bases of 2392715236 (2.344%) in intersection cat fb.canFam3.genscanSubopt.txt # 49232090 bases of 2392715236 (2.058%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-05-01 - Hiram) # Use -repMatch=900, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2392715236 / 2897316137 \) \* 1024 # ( 2392715236 / 2897316137 ) * 1024 = 845.658632 # round up to 900 (canFam2 used 852) cd /hive/data/genomes/canFam3 time blat canFam3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/canFam3.11.ooc -repMatch=900 # Wrote 24788 overused 11-mers to jkStuff/canFam3.11.ooc # real 1m11.629s # there are a few non-bridged gaps, make lift file for genbank hgsql -N -e "select bridge from gap;" canFam3 | sort | uniq -c # 80 no # 23796 yes cd /hive/data/genomes/canFam3/jkStuff gapToLift canFam3 canFam3.nonBridged.lift -bedFile=canFam3.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' canFam3.nonBridged.bed | sort -nr | head # 123773608 chrX 95534 123869142 chrX.01 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram) # examine the file: /cluster/data/genomes/genbank/data/genomes/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mus musculus 334577 4853663 26288 # to decide which "native" mrna or ests you want to specify in genbank.conf # of course, canFam3 has plenty of everything ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add canFam3 just after canFam2 and commit to GIT # canFam3 (dog) canFam3.serverGenome = /hive/data/genomes/canFam3/canFam3.2bit canFam3.clusterGenome = /hive/data/genomes/canFam3/canFam3.2bit canFam3.ooc = /hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc canFam3.lift = /hive/data/genomes/canFam3/jkStuff/canFam3.nonBridged.lift canFam3.align.unplacedChroms = chrUn canFam3.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} canFam3.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} canFam3.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} canFam3.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} canFam3.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} canFam3.refseq.mrna.native.load = yes canFam3.refseq.mrna.xeno.load = yes canFam3.genbank.mrna.xeno.load = yes canFam3.downloadDir = canFam3 canFam3.upstreamGeneTbl = ensGene # end of section added to etc/genbank.conf git commit -m "adding definition for canFam3" genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S mm10 # long running job managed in screen cd /cluster/data/genbank # rerun this with canFam3.perChromTables = no in the genbank.conf # 2012-05-24,25 time nice -n +19 ./bin/gbAlignStep -initial canFam3 & # logFile: var/build/logs/2012.05.24-15:31:48.canFam3.initalign.log # second time: about 32 minutes # first time: real 381m29.244s # load data/genomesbase when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad canFam3 & # logFile: var/dbload/hgwdev/logs/2012.05.25-12:52:27.dbload.log # real 125m30.185s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add canFam3 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added canFam3." etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # create pushQ entry (DONE - 2012-05-23 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=canFam3 -keys all.joiner mkdir /hive/data/genomes/canFam3/pushQ cd /hive/data/genomes/canFam3/pushQ makePushQSql.pl canFam3 > canFam3.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/canFam3/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/canFam3/wib/quality.wib # WARNING: hgwdev does not have /gbdb/canFam3/bbi/quality.bw # WARNING: canFam3 does not have seq # WARNING: canFam3 does not have extFile # which are no real problem # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # # put them in manually after loading the pushQ entry scp -p canFam3.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < canFam3.sql ######################################################################### # LASTZ Cow BosTau7 (DONE - 2012-06-23 - Chin) screen -S bosTau7CanFam3 mkdir /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 cd /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Cow bosTau7 SEQ2_DIR=/hive/data/genomes/bosTau7/bosTau7.2bit SEQ2_LEN=/hive/data/genomes/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=2000 BASE=/hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1528m6.142s cat fb.canFam3.chainBosTau7Link.txt # 1381966556 bases of 2392715236 (57.757%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau7.2012-06-23 lastz.bosTau7 # and the swap mkdir /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau7/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau7.2012-06-23/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 121m44.022s cat fb.bosTau7.chainCanFam3Link.txt # 1456104306 bases of 2804673174 (51.917%) in intersection cd /hive/data/genomes/bosTau7/bed ln -s blastz.canFam3.swap lastz.canFam3 ######################################################################### # LASTZ Cow BosTau6 (DONE - 2012-06-24 - Chin) screen -S bosTau6CanFam3 mkdir /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 cd /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dog vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Dog canFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Cow bosTau6 SEQ2_DIR=/hive/data/genomes/bosTau6/bosTau6.2bit SEQ2_LEN=/hive/data/genomes/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=2000 BASE=/hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1392m1.959s cat fb.canFam3.chainBosTau6Link.txt # 1387159926 bases of 2392715236 (57.974%) in intersection # Create link cd /hive/data/genomes/canFam3/bed ln -s lastzBosTau6.2012-06-24 lastz.bosTau6 # and the swap mkdir /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap cd /hive/data/genomes/bosTau6/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzBosTau6.2012-06-24/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 119m54.003s cat fb.bosTau6.chainCanFam3Link.txt # 1399687351 bases of 2649682029 (52.825%) in intersection cd /hive/data/genomes/bosTau6/bed ln -s blastz.canFam3.swap lastz.canFam3 ######################################################################### # swap LASTZ from Human hg19 (DONE - 2012-07-04 - Hiram) # the original alignment cd /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03 cat fb.hg19.chainCanFam3Link.txt # 1502192631 bases of 2897316137 (51.848%) in intersection # and for the swap mkdir /hive/data/genomes/canFam3/bed/blastz.hg19.swap cd /hive/data/genomes/canFam3/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzCanFam3.2012-07-03/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m14.464s cat fb.canFam3.chainHg19Link.txt # 1455183825 bases of 2392715236 (60.817%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################## # construct liftOver to canFam2 (DONE - 2012-11-27,29 - Hiram) screen -S canFam2 # manage this longish running job in a screen mkdir /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27 cd /hive/data/genomes/canFam3/bed/blat.canFam2.2012-11-27 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 > do.log 2>&1 # real 1752m51.425s # two jobs appear to be a problem, running manually on hgwdev in run.chain: ./job.csh part025.lst/canFam2.2bit:chrUn: chainRaw/part025.lst/canFam2.2bit:chrUn:.chain & ./job.csh part026.lst/canFam2.2bit:chrUn: chainRaw/part026.lst/canFam2.2bit:chrUn:.chain wait # real 54m24.882s # the problem is that those axtChain operations use a lot of memory, # for some reason on the kluster nodes, they just slow down and get # stuck at 2 Gb # continuing: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ -continue=net -dbHost=hgwdev -workhorse=hgwdev canFam3 canFam2 \ > net.log 2>&1 # missed the output in the net.log # real 117m45.220s # verify this file exists: og -L /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz # -rw-rw-r-- 1 583607 May 5 02:16 /gbdb/canFam3/liftOver/canFam3ToCanFam2.over.chain.gz # and try out the conversion on genome-test from canFam3 to canFam2 ############################################################################ # LASTZ Chimp PanTro4 (WORKING - 2013-04-30 - Hiram) mkdir /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30 cd /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30 cat << '_EOF_' > DEF # dog vs. chimp # TARGET: Dog CanFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro4 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1278m31.863s # no encodek available, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=swarm -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # real 11m42.544s # one chain job failed due to memory limits, finished manually, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=chainMerge `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 & # real 114m33.002s cat fb.canFam3.chainPanTro4Link.txt # 1435182107 bases of 2392715236 (59.981%) in intersection # running the swap - DONE - 2013-05-02 mkdir /hive/data/genomes/panTro4/bed/blastz.canFam3.swap cd /hive/data/genomes/panTro4/bed/blastz.canFam3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzPanTro4.2013-04-30/DEF \ -swap -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 113m28.428s cat fb.panTro4.chainCanFam3Link.txt # 1490574959 bases of 2902338967 (51.358%) in intersection ############################################################################ # create ucscToINSDC name mapping (DONE - 2013-08-16 - Hiram) mkdir /hive/data/genomes/canFam3/bed/ucscToINSDC cd /hive/data/genomes/canFam3/bed/ucscToINSDC # copying these scripts from the previous load and improving them # with each instance ./translateNames.sh ./verifyAll.sh ./join.sh # verify the track link to INSDC functions _EOF_ ############################################################################## # DBSNP B139 / SNP139 (DONE 1/24/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/139/dog cd /hive/data/outside/dbSNP/139/dog # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (dog_9615 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <& do.log & tail -f do.log # Some trial and error is always required to get the config.ra just right. #*** You must account for all 3310 contig_acc values in config.ra, #*** using the liftUp and/or ignoreDbSnpContigsFile settings (see -help output). #*** Check the auto-generated suggested.lft to see if it covers all #*** 3310 contigs; if it does, add 'liftUp suggested.lft' to config.ra. #*** Then run again with -continue=loadDbSnp . wc -l suggested.lft #3310 suggested.lft # ok, looks like the auto-generated suggested.lft will do the trick. cat >> config.ra <>& do.log & tail -f do.log # *** All done! ############################################################################## # FILTER SNP139 (DONE 1/24/14 angie) cd /hive/data/outside/dbSNP/139/dog zcat snp139.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 532567 #Common: 85 #Flagged: 0 #leftover: 3041103 # 85 SNPs that meet the threshold for Common is not enough to warrant a # Common SNPs track. Add only Mult: foreach f ({Mult}.bed.gz) mv $f snp139$f end # Load tables foreach subset (Mult) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ canFam3 snp139$subset -sqlTable=snp139.sql snp139$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (139) (DONE 2/27/14 angie) # Originally done 1/24/14, but Brian L found hgc crash caused by script bug... cd /hive/data/outside/dbSNP/139/dog # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | ~/kent/src/hg/utils/automation/fixNcbiFuncCoords.pl ncbiFuncInsertions.ctg.bed.gz \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiFuncAnnotationsFixed.txt wc -l ncbiFuncAnnotationsFixed.txt #49664 ncbiFuncAnnotationsFixed.txt # How many & what kinds of function types? cut -f 6 ncbiFuncAnnotationsFixed.txt \ | sort -n | uniq -c # 12275 3 (coding-synon) # 24817 8 (cds-reference) # 77 41 (nonsense) # 9018 42 (missense) # 11 43 (stop-loss) # 3461 44 (frameshift) # 5 45 (cds-indel) # 2/27/14: Re-ran from here after fixing bug in script: # Gather up multiple annotation lines into one line per {snp, gene, frame}: ~/kent/src/hg/utils/automation/collectNcbiFuncAnnotations.pl ncbiFuncAnnotationsFixed.txt \ | liftUp snp139CodingDbSnp.bed suggested.lft warn stdin hgLoadBed canFam3 snp139CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp139CodingDbSnp.bed #Read 24824 elements of size 11 from snp139CodingDbSnp.bed ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################## # GENEID GENE PREDICTIONS (DONE - 2015-07-31 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam3/bed/geneid cd /hive/data/genomes/canFam3/bed/geneid wget --timestamping \ http://genome.crg.es/genepredictions/C.familiaris/canFam3/geneid_v1.4/00README wget --timestamping \ http://genome.crg.es/genepredictions/C.familiaris/canFam3/geneid_v1.4/canFam3.geneid.gtf ldHgGene -gtf -genePredExt canFam3 geneid canFam3.geneid.gtf # Read 32342 transcripts in 261967 lines in 1 files # 32342 groups 1737 seqs 1 sources 3 feature types # 32342 gene predictions featureBits -enrichment canFam3 augustusGene:CDS geneid # augustusGene:CDS 1.360%, geneid 1.501%, both 1.082%, cover 79.59%, # enrich 53.04x ########################################################################## # LASTZ chicken galGal4 (DONE - 2015-09-25 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S canFam3GalGal4 mkdir /hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25 cd /hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25 printf "%s\n" \ '# dog vs chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: dog CanFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: chicken GalGal4 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25 TMPDIR=/dev/shm' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 247m1.571s cat fb.canFam3.chainGalGal4Link.txt # 73335538 bases of 2392715236 (3.065%) in intersection time (doRecipBest.pl -buildDir=`pwd` canFam3 galGal4) > rbest.log 2>&1 & # lost the end of rbest.log due to hive difficulties mkdir /hive/data/genomes/galGal4/bed/blastz.canFam3.swap cd /hive/data/genomes/galGal4/bed/blastz.canFam3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzGalGal4.2015-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 6m50.282s cat fb.galGal4.chainCanFam3Link.txt # 65384958 bases of 1032854810 (6.331%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s blastz.canFam3.swap lastz.canFam3 time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galGal4 canFam3) \ > rbest.log 2>&1 # continuing after hive difficulties and finishing the recipBest step # manually time (doRecipBest.pl -workhorse=hgwdev -continue=download -buildDir=`pwd` \ galGal4 canFam3) > download.log 2>&1 # real 0m4.400s ############################################################################## # LASTZ frog xenTro9 (DONE - 2017-04-05 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S canFam3XenTro9 mkdir /hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05 cd /hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05 printf '# dog vs frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: dog CanFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: frog XenTro9 SEQ2_DIR=/hive/data/genomes/xenTro9/xenTro9.2bit SEQ2_LEN=/hive/data/genomes/xenTro9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 661m0.702s # ku difficulties due to /dev/shm/ being full, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=cat -chainMinScore=5000 -chainLinearGap=loose) > cat.log 2>&1 # real 14m3.596s # chain trouble, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose) > chainMerge.log 2>&1 # real 11m28.081s cat fb.canFam3.chainXenTro9Link.txt # 73335538 bases of 2392715236 (3.065%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` canFam3 xenTro9) \ > rbest.log 2>&1 & # real 134m47.527s # and the swap mkdir /hive/data/genomes/xenTro9/bed/blastz.canFam3.swap cd /hive/data/genomes/xenTro9/bed/blastz.canFam3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzXenTro9.2017-04-05/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 11m49.432s cat fb.xenTro9.chainCanFam3Link.txt # 45357837 bases of 1369865365 (3.311%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenTro9 canFam3) \ > rbest.log 2>&1 # real 131m41.477s ############################################################################## # LIFTOVER TO canFam6 (DONE - 2021-05-17 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam3/bed/blat.canFam6.2021-05-17 cd /hive/data/genomes/canFam3/bed/blat.canFam6.2021-05-17 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam6 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam6) > doLiftOverToCanFam6.log 2>&1 # real 364m22.481s # see if the liftOver menus function in the browser from canFam3 to canFam6 ############################################################################## # LIFTOVER TO canFam5 (DONE - 2020-07-28 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam3/bed/blat.canFam5.2020-07-28 cd /hive/data/genomes/canFam3/bed/blat.canFam5.2020-07-28 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam5 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam5) > doLiftOverToCanFam5.log 2>&1 # real 316m50.048s # see if the liftOver menus function in the browser from canFam3 to canFam5 ############################################################################## # LIFTOVER TO canFam4 (DONE - 2020-04-02 - Hiram) ssh hgwdev mkdir /hive/data/genomes/canFam3/bed/blat.canFam4.2020-04-02 cd /hive/data/genomes/canFam3/bed/blat.canFam4.2020-04-02 doSameSpeciesLiftOver.pl -verbose=2 \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam4 time (doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/canFam3/jkStuff/canFam3.11.ooc \ canFam3 canFam4) > doLiftOverToCanFam4.log 2>&1 # real 1264m58.309s # see if the liftOver menus function in the browser from canFam3 to canFam4 ############################################################################## # LASTZ German shepard canFam4 (DONE - 2020-05-11 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S canFam3CanFam4 mkdir /hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11 cd /hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11 printf '# boxer vs german shepard BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz BLASTZ_M=254 # TARGET: boxer Tasha/canFam3 SEQ1_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ1_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=30 # QUERY: German shepard Mischka/canFam4 SEQ2_DIR=/hive/data/genomes/canFam4/canFam4.2bit SEQ2_LEN=/hive/data/genomes/canFam4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 cat fb.canFam3.chainCanFam4Link.txt # 2362925930 bases of 2392715236 (98.755%) in intersection cat fb.canFam3.chainSynCanFam4Link.txt # 2353029988 bases of 2392715236 (98.341%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ canFam3 canFam4) > rbest.log 2>&1 & # 2318530060 bases of 2392715236 (96.900%) in intersection # real 42m53.973s # and the swap mkdir /hive/data/genomes/canFam4/bed/blastz.canFam3.swap cd /hive/data/genomes/canFam4/bed/blastz.canFam3.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/canFam3/bed/lastzCanFam4.2020-05-11/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 237m57.356s cat fb.canFam4.chainCanFam3Link.txt # 2441363671 bases of 2481941580 (98.365%) in intersection cat fb.canFam4.chainSynCanFam3Link.txt # 2422035026 bases of 2481941580 (97.586%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ canFam4 canFam3) > rbest.log 2>&1 # real 46m50.771s cat fb.canFam4.chainRBest.CanFam3.txt # 2326711360 bases of 2481941580 (93.746%) in intersection ############################################################################## # canFam3 EVA SNPs (European Variation Archive) (DONE 2022-02-27 - b0b) # dbSNP no longer does non-human variants # new EVA release yesterday ! # canFam3.1 = GCA_000002285.2 # get files (EVA Release 3) cd /hive/data/outside/dogSnps/release3 wget https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz # what is .csi file???? # https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz.csi # previous: # 5655125 evaRsIDs # same number in this release # from their pages: # http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/README_rs_ids_counts.txt # Unique RS ID counts # GCA_000002285.2_current_ids.vcf.gz 5599185 # # fewer due to multiple mapping of rsIDs gunzip GCA_000002285.2_current_ids.vcf.gz # fix chromNames chr01 > chr1 cat GCA_000002285.2_current_ids.vcf | sed "s/chr0/chr/" > dogSNPs.vcf bgzip dogSNPs.vcf # vcfToBed requires a tabix tabix -p vcf dogSNPs.vcf.gz # make a bed file with 3 useful flags vcfToBed dogSNPs.vcf.gz dogSNPs3Flags -fields=VC,SID,RS_VALIDATED # VC=Variant Class, SID=Submitter ID, RS_VALIDATED=validated from dbSNP wc -l *bed # 5655126 dogSNPs3Flags.bed wc -l *vcf # 5655174 GCA_000002285.2_current_ids.vcf # sort file bedSort dogSNPs3Flags.bed dogSNPs3Flags.bed head -2 dogSNPs3Flags.bed #chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt FILTER VC SID RS_VALIDATED #chr1 111 112 rs850979046 0 . 111 112 0,0,0 A G . SO:0001483 BROAD_VGB_CANINE_PON_SNP_DISCOVERY # drop unnecessary fields: thickStart thickEnd itemRgb cat dogSNPs3Flags.bed \ | awk -F"\t" '{print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$10"\t"$11"\t"$13"\t"$14"\t"$15}' \ > evaSnps1.bed head evaSnps1.bed #chrom chromStart chromEnd name score strand ref alt VC SID RS_VALIDATED # chr1 111 112 rs850979046 0 . A G SO:0001483 BROAD_VGB_CANINE_PON_SNP_DISCOVERY # add "No" to last column where needed (RS_VALIDATED) cat evaSnps1.bed \ | awk '{if ($11 != "Yes") { print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\tNo"; } \ else { print $1"\t"$2"\t"$3"\t"$4"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11; }}' \ > evaSnps2.bed head evaSnps2.bed #chrom chromStart chromEnd name score strand ref alt VC SID No # chr1 111 112 rs850979046 0 . A G SO:0001483 BROAD_VGB_CANINE_PON_SNP_DISCOVERY No cat evaSnps2.bed | awk -F"\t" '{print $11}' | sort | uniq -c # 4644128 No # 1010998 Yes # replace Sequence Ontology "SO:" IDs with actual snp types cat evaSnps2.bed \ | sed "s/SO:0001483/substitution/" \ | sed "s/SO:0000159/deletion/" \ | sed "s/SO:0000667/insertion/" \ | sed "s/SO:0000705/tandemRepeat/" \ | sed "s/SO:0002007/multipleNucleotideSubstitution/" \ | sed "s/SO:1000032/delins/" > evaSnps3.bed cat evaSnps3.bed | awk '{print $9}' | sort | uniq -c | sort -nr # 4713243 substitution # 474379 deletion # 467490 insertion # 7 tandemRepeat # 5 delins # 1 multipleNucleotideSubstitution # 1 VC # still has header row (VC) head evaSnps3.bed #chrom chromStart chromEnd name score strand ref alt VC SID No # chr1 111 112 rs850979046 0 . A G substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY No # make bigBed bedToBigBed -tab -as=../evaSnp.as -type=bed6+5 -extraIndex=name evaSnps3.bed ../chromInfo.txt evaSnp.bb # error msg: # evaSnpsr.bed is not sorted at line 115. Please use "sort -k1,1 -k2,2n" or bedSort and try again. # ?? but it =looks= sorted there: # j114 chr1 6189 6189 rs851043138 0 . T TAG insertion BROAD_VGB_CANINE_PON_SNP_DISCOVERY No # 115 chr1 6188 6189 rs852166058 0 . T G substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY No # 116 chr1 6193 6194 rs851187136 0 . G A substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY No bedSort evaSnps3.bed evaSnps4.bed # trackDb/ra entry: cd kent/src/hg/makeDb/trackDb/dog/canFam3/trackDb.ra # track evaSnp # shortLabel EVA SNP Release 3 # longLabel Short Genetic Variants from European Variant Archive Release 3 # type bigBed 6 + # group varRep # visibility pack # bigDataUrl /gbdb/canFam3/bbi/evaSnp.bb # url https://www.ebi.ac.uk/eva/?variant&accessionID=$$ # adding search in trackDb.ra (thanks jonathan): # added the following to dog/canFam3/trackDb.ra # searchTable evaSnp # searchType bigBed # searchMethod exact # padding 50 # search works: bedToBigBed -tab -as=evaSnp.as -type=bed6+5 -extraIndex=name evaSnps4.bed chromInfo.txt evaSnp.bb # pass1 - making usageList (39 chroms): 1815 millis # pass2 - checking and writing primary data (5655125 records, 11 fields): 19207 millis # Sorting and writing extra index 0: 4299 millis bigBedInfo evaSnp.bb # version: 4 # fieldCount: 11 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 1 # itemCount: 5,655,125 # primaryDataSize: 78,986,590 # primaryIndexSize: 365,236 # zoomLevels: 10 # chromCount: 39 # basesCovered: 6,322,774 # meanDepth (of bases covered): 1.025209 # minDepth: 1.000000 # maxDepth: 14.000000 # std of depth: 0.217172 # copy to active /gbdb directory for track cp evaSnp.bb /cluster/data/canFam3/bed/evaSnp/evaSnp.bb # make gbdb symlink cd /gbdb/canFam3/bbi ln -s /cluster/data/canFam3/bed/evaSnp/evaSnp.bb evaSnp.bb # submitters cat evaSnps4.bed | awk '{print $10}' | sed -e "s/,/\n/g" | sort | uniq -c | sort -r 3492428 BROAD_VGB_CANINE_PON_SNP_DISCOVERY 2458565 BROAD_DBSNP.2005.2.4.16.57 697053 TIGR_1.0 234166 BROAD_VGB_LYMPHOMA_SNP_DISCOVERY 4202 DOG_GEN_LAB_1032011 94 AMOUCD_MAL_20+TERV_1 ... cat evaSnps4.bed | awk '{print $10}' | sed -e "s/,/\n/g" | sort | uniq -c | sort -r # 52 # 52 different submitters. most from a few places, mostly the Broad ############################################################################## # canFam3 EVA SNPs (European Variation Archive) (DONE 2022-03-21 - b0b) # NOTE: superceded by Lou's pipeline for consistency. # doc'd for multiple assemblies at: evaSnp3.txt # # adding VAI annotations # redoing the track after talking with Lou and standardinzing on data fields after VAI step # canFam3.1 = GCA_000002285.2 # get files (EVA Release 3) cd /hive/data/outside/dogSnps/release3 wget https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz # what is .csi file???? # https://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/GCA_000002285.2_current_ids.vcf.gz.csi # previous: # 5655125 evaRsIDs # same number in this release # from their pages: # http://ftp.ebi.ac.uk/pub/databases/eva/rs_releases/release_3/by_species/canis_lupus_familiaris/GCA_000002285.2/README_rs_ids_counts.txt # Unique RS ID counts # GCA_000002285.2_current_ids.vcf.gz 5599185 # # fewer due to multiple mapping of rsIDs gunzip GCA_000002285.2_current_ids.vcf.gz # fix chromNames chr01 > chr1 cat GCA_000002285.2_current_ids.vcf | sed "s/chr0/chr/" > dogSNPs.vcf bgzip dogSNPs.vcf # vcfToBed requires a tabix tabix -p vcf dogSNPs.vcf.gz # make a bed file with 2 useful flags to match fields agreed w Lou (for his mm39) vcfToBed dogSNPs.vcf.gz dogSNPs2Flags -fields=VC,SID # VC=Variant Class, SID=Submitter ID wc -l dogSNPs2F* # 5655126 dogSNPs2Flags.bed # from earlier: zcat GC*vcf.gz | wc -l # 5655174 # a few extra rows (48) in vcf due to header lines zcat GC*vcf.gz | grep "^#" | wc -l # 49 # accounts for the extra lines (bed has a header line) # sort file sort -k1,1 -k2,2n dogSNPs2Flags.bed > dogSNPs1.bed head dogSNPs1.bed #chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt FILTER VC SID #chr1 111 112 rs850979046 0 . 111 112 0,0,0 A G . SO:0001483 BROAD_VGB_CANINE_PON_SNP_DISCOVERY # what is FILTER FIELD ?? cat dogSNPs1.bed | awk '{print $12}' | sort | uniq -c | sort -nr # 5655125 . # 1 FILTER # will remove #------------- ----------- ---------- # process the SO: terms # also see stand-alone script soTest3.csh set soTerms1=( `cat soTerms.txt | awk '{print $1}'` ) set soTerms2=( `cat soTerms.txt | awk '{print $2}'` ) # echo "bedFile = $bedFile" # echo "soTerms1 = $soTerms1" # echo "soTerms2 = $soTerms2" set bedFile=dogSNPs1.bed # find out if there are any new SO: terms in bedFile # make list of all terms used in current file set bedTermList=`cat $bedFile | awk '{print $13}' | sort -u | grep -v VC` # echo "bedTermList = $bedTermList" set newTerms="" set newTermFlag=0 foreach term ( $bedTermList ) # echo $term grep $term soTerms.txt > /dev/null if ( $status ) then set newTerms=`echo $newTerms $term` set newTermFlag=1 endif end # echo "newTermFlag = $newTermFlag" # echo "newTerms = $newTerms" # if new terms, announce and exit if ( $newTermFlag == 1 ) then echo "\nfound term(s) not in soTerms.txt file: $newTerms " echo "look here to add new term(s) file:" foreach new ( $newTerms ) echo "http://www.sequenceontology.org/browser/current_release/term/$new" end echo "stopping\n" exit endif # build sed file to substitute SO: terms in single pass through $bedFile set outfile = temp$$ set numTerms=`echo $soTerms1 | wc -w` # echo "numTerms = $numTerms" set i=1 set command="" while ( $i <= $numTerms ) set command=`echo $command "sed "\'"s/"$soTerms1[$i]"/"$soTerms2[$i]"/'"" | \\n"` @ i = $i + 1 end # do something with terminal pipe (add cat) set command=`echo $command cat` echo $command > sedFile chmod 700 sedFile # use sedFile to substitute terms in big file cat $bedFile | sedFile > $outfile # rm -f sedFile mv $outfile dogSNPs2.bed cat dogSNPs2.bed | awk '{print $13}' | sort | uniq -c | sort -nr | grep -v VC # 4713243 substitution # 474379 deletion # 467490 insertion # 7 tandem_repeat # 5 delins # 1 multiple_nucleotide_substitution head -2 dogSNPs2.bed #chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt FILTER VC SID #chr1 111 112 rs850979046 0 . 111 112 0,0,0 A G . substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY # compare vcf with new bed file zcat GCA*vcf*.gz | grep -v "^#" | wc -l # 5655125 zcat dog*gz | grep -v "^#" | wc -l # 5655125 # diff = changed chr0 -> chr in the dog*vcf.gz #------------- ----------- ---------- # get VAI info for the variants # break up into 1M chunks zcat dogSNPs.vcf.gz | grep "^#" > top.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | head -1000000 > dog.1.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | head -2000000 | tail -1000000 > dog.2.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | head -3000000 | tail -1000000 > dog.3.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | head -4000000 | tail -1000000 > dog.4.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | head -5000000 | tail -1000000 > dog.5.vcf zcat dogSNPs.vcf.gz | grep -v "^#" | tail -655125 > dog.6.vcf # confirm count was right and no rows missing in last sub-million file # top record in .6.vcf: zcat dogSNPs.vcf.gz | grep -a3 -b3 rs23801983 # finds: rs852500901, bottom record in .5M.vcf as it should # sort and add header section to all: foreach file ( 1 2 3 4 5 6 ) cat top.vcf dog.$file.vcf > dog.${file}M.vcf rm dog.$file.vcf end # sort, add header section to all, zip and tabix and run VAI: foreach file ( 1 2 3 4 5 6 ) echo dog.$file.vcf sort -k1,1 -k2,2n dog.$file.vcf > dog.$file.sorted.vcf cat top.vcf dog.$file.sorted.vcf > dog.${file}M.vcf bgzip dog.${file}M.vcf tabix -p vcf dog.${file}M.vcf.gz nohup vai.pl --genetrack=refGene --variantLimit=2000000 \ --hgvsG=off --hgvsCN=off --hgvsP=off \ canFam3 dog.${file}M.vcf.gz > dogVAI.${file}M.tab rm dog.$file.vcf rm dog.$file.sorted.vcf end # dog.1.vcf # dog.2.vcf # Bad start cookie 58553421 freeing 1ad01458 # dog.3.vcf # dog.4.vcf # dog.5.vcf # dog.6.vcf # ??? # earlier runs complained about sorting chr9 and 10, but that stopped # that could explain why chr9 are missing? # check chroms foreach Mfile ( 1M 2M 3M 4M 5M 6M ) echo dogVAI.$Mfile.tab echo "total = `cat dogVAI.$Mfile.tab | grep -v "^#" | grep -v Variation | wc -l`" cat dogVAI.$Mfile.tab | grep -v "^#" | grep -v Variation \ | awk '{print $2}' | awk -F":" '{print $1}' | sed "s/chr//" | uniq -c echo end # num chrom # # dogVAI.1M.tab # total = 1003545 # 278257 1 # 201889 2 # 221767 3 # 197571 4 # 104061 5 # # dogVAI.2M.tab # total = 667077 # 153949 10 # 125894 5 # 195404 6 # 183370 7 # 8460 8 # # dogVAI.3M.tab # total = 1003876 # 10994 10 # 155516 11 # 171019 12 # 158827 13 # 129042 14 # 148097 15 # 164806 16 # 65575 17 # # dogVAI.4M.tab # total = 1004320 # 96116 17 # 155747 18 # 128985 19 # 142248 20 # 137296 21 # 133613 22 # 133426 23 # 76889 24 # # dogVAI.5M.tab # total = 1003316 # 62388 24 # 142397 25 # 132475 26 # 125810 27 # 118120 28 # 99798 29 # 108964 30 # 115047 31 # 96547 32 # 1770 33 # # dogVAI.6M.tab # total = 656521 # 81867 33 # 122826 34 # 87030 35 # 73781 36 # 86617 37 # 77114 38 # 127286 X # .2M. has too few (600K) ## dogVAI.2M.tab ## total = 667077 # and is missing chr8 (some), 9 and 10 (some?) # check total number of variants by chrom on chroms in .1M., .2M. and .3M. foreach chrom ( 5 6 7 8 9 10 ) echo "chr$chrom `zcat dogSNPs.vcf | grep chr$chrom | wc -l`" echo " `cat dogVAI.1M.tab dogVAI.2M.tab dogVAI.3M.tab | grep chr$chrom | wc -l`" end # chrN vcf # vai # chr5 227782 # 229955 # chr6 195105 # 195404 # chr7 182655 # 183370 # chr8 176807 # 8460 # chr9 166162 # 0 # chr10 164704 # 164943 # so it's only chr8 and chr9 missing some # pull out just those two chroms and process each separately foreach chrom ( 8 9 ) echo "chr$chrom" zcat dogSNPs.vcf | grep -v "^#" | grep chr$chrom > dog.chr$chrom.vcf end wc -l dog.chr* # 176806 dog.chr8.vcf # 166161 dog.chr9.vcf # rerun sort, zip, tabix vai on chr8 and 9 foreach chrom ( chr8 chr9 ) echo dog.$chrom.vcf sort -k1,1 -k2,2n dog.$chrom.vcf > dog.$chrom.sorted.vcf cat top.vcf dog.$chrom.sorted.vcf > dog.$chrom.vcf bgzip dog.$chrom.vcf tabix -p vcf dog.$chrom.vcf.gz nohup vai.pl --genetrack=refGene --variantLimit=2000000 \ --hgvsG=off --hgvsCN=off --hgvsP=off \ canFam3 dog.$chrom.vcf.gz > dogVAI.$chrom.tab rm dog.$chrom.sorted.vcf end # compare original dogSNPs.vcf number with result of the chr-specific files foreach chrom ( 8 9 ) echo "chr$chrom `zcat dogSNPs.vcf | grep chr$chrom | wc -l`" echo " `cat dogVAI.chr8.tab dogVAI.chr9.tab | grep chr$chrom | wc -l`" end # this looks good: # # chr8 176807 # 178750 # chr9 166162 # 166773 # now need to trim out chr8,9 from .2M. to avoid dups og *tab # -rw-rw-r-- 1 69411279 Mar 15 14:57 dogVAI.1M.tab # -rw-rw-r-- 1 46249858 Mar 15 14:57 dogVAI.2M.tab # -rw-rw-r-- 1 69950179 Mar 15 14:58 dogVAI.3M.tab # -rw-rw-r-- 1 70175430 Mar 15 14:58 dogVAI.4M.tab # -rw-rw-r-- 1 70183534 Mar 15 14:58 dogVAI.5M.tab # -rw-rw-r-- 1 45362411 Mar 15 14:59 dogVAI.6M.tab # -rw-rw-r-- 1 12316589 Mar 15 15:46 dogVAI.chr8.tab # -rw-rw-r-- 1 11640462 Mar 15 15:46 dogVAI.chr9.tab # remove chr8,9 from .2M. egrep -v "chr8|chr9" dogVAI.2M.tab > dogVAI.2M.trim.tab mv dogVAI.2M.tab dogVAI.2M.tab.old # these should have everything: og *VAI*tab -rw-rw-r-- 1 69411279 Mar 15 14:57 dogVAI.1M.tab -rw-rw-r-- 1 69950179 Mar 15 14:58 dogVAI.3M.tab -rw-rw-r-- 1 70175430 Mar 15 14:58 dogVAI.4M.tab -rw-rw-r-- 1 70183534 Mar 15 14:58 dogVAI.5M.tab -rw-rw-r-- 1 45362411 Mar 15 14:59 dogVAI.6M.tab -rw-rw-r-- 1 12316589 Mar 15 15:46 dogVAI.chr8.tab -rw-rw-r-- 1 11640462 Mar 15 15:46 dogVAI.chr9.tab -rw-rw-r-- 1 45672614 Mar 16 17:56 dogVAI.2M.trim.tab # cat *VAI*tab | egrep -v "#|Variation" | awk '{print $2}' | awk -F":" '{print $1}' | sort | uniq -c | sort -k2,2 # counts all chroms, but not very useful right here cat *VAI*tab | egrep -v "#|Variation" | wc -l # 5675716 # looks good # combine all the files more or less in order and then sort on second field (chr) cat dogVAI.1M.tab dogVAI.2M.trim.tab dogVAI.chr8.tab dogVAI.chr9.tab dogVAI.3M.tab \ dogVAI.4M.tab dogVAI.5M.tab dogVAI.6M.tab | egrep -v "^#|Varia" | sort -k2,2n > dogVAI.tab wc -l dogVAI.tab # 5675716 dogVAI.tab # which fields to keep ? grep missense dogVAI.tab | head | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$11}' # rs1152388402 chr1:54192143 C missense_variant I/M # rs1152388406 chr3:69456869 T missense_variant C/F # rs1152388411 chr5:42186445 G missense_variant H/R # rs1152388412 chr6:741429 T missense_variant R/H # second and third will not be in final bed but might be useful for troubleshooting #------------- ----------- ---------- # get ready to join bed and tab (vai) files # sort the dogSNPs2.bed on field 4 sort -k 4 dogSNPs2.bed > dogSNPs2.rsSorted.bed head -3 dogSNPs2.rsSorted.bed #chrom chromStart chromEnd name score strand thickStart thickEnd itemRgb ref alt FILTER VC SID #chr1 54192142 54192143 rs1152388402 0 . 54192142 54192143 0,0,0 G C . substitution LDG_DI_UGENT_BE_DOG #chr2 58622672 58622675 rs1152388403 0 . 58622672 58622675 0,0,0 ACT CTAGCTAC . delins LDG_DI_UGENT_BE_DOG tail -2 dogSNPs2.rsSorted.bed #chr37 14758761 14758762 rs9256889 0 . 14758761 14758762 0,0,0 G A . substitution TIGR_1.0,BROAD_DBSNP.2005.2.4.16.57 #chr37 14760548 14760548 rs9256890 0 . 14760548 14760548 0,0,0 A AA . insertion TIGR_1.0 # sort is ok # big join using VAI.sm (with only fields of interest) # and sorted on rsID cat dogVAI.tab | awk '{print $1"\t"$2"\t"$3"\t"$7"\t"$11}' | sort > dogVAI.sm.rsSorted.tab head -3 dogVAI.sm.rsSorted.tab # rs1152388402 chr1:54192143 C missense_variant I/M # rs1152388403 chr2:58622673-58622675 CTAGCTAC frameshift_variant YYWAV...KEAE*(415 tail -3 dogVAI.sm.rsSorted.tab # rs9256888 chr37:14758839 G intergenic_variant - # rs9256889 chr37:14758762 A intergenic_variant - # sort is ok # join on rsID field (4 in bed file, 1 in vai file) # and export # man join: # -e"." flag replaces missing field with . -- can't get it to work # -a1 flag retains items from file 1 even if unpaired # -t"\t" flag sets delimeter as tab (never worked) # found online # -e : tells what to substitute in case of missing fields. does not work without -o. # keep 1-11,13,14 from bed (12 is emply FILTER field) # keep 2-5 from vai (could have used full VAI and saved the shortening step) # could keep 2,7,11 from orig VAI # two fields of VAI*.tab will not be in final bed but might be useful for troubleshooting join -1 4 -2 1 -a1 -e "-" -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 \ 1.10 1.11 1.13 1.14 2.2 2.3 2.4 2.5 dogSNPs2.rsSorted.bed dogVAI.sm.rsSorted.tab \ > join.full.out # repeat with all fields included -- remove FILTER field and chrN:nnnn and vaiAllele fields later join -1 4 -2 1 -a1 -e "-" -o auto dogSNPs2.rsSorted.bed dogVAI.sm.rsSorted.tab \ > join.full.out # many alleles do not match btw eva and VAI cat join.full.out | awk '{print $11, $16}' | sort | uniq -c | sort -n # 1 A AGGCCAAGTCCCCAGTGAAGGAGG # 1 A GGC # 1 A GTGAAG # ... # 13258 GA A # 14152 CA A # 14451 TC C # 14880 TA A # 15647 AC C # 16785 CT T # 17796 AG G # 18448 TG G # 102410 G - # 134016 T - # 138818 A - # 148690 C - # 1146149 C C # 1148246 G G # 1216747 T T # 1217747 A A # rows in that output: cat join.full.out | awk '{print $11, $16}' | sort | uniq -c | sort -n | wc -l # 31612 # will carry on - there are some diffs btw the way EVA and VAI show alleles # make a bed file with just the fields in the evaSnp.as cat join.full.out | awk '{print $2"\t"$3"\t"$4"\t"$1"\t"$5"\t"$6"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$13"\t"$14"\t"$17"\t"$18}' \ | sort -k1,1 -k2,2n > dogSNPs.wVAI.bed # it did not like breaking the line with a \ in the mid of print string head -2 dogSNPs.wVAI.bed # chr1 111 112 rs850979046 0 . 111 112 0,0,0 A G substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY intergenic_variant - # chr1 131 132 rs851217143 0 . 131 132 0,0,0 C A substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY intergenic_variant - tail -2 dogSNPs.wVAI.bed # chrX 123869059 123869060 rs852151786 0 . 123869059 123869060 0,0,0 G A substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY intergenic_variant - # chrX 123869065 123869066 rs850877859 0 . 123869065 123869066 0,0,0 C A substitution BROAD_VGB_CANINE_PON_SNP_DISCOVERY intergenic_variant - cat dogSNPs.wVAI.bed | grep miss | head -2 # chr1 13734813 13734813 rs852746064 0 . 13734813 13734813 0,0,0 C CG insertion BROAD_VGB_CANINE_PON_SNP_DISCOVERY missense_variantD/Y # chr1 13734814 13734814 rs852366799 0 . 13734814 13734814 0,0,0 G GC insertion BROAD_VGB_CANINE_PON_SNP_DISCOVERY missense_variantD/Y cat dogSNPs.wVAI.bed | awk '{print $(NF-1)}' | sort | uniq -c | sort -n # 1 stop_retained_variant # 2 stop_lost # 5 initiator_codon_variant # 12 coding_sequence_variant # 13 complex_transcript_variant # 45 splice_donor_variant # 55 splice_acceptor_variant # 76 stop_gained # 80 inframe_deletion # 83 inframe_insertion # 194 non_coding_transcript_exon_variant # 396 frameshift_variant # 525 5_prime_UTR_variant # 784 NMD_transcript_variant # 1305 splice_region_variant # 1349 no_sequence_alteration # 2225 3_prime_UTR_variant # 3386 missense_variant # 4577 synonymous_variant # 35932 upstream_gene_variant # 37856 downstream_gene_variant # 146403 intron_variant # 5560422 intergenic_variant #------------- ----------- ---------- # substitute itemRgb values # get variant colors set bedFile=dogSNPs.wVAI.bed # build sed file to substitute colors in single pass through $bedFile set varClass=`cat variantColors | egrep "255|128" | awk '{print $1}'` set varColor=`cat variantColors | egrep "255|128" | awk '{print $2}'` set numTerms=`echo $varClass | wc -w` echo "numTerms = $numTerms" # iterate through var classes and get rgb color for substitution set i=1 while ( $i <= $numTerms ) echo "perl -pe "\'"s/(0,0,0)(.*)($varClass[$i])/$varColor[$i]\2\3/"\'" | \\" >> varsedFile @ i = $i + 1 end # terminate last pipe echo "cat" >> varsedFile # substitute itemRgb chmod 700 varsedFile cat $bedFile | varsedFile > dogSNPs.fin.bed # check substitution cat dogSNPs.fin.bed | awk '{print $9,"\t", $14}' | sort | uniq -c | sort -nr # 5560422 0,0,0 intergenic_variant # 146403 0,0,0 intron_variant # 37856 0,0,0 downstream_gene_variant # 35932 0,0,0 upstream_gene_variant # 4577 0,128,0 synonymous_variant # 3386 255,0,0 missense_variant # 2225 0,0,255 3_prime_UTR_variant # 1349 0,0,0 no_sequence_alteration # 1305 255,0,0 splice_region_variant # 784 0,0,0 NMD_transcript_variant # 525 0,0,255 5_prime_UTR_variant # 396 255,0,0 frameshift_variant # 194 0,0,255 non_coding_transcript_exon_variant # 83 255,0,0 inframe_insertion # 80 255,0,0 inframe_deletion # 76 255,0,0 stop_gained # 55 255,0,0 splice_acceptor_variant # 45 255,0,0 splice_donor_variant # 13 0,0,255 complex_transcript_variant # 12 255,0,0 coding_sequence_variant # 5 255,0,0 initiator_codon_variant # 2 255,0,0 stop_lost # 1 0,128,0 stop_retained_variant # check concordance btw EVA and VAI classes cat dogSNPs.fin.bed | awk '{print $12,"\t", $14}' | sort | uniq -c | sort -nr # 4564905 substitution intergenic_variant # 499731 deletion intergenic_variant # 495782 insertion intergenic_variant # 115427 substitution intron_variant # 30498 substitution downstream_gene_variant # 28906 substitution upstream_gene_variant # 15795 deletion intron_variant # 15176 insertion intron_variant # 4571 substitution synonymous_variant # 3778 deletion upstream_gene_variant # 3713 insertion downstream_gene_variant # 3645 deletion downstream_gene_variant # 3366 substitution missense_variant # 3248 insertion upstream_gene_variant # 1682 substitution 3_prime_UTR_variant # 1258 substitution no_sequence_alteration # 860 substitution splice_region_variant # 604 substitution NMD_transcript_variant # 384 substitution 5_prime_UTR_variant # 282 insertion 3_prime_UTR_variant # 261 deletion 3_prime_UTR_variant # 255 deletion splice_region_variant # 196 insertion frameshift_variant # 189 insertion splice_region_variant # 173 deletion frameshift_variant # 168 substitution non_coding_transcript_exon_variant # 117 deletion NMD_transcript_variant # 73 deletion 5_prime_UTR_variant # 70 deletion inframe_deletion # 63 insertion NMD_transcript_variant # 63 insertion 5_prime_UTR_variant # 62 substitution stop_gained # 59 insertion inframe_insertion # 52 insertion no_sequence_alteration # 38 deletion no_sequence_alteration # 31 substitution splice_acceptor_variant # 27 substitution splice_donor_variant # 26 substitution frameshift_variant # 20 deletion splice_acceptor_variant # 14 substitution inframe_insertion # 14 deletion non_coding_transcript_exon_variant # 13 insertion missense_variant # 12 insertion non_coding_transcript_exon_variant # 12 deletion splice_donor_variant # 10 insertion stop_gained # 10 deletion inframe_insertion # 8 insertion complex_transcript_variant # 6 insertion splice_donor_variant # 6 insertion inframe_deletion # 6 insertion coding_sequence_variant # 6 deletion missense_variant # 6 deletion coding_sequence_variant # 5 tandem_repeat intron_variant # 5 delins 5_prime_UTR_variant # 5 deletion complex_transcript_variant # 4 substitution initiator_codon_variant # 4 substitution inframe_deletion # 4 insertion splice_acceptor_variant # 4 deletion stop_gained # 3 tandem_repeat intergenic_variant # 3 insertion synonymous_variant # 3 deletion synonymous_variant # 2 substitution stop_lost # 1 tandem_repeat splice_region_variant # 1 substitution stop_retained_variant # 1 multiple_nucleotide_substitution no_sequence_alteration # 1 multiple_nucleotide_substitution missense_variant # 1 delins intergenic_variant # 1 delins frameshift_variant # 1 deletion initiator_codon_varia # intersting. some classes seem odd: # 3 insertion synonymous_variant # 3 deletion synonymous_variant # 6 insertion inframe_deletion # 10 deletion inframe_insertion # 13 insertion missense_variant # 14 substitution inframe_insertion # 26 substitution frameshift_variant # 38 deletion no_sequence_alteration # 52 insertion no_sequence_alteration #------------- ----------- ---------- # make bigBed bedToBigBed -tab -as=evaSnp.as -type=bed9+6 -extraIndex=name dogSNPs.fin.bed chromInfo.txt evaSnp.bb # pass1 - making usageList (39 chroms): 3107 millis # pass2 - checking and writing primary data (5795726 records, 15 fields): 30701 millis # Sorting and writing extra index 0: 4152 millis bigBedInfo evaSnp.bb # version: 4 # fieldCount: 15 # hasHeaderExtension: yes # isCompressed: yes # isSwapped: 0 # extraIndexCount: 1 # itemCount: 5,795,726 # primaryDataSize: 119,152,623 # primaryIndexSize: 373,584 # zoomLevels: 10 # chromCount: 39 # basesCovered: 6,374,175 # meanDepth (of bases covered): 1.066073 # minDepth: 1.000000 # maxDepth: 45.000000 # std of depth: 0.527133 # move new file to /gbdb cp evaSnp.bb /cluster/data/canFam3/bed/evaSnp/evaSnp.bb # trackDb/ra entry: cd kent/src/hg/makeDb/trackDb/dog/canFam3 # add trackDb rows # itemRgb on # mouseOver $ucscClass $aaChange # change trackDb rows # type bigBed 9 + # change search from padding 50 # padding 20 # set filters # list of ucscClass: cat variantColors | grep . | awk '{print $1","}' # rearranging to useful order (red first, then blue, then green, then black) # # missense_variant, # frameshift_variant, # inframe_deletion, # inframe_insertion, # initiator_codon_variant, # stop_gained, # stop_lost, # splice_acceptor_variant, # splice_donor_variant, # splice_region_variant, # exon_loss_variant, # coding_sequence_variant, # 5_prime_UTR_variant, # 3_prime_UTR_variant, # NMD_transcript_variant, # synonymous_variant, # stop_retained_variant, # complex_transcript_variant, # intron_variant, # non_coding_transcript_exon_variant, # upstream_gene_variant, # downstream_gene_variant, # intergenic_variant, # no_sequence_alteration, # add new filter lines to trackDb # filterValues.varClass (new name for same field -- orig EVA class # filterValues.ucscClass missense_variant,frameshift_variant,inframe_deletion,inframe_insertion,initiator_codon_variant,stop_gained,stop_lost,splice_acceptor_variant,splice_donor_variant,splice_region_variant,exon_loss_variant,coding_sequence_variant,5_prime_UTR_variant,3_prime_UTR_variant,NMD_transcript_variant,synonymous_variant,stop_retained_variant,complex_transcript_variant,non_coding_transcript_exon_variant,intron_variant,upstream_gene_variant,downstream_gene_variant,intergenic_variant,no_sequence_alteration, # drop old filter lines (no longer using RS_VALIDATED field filterValues.valid No,Yes ############################################################################## +# FANTOM5 refs #21605 (2023-06-09 Gerardo) +cd /hive/data/outside/ +mkdir fantom5 +cd fantom5 +hubClone -download https://fantom.gsc.riken.jp/5/datahub/hub.txt +cd /gbdb/canFam3 +mkdir fantom5 +cd fantom5 +# Making symlinks for big files +for file in $(ls /hive/data/outside/fantom5/riken_f5/canFam3/*.b*) ; do ln -s $file; done +cd /hive/data/outside/fantom5/riken_f5/canFam3/ +cp trackDb.txt fantom5.ra +vi fantom5.ra +# Indented subtracks +# Changing bigDataUrl +# Removing non-alpha characters +cd ~/kent/src/hg/makeDb/trackDb/human/canFam3 +cp /hive/data/outside/fantom5/riken_f5/canFam3/fantom5.ra . +vi trackDb.ra +#include fantom5.ra alpha +##############################################################################