6b1648806fc1de0da4b16bb23f51e0fd52065ef7 gperez2 Wed Apr 13 13:12:33 2022 -0700 making ReMap hub into native track, refs #28960 diff --git src/hg/makeDb/doc/dm6.txt src/hg/makeDb/doc/dm6.txt index f04ebd8..9e5655e 100644 --- src/hg/makeDb/doc/dm6.txt +++ src/hg/makeDb/doc/dm6.txt @@ -1,3835 +1,3857 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the dm6 # Drosophila melanogaster -- BDGP Release 6.0 + ISO1 MT (Aug, 2014) # The FlyBase Consortium/Berkeley Drosophila Genome Project/Celera Genomics # BDGP Release 6 + ISO1 MT ############################################################################# # fetch sequence from new style download directory (DONE - 2014-08-27 - Hiram) # NCBI has redesigned their FTP download site, new type of address # and naming scheme mkdir -p /hive/data/genomes/dm6/genbank cd /hive/data/genomes/dm6/genbank rsync -L -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/invertebrate/Drosophila_melanogaster/latest_assembly_versions/GCA_000001215.4_Release_6_plus_ISO1_MT/ ./ # measure what we have here: faSize GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/*.fna.gz \ GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz \ GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly/unplaced_scaffolds/FASTA/*.fna.gz \ GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly/unlocalized_scaffolds/FASTA/*.fna.gz # 143726002 bases (1152978 N's 142573024 real 142573024 upper 0 lower) # in 1870 sequences in 11 files # Total size: mean 76858.8 sd 1382100.2 min 544 (DS485566.2) # max 32079331 (AE014297.3) median 1577 ############################################################################# # fixup to UCSC naming scheme (DONE - 2014-08-27 - Hiram) mkdir /hive/data/genomes/dm6/ucsc cd /hive/data/genomes/dm6/ucsc # evolving these scripts over time, could become source tree scripts cat << '_EOF_' > ucscCompositeAgp.pl #!/bin/env perl use strict; use warnings; my %accToChr; my $primary="../genbank/GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly"; open (FH, "<$primary/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat $primary/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat $primary/assembled_chromosomes/FASTA/chr${chrN}.fna.gz|") or die "can not read chr${chrN}.fna.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; my $primary="../genbank/GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly"; open (FH, "<$primary/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $acc =~ s/\./v/; $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "$primary/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "$primary/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fna.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; $acc =~ s/\./v/; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/ .*//; $acc =~ s/\./v/; $acc =~ s/>//; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $primary="../genbank/GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/Primary_Assembly"; my $agpFile = "$primary/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "$primary/unplaced_scaffolds/FASTA/unplaced.scaf.fna.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\./v/; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/ .*//; $line =~ s/\./v/; $line =~ s/>//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x ucscCompositeAgp.pl unlocalized.pl unplaced.pl ./ucscCompositeAgp.pl ./unlocalized.pl ./unplaced.pl zcat ../genbank/GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \ | grep -v "^#" | sed -e 's/^KJ947872.2/chrM/' > chrM.agp zcat ../genbank/GCA_000001215.4_Release_6_plus_ISO1_MT_assembly_structure/non-nuclear/assembled_chromosomes/FASTA/chrMT.fna.gz \ | sed -e 's/^>.*/>chrM/' > chrM.fa # verify this sequence is still the same as it was before # these translations faSize *.fa # 143726002 bases (1152978 N's 142573024 real 142573024 upper 0 lower) # in 1870 sequences in 11 files # Total size: mean 76858.8 sd 1382100.2 min 544 (chrUn_DS485566v2) # max 32079331 (chr3R) median 1577 ############################################################################# # pick up photo (DONE - 2014-06-06 - Chin) mkdir /hive/data/genomes/dm6/photo cd /hive/data/genomes/dm6/photo wget --timestamping \ http://upload.wikimedia.org/wikipedia/commons/1/1d/Drosophila_melanogaster.jpg convert -geometry "190x300" Drosophila_melanogaster.jpg New-Drosophila_melanogaster.jpg # check in the New-Drosophila_melanogaster.jpg file into # the source tree: # -rw-rw-r-- 1 9842 Jun 6 14:55 Drosophila_melanogaster.jpg # and get that installed in /usr/local/apache/htdocs/images/ ############################################################################# # Initial database build (DONE - 2014-08-28 - Hiram) cd /hive/data/genomes/dm6 cat << '_EOF_' > dm6.config.ra # Config parameters for makeGenomeDb.pl: db dm6 clade insect genomeCladePriority 10 scientificName Drosophila melanogaster commonName D. melanogaster assemblyDate Aug. 2014 assemblyLabel The FlyBase Consortium/Berkeley Drosophila Genome Project/Celera Genomics assemblyShortLabel BDGP Release 6 + ISO1 MT orderKey 9060 # chrM mito sequence KJ947872.2 included mitoAcc none fastaFiles /hive/data/genomes/dm6/ucsc/chr*.fa agpFiles /hive/data/genomes/dm6/ucsc/chr*.agp # qualFiles /dev/null dbDbSpeciesDir drosophila photoCreditURL http://de.wikipedia.org/wiki/Benutzer:Mr.checker photoCreditName Wikimedia Commons/Mr.checker ncbiGenomeId 47 ncbiAssemblyId 202931 ncbiAssemblyName BDGP Release 6 + ISO1 MT ncbiBioProject 164 genBankAccessionID GCA_000001215.4 taxId 7227 '_EOF_' # << happy emacs # stepwise to verify sequence and AGP file time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -stop=seq dm6.config.ra > seq.log 2>&1 # real 0m9.989s # verify sequence and AGP are OK: time makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev -fileServer=hgwdev \ -continue=agp -stop=agp dm6.config.ra > agp.log 2>&1 # real 0m4.901s # then finish it off: time nice -n +19 makeGenomeDb.pl -workhorse=hgwdev -dbHost=hgwdev \ -fileServer=hgwdev -continue=db dm6.config.ra > db.log 2>&1 # real 1m27.449s ########################################################################## # running repeat masker (DONE - 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/repeatMasker cd /hive/data/genomes/dm6/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=ku dm6 > do.log 2>&1 & # real 44m51.83 cat faSize.rmsk.txt # 143726002 bases (1152978 N's 142573024 real 112610723 upper # 29962301 lower) in 1870 sequences in 1 files # Total size: mean 76858.8 sd 1382100.2 min 544 (chrUn_DS485566v2) # max 32079331 (chr3R) median 1577 # %20.85 masked total, %21.02 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.5 # January 31 2015 (open-4-0-5) version of RepeatMasker # CC RELEASE 20140131; featureBits -countGaps dm6 rmsk # 29968013 bases of 143726002 (20.851%) in intersection # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/simpleRepeat cd /hive/data/genomes/dm6/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=ku \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=ku \ dm6 > do.log 2>&1 & # real 23m17.471s cat fb.simpleRepeat # 3925949 bases of 142573024 (2.754%) in intersection # add to rmsk after it is done: cd /hive/data/genomes/dm6 twoBitMask dm6.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed dm6.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa dm6.2bit stdout | faSize stdin > faSize.dm6.2bit.txt cat faSize.dm6.2bit.txt # 143726002 bases (1152978 N's 142573024 real 112531045 upper # 30041979 # lower) in 1870 sequences in 1 files # Total size: mean 76858.8 sd 1382100.2 min 544 (chrUn_DS485566v2) # max 32079331 (chr3R) median 1577 # %20.90 masked total, %21.07 masked real rm /gbdb/dm6/dm6.2bit ln -s `pwd`/dm6.2bit /gbdb/dm6/dm6.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/gap cd /hive/data/genomes/dm6/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../dm6.unmasked.2bit > findMotif.txt 2>&1 # real 0m1.290s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps dm6 -not gap -bed=notGap.bed # 143726002 bases of 143726002 (100.000%) in intersection featureBits dm6 -not gap -bed=notGap.bed allGaps.bed time featureBits -countGaps dm6 allGaps.bed notGap.bed -bed=new.gaps.bed # 143726002 bases of 143726002 (100.000%) in intersection time featureBits -countGaps dm6 allGaps.bed notGap.bed -bed=new.gaps.bed # 1152978 bases of 143726002 (0.802%) in intersection # there are some very large gaps that are unannotated: awk '{print $3-$2,$0}' new.gaps.bed | sort -rn | head # 53860 chr3L 26952525 27006385 chr3L.7 # 53800 chr3L 26882321 26936121 chr3L.6 # 32000 chrX 22317804 22349804 chrX.5 # the original 'component' AGP files have zero annotations in # the chromosomes. In fact, there are no gaps annotated: hgsql -e 'select count(*) from gap;' dm6 # +----------+ # | count(*) | # +----------+ # | 0 | # +----------+ # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" dm6 | sort -n | tail -1 # (no output, assume 0) cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=0; open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed"; while (my $line = <FH>) { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed featureBits -countGaps dm6 other.bed # 1152978 bases of 143726002 (0.802%) in intersection wc -l other.bed # 572 hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ -noLoad dm6 otherGap other.bed # Read 572 elements of size 8 from other.bed # starting with this many hgsql -e "select count(*) from gap;" dm6 # 0 hgsql dm6 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" dm6 # 572 # == 0 + 572 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 dm6 nonBridged.lift -bedFile=nonBridged.bed # see example in danRer7.txt # there are no non-bridged gaps here: hgsql -N -e "select bridge from gap;" dm6 | sort | uniq -c # 572 yes ########################################################################## ## WINDOWMASKER (DONE - 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/windowMasker cd /hive/data/genomes/dm6/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev dm6 > do.log 2>&1 & # real 6m13.489s # Masking statistics cat faSize.dm6.cleanWMSdust.txt # 143726002 bases (1152978 N's 142573024 real 108961589 upper # 33611435 lower) in 1870 sequences in 1 files # Total size: mean 76858.8 sd 1382100.2 min 544 (chrUn_DS485566v2) # max 32079331 (chr3R) median 1577 # %23.39 masked total, %23.57 masked real # how much does this window masker and repeat masker overlap: # if RM finished before this got here, the answer is in: cat fb.dm6.rmsk.windowmaskerSdust.txt # 16809665 bases of 143726002 (11.696%) in intersection ############################################################################# # cytoBandIdeo - (OBSOLETE, redone below 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/cytoBand cd /hive/data/genomes/dm6/bed/cytoBand makeCytoBandIdeo.csh dm6 ######################################################################### # CHROMOSOME BANDS (DONE - 2014-10-09 - Hiram) ssh hgwdev cd /cluster/data/dm6/bed/cytoBand # As of 5.3, the GFF now includes what we need to make cytoBandIdeo # directly: zgrep chromosome_band ../../flyBase/dmel-all-r6.02.gff.gz > chromBand.gff cat << '_EOF_' > gffToBand.pl #!/usr/bin/env perl use warnings; use strict; open (FH, "<chromBand.gff") or die "can not read chromBand.gff"; while (my $line = <FH>) { chomp $line; my @w = split('\t', $line); $w[3]--; $w[3] = 0 if ($w[3] < 0); $w[8] =~ s/^.*Name=band-//; $w[8] =~ s/[;].*$//; if ($w[8] !~ /^\d+[A-Z]\d+$/) { next; } if ($w[4] <= 0) { next; } printf "chr%s\t%d\t%d\t%s\tn/a\n", $w[0], $w[3], $w[4], $w[8]; } close (FH); '_EOF_' # << happy emacs chmod +x gffToBand.pl ./gffToBand.pl > dm6.cytoBand.tab # re-use items from the previously loaded cytoBand data to add # to the ones defined for the 'real' cytoBand data: egrep -v -w "chr2L|chr2R|chr3L|chr3R|chr4|chrX" dm6.cytoBand \ > dm6.combined.bed cat dm6.cytoBand.tab >> dm6.combined.bed featureBits -bed=notCovered.bed -not -countGaps dm6 dm6.combined.bed awk '{printf "%s\t%d\t%d\tn/a\tgneg\n", $1, $2, $3}' notCovered.bed \ >> dm6.combined.bed featureBits -countGaps dm6 dm6.combined.bed maxLen=`awk '{print length($1)}' dm6.combined.bed | sort -rn | head -1` sed -e "s/12/${maxLen}/;" ~/kent/src/hg/lib/cytoBand.sql > dm6.cytoBand.sql hgLoadSqlTab dm6 cytoBand dm6.cytoBand.sql dm6.combined.bed checkTableCoords -verbose=2 dm6 cytoBand hgLoadSqlTab dm6 cytoBandIdeo dm6.cytoBand.sql dm6.combined.bed checkTableCoords -verbose=2 dm6 cytoBandIdeo ########################################################################## # cpgIslands - (DONE - 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/cpgIslands cd /hive/data/genomes/dm6/bed/cpgIslands time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku \ -workhorse=hgwdev -smallClusterHub=ku dm6 > do.log 2>&1 & # real 35m11.446s cat fb.dm6.cpgIslandExt.txt # 15067148 bases of 142573024 (10.568%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/cpgIslandsUnmasked cd /hive/data/genomes/dm6/bed/cpgIslandsUnmasked # run stepwise so the loading can be done in a different table time doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/dm6/dm6.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku dm6 > do.log 2>&1 & # real 40m18.250s cat fb.dm6.cpgIslandExtUnmasked.txt # 21347147 bases of 142573024 (14.973%) in intersection ######################################################################### # genscan - (DONE 2014-08-28 - Hiram) mkdir /hive/data/genomes/dm6/bed/genscan cd /hive/data/genomes/dm6/bed/genscan time doGenscan.pl -buildDir=`pwd` -workhorse=hgwdev -dbHost=hgwdev \ -bigClusterHub=ku dm6 > do.log 2>&1 & # real 35m23.353s cat fb.dm6.genscan.txt # 24277318 bases of 142573024 (17.028%) in intersection cat fb.dm6.genscanSubopt.txt # 676185 bases of 142573024 (0.474%) in intersection ######################################################################## # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE 2014-08-28 - Hiram) # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.96% of human judging by gapless dm1 genome size from # featureBits -- we would use 50, but bump that up to 100 # a bit to be more conservative). # 143726002 bases (1152978 N's 142573024 real 108961589 upper calc \( 142573024 / 2897316137 \) \* 1024 # ( 142573024 / 2897316137 ) * 1024 = 50.389661 # round up to 100 cd /hive/data/genomes/dm6 blat dm6.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/dm6.11.ooc -repMatch=100 # Wrote 4574 overused 11-mers to jkStuff/dm6.11.ooc # there are *only* bridged gaps, no lift file needed for genbank # Ignore the 2 yes non-bridged gap hgsql -N -e "select bridge from gap;" dm6 | sort | uniq -c # 572 yes ######################################################################### # MAKE 11.OOC FILE FOR BLAT (DONE 2014-06-07 - Chin) # dm3 11.ooc is needed to lift flyBase from dm3 to dm6 for encode3 submission # Use -repMatch=100 (based on size -- for human we use 1024, and # fly size is ~4.4% of human judging by gapless dm1 genome size from # featureBits -- we would use 45, but bump that up a bit to be more # conservative). cd /hive/data/genomes/dm3 blat dm3.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/dm3.11.ooc -repMatch=100 # Wrote 7657 overused 11-mers to /hive/data/genomes/dm3/11.ooc ############################################################################# # dm6 LIFTOVER to dm3 (DONE - 2015-01-20 - Hiram) sh hgwdev mkdir /hive/data/genomes/dm6/bed/blat.dm3.2015-01-20 cd /hive/data/genomes/dm6/bed/blat.dm3.2015-01-20 # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -verbose=2 \ -debug -ooc=/hive/data/genomes/dm6/jkStuff/dm6.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ dm6 dm3 # if that appears OK, then running: time doSameSpeciesLiftOver.pl -verbose=2 \ -ooc=/hive/data/genomes/dm6/jkStuff/dm6.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ dm6 dm3 > do.log 2>&1 # about 36 minutes # verify the convert link on the browser is now active from hg19 to hg38 ############################################################################# # dm3 LIFTOVER to dm6 (DONE - 2014-08-28 - Chin) # will move steps to dm3.txt when dm6 is released. sh hgwdev cd /hive/data/genomes/dm3 ln -s jkStuff/dm3.11.ooc 11.ooc # -debug run to create run dir, preview scripts... doSameSpeciesLiftOver.pl -debug dm3 dm6 cd /hive/data/genomes/dm3/bed/blat.dm6.2014-08-28 time nice -n +19 doSameSpeciesLiftOver.pl -verbose=2 \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ dm3 dm6 > do.log 2>&1 & # real 22m18.988s ######################################################################### # LIFTOVER FLYBASE 5.12 ANNOTATIONS FROM DM3 (working 2014-06-09 - Chin) # NOTE: # In order to make flyBase gene ui to work, more steps are needed, other wise, # hgc will shows errors like "Doh, no go for CG3727-RB from flyBasePep" # all the time. Major objevtive here is to create flyBaseGene tables # so that we can do featureBits on it for encode3 submission. ssh hgwdev mkdir /hive/data/genomes/dm6/bed/flybase5.12.liftOver cd /hive/data/genomes/dm6/bed/flybase5.12.liftOver hgsql dm3 -N -e 'select * from flyBaseGene' | cut -f2- > dm3.flyBaseGene.tab liftOver -genePred dm3.flyBaseGene.tab \ /hive/data/genomes/dm3/bed/liftOver/dm3ToDm6.over.chain.gz \ flyBaseLiftGene.tab flyBaseLiftGene.unmapped.tab wc -l *.tab # 21243 dm3.flyBaseGene.tab # 21216 flyBaseLiftGene.tab # 54 flyBaseLiftGene.unmapped.tab ldHgGene -predTab dm6 flyBaseGene flyBaseLiftGene.tab ############################################################################ # set same default position asm dm3 (DONE - 2014-08-28 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr2L:826,001-851,000" where name="dm6";' \ hgcentraltest ############################################################################ # create ucscToINSDC name mapping (DONE - 2014-08-29 - Hiram) mkdir /hive/data/genomes/dm6/bed/ucscToINSDC cd /hive/data/genomes/dm6/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp /hive/data/genomes/rn6/bed/ucscToINSDC/translateNames.sh . # needed to be fixed up here for new genbank organization # to see chrM accession: hgsql -e 'select * from gold;' dm6 | grep chrM # use that accession here: ./translateNames.sh KJ947872.2 awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 22 # use the 22 in this sed: sed -e "s/21/22/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab dm6 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords dm6 ucscToINSDC # should cover all bases featureBits -countGaps dm6 ucscToINSDC # 143726002 bases of 143726002 (100.000%) in intersection ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2014-07-09 - Hiram) export maxLen=`hgsql -N -e 'select frag from gold;' dm6 | awk '{print length($0)}' | sort -run | head -1` echo $maxLen # 10 export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " hgsql -N -e 'select frag from gold;' dm6 | sort -u \ | awk '{ print substr($0,'$C',1) }' | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done # 1: ACDK # 2: EJPS # 3: 049 # 4: 0148 # 5: 34567 # 6: 0123456789 # 7: 0123456789 # 8: 0123456789 # 9: . # 10: 123456 # since we have the composite fragment IDs, there are a variety of # dot versions: hgsql -N -e 'select frag from gold;' dm6 | sed -e 's/.*\.//' \ | sort | uniq -c | sort -rn # 1861 1 # 3 5 # 3 2 # 1 6 # 1 4 # 1 3 # hence, add to trackDb/drosophila/dm6/trackDb.ra searchTable gold searchMethod prefix searchType bed shortCircuit 1 termRegex [ACDK][EJPS][0-9]+(\.[0-9]+)* query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 # test pattern: hgsql -N -e 'select frag from gold;' dm6 | wc -l # 1870 hgsql -N -e 'select frag from gold;' dm6 \ | egrep -e '[ACDK][EJPS][0-9]+(\.[0-9]+)*' | wc -l # 1870 hgsql -N -e 'select frag from gold;' dm6 \ | egrep -v -e '[ACDK][EJPS][0-9]+(\.[0-9]+)*' | wc -l # 0 ############################################################################ # AUTO UPDATE GENBANK (DONE - 2012-05-04 - Hiram) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Drosophila melanogaster 61221 821011 30813 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add before dm3 section:: # dm6 (D. melanogaster) dm6.serverGenome = /hive/data/genomes/dm6/dm6.2bit dm6.clusterGenome = /hive/data/genomes/dm6/dm6.2bit dm6.ooc = /hive/data/genomes/dm6/jkStuff/dm6.11.ooc dm6.lift = no dm6.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} dm6.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} dm6.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} dm6.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} dm6.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} dm6.genbank.mrna.xeno.load = yes # dm6.upstreamGeneTbl = refGene # dm6.upstreamMaf = multiz15way # /hive/data/genomes/dm3/bed/multiz15way/species.lst git commit -m "adding dm6 fly refs #4049" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S dm6 # long running job managed in screen cd /cluster/data/genbank time nice -n +19 ./bin/gbAlignStep -initial dm6 # logFile: var/build/logs/2014.08.29-10:41:04.dm6.initalign.log # real 199m22.304s # load database when finished ssh hgwdev cd /cluster/data/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad dm6 & # logFile: var/dbload/hgwdev/logs/2014.09.03-15:51:49.dm6.dbload.log # real 19m24.163s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add dm6 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added dm6 refs #4049" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # BLATSERVERS ENTRY (DONE - 2014-08-29 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("dm6", "blat4c", "17858", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("dm6", "blat4c", "17859", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################## # LASTZ D. ananassae DroAna3 (DONE 2014-08-29 - Hiram) screen -S droAna3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroAna3.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroAna3.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. ananassae BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. ananassae SEQ2_DIR=/hive/data/genomes/droAna3/droAna3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droAna3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroAna3.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 64m19.065s cat fb.dm6.chainDroAna3Link.txt # 95925289 bases of 142573024 (67.282%) in intersection # running the swap mkdir /hive/data/genomes/droAna3/bed/blastz.dm6.swap cd /hive/data/genomes/droAna3/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroAna3.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 10m43.121s cat fb.droAna3.chainDm6Link.txt # 125698535 bases of 213918817 (58.760%) in intersection ########################################################################### # LASTZ D. sechellia DroSec1 (DONE 2014-08-29 - Hiram) screen -S droSec1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroSec1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroSec1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. sechellia SEQ2_DIR=/hive/data/genomes/droSec1/droSec1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droSec1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroSec1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 47m43.186s cat fb.dm6.chainDroSec1Link.txt # 117709624 bases of 142573024 (82.561%) in intersection # running the swap mkdir /hive/data/genomes/droSec1/bed/blastz.dm6.swap cd /hive/data/genomes/droSec1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroSec1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 9m56.239s cat fb.droSec1.chainDm6Link.txt # 128262529 bases of 157238575 (81.572%) in intersection ########################################################################### # LASTZ D. simulans DroSim1 (DONE 2014-08-29 - Hiram) screen -S droSim1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroSim1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroSim1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. simulans BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. simulans SEQ2_DIR=/hive/data/genomes/droSim1/droSim1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droSim1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroSim1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 39m35.348s cat fb.dm6.chainDroSim1Link.txt # 114117667 bases of 142573024 (80.042%) in intersection # running the swap mkdir /hive/data/genomes/droSim1/bed/blastz.dm6.swap cd /hive/data/genomes/droSim1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroSim1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m50.918s cat fb.droSim1.chainDm6Link.txt # 113676741 bases of 127256433 (89.329%) in intersection ########################################################################### # LASTZ D. erecta DroEre2 (DONE 2014-08-29 - Hiram) screen -S droEre2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroEre2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroEre2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. erecta BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. erecta SEQ2_DIR=/hive/data/genomes/droEre2/droEre2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droEre2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroEre2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 19m31.724s cat fb.dm6.chainDroEre2Link.txt # 114091585 bases of 142573024 (80.023%) in intersection # running the swap mkdir /hive/data/genomes/droEre2/bed/blastz.dm6.swap cd /hive/data/genomes/droEre2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroEre2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 8m35.520s cat fb.droEre2.chainDm6Link.txt # 122430082 bases of 145084019 (84.386%) in intersection ########################################################################### # LASTZ D. grimshawi DroGri2 (DONE 2014-08-29 - Hiram) screen -S droGri2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroGri2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroGri2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. grimshawi BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. grimshawi SEQ2_DIR=/hive/data/genomes/droGri2/droGri2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droGri2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroGri2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 43m54.742s cat fb.dm6.chainDroGri2Link.txt # 72682996 bases of 142573024 (50.979%) in intersection # running the swap mkdir /hive/data/genomes/droGri2/bed/blastz.dm6.swap cd /hive/data/genomes/droGri2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroGri2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m15.091s cat fb.droGri2.chainDm6Link.txt # 79673484 bases of 186090669 (42.814%) in intersection ########################################################################### # LASTZ D. persimilis DroPer1 (DONE 2014-08-29 - Hiram) screen -S droPer1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroPer1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroPer1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. persimilis BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. persimilis SEQ2_DIR=/hive/data/genomes/droPer1/droPer1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droPer1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroPer1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 49m9.697s cat fb.dm6.chainDroPer1Link.txt # 84187839 bases of 142573024 (59.049%) in intersection # running the swap mkdir /hive/data/genomes/droPer1/bed/blastz.dm6.swap cd /hive/data/genomes/droPer1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroPer1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 13m28.190s cat fb.droPer1.chainDm6Link.txt # 92967575 bases of 175583556 (52.948%) in intersection ########################################################################### # LASTZ D. virilis DroVir3 (DONE 2014-08-29 - Hiram) screen -S droVir3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroVir3.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroVir3.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. virilis BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. virilis SEQ2_DIR=/hive/data/genomes/droVir3/droVir3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droVir3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroVir3.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 53m38.503s cat fb.dm6.chainDroVir3Link.txt # 74048111 bases of 142573024 (51.937%) in intersection # running the swap mkdir /hive/data/genomes/droVir3/bed/blastz.dm6.swap cd /hive/data/genomes/droVir3/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroVir3.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m56.701s cat fb.droVir3.chainDm6Link.txt # 82924532 bases of 189205863 (43.828%) in intersection ########################################################################### # LASTZ T. castaneum TriCas2 (DONE 2014-08-29 - Hiram) screen -S triCas2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzTriCas2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzTriCas2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. T. castaneum BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - T. castaneum SEQ2_DIR=/hive/data/genomes/triCas2/triCas2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/triCas2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzTriCas2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 43m49.065s cat fb.dm6.chainTriCas2Link.txt # 20091664 bases of 142573024 (14.092%) in intersection # running the swap mkdir /hive/data/genomes/triCas2/bed/blastz.dm6.swap cd /hive/data/genomes/triCas2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzTriCas2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 1m48.420s cat fb.triCas2.chainDm6Link.txt # 22441942 bases of 151549969 (14.808%) in intersection ########################################################################### # LASTZ A. gambiae AnoGam1 (DONE 2014-08-29 - Hiram) screen -S anoGam1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzAnoGam1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzAnoGam1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. sechellia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - A. gambiae SEQ2_DIR=/hive/data/genomes/anoGam1/anoGam1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/anoGam1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzAnoGam1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 85m43.807s cat fb.dm6.chainAnoGam1Link.txt # 20604473 bases of 142573024 (14.452%) in intersection # running the swap mkdir /hive/data/genomes/anoGam1/bed/blastz.dm6.swap cd /hive/data/genomes/anoGam1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzAnoGam1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 2m1.764s cat fb.anoGam1.chainDm6Link.txt # 22154872 bases of 278268413 (7.962%) in intersection ########################################################################### # LASTZ D. pseudoobscura DroPse3 (DONE 2014-08-29 - Hiram) screen -S droPse3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroPse3.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroPse3.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/hive/data/genomes/droPse3/droPse3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droPse3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroPse3.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 54m34.163s cat fb.dm6.chainDroPse3Link.txt # 85310450 bases of 142573024 (59.836%) in intersection # running the swap mkdir /hive/data/genomes/droPse3/bed/blastz.dm6.swap cd /hive/data/genomes/droPse3/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroPse3.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m27.317s cat fb.droPse3.chainDm6Link.txt # 89188686 bases of 149047912 (59.839%) in intersection ########################################################################### # LASTZ D. willistoni DroWil2 (DONE 2014-08-29 - Hiram) screen -S droWil2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroWil2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroWil2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. willistoni BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. willistoni SEQ2_DIR=/hive/data/genomes/droWil2/droWil2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droWil2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroWil2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 12m46.450s cat fb.dm6.chainDroWil2Link.txt # 73328691 bases of 142573024 (51.432%) in intersection # running the swap mkdir /hive/data/genomes/droWil2/bed/blastz.dm6.swap cd /hive/data/genomes/droWil2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroWil2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m21.824s cat fb.droWil2.chainDm6Link.txt # 90409687 bases of 223610028 (40.432%) in intersection ########################################################################### # LASTZ D. yakuba DroYak3 (DONE 2014-08-29 - Hiram) screen -S droYak3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroYak3.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroYak3.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. yakuba BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. yakuba SEQ2_DIR=/hive/data/genomes/droYak3/droYak3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droYak3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroYak3.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 68m39.923s cat fb.dm6.chainDroYak3Link.txt # 114859096 bases of 142573024 (80.562%) in intersection # running the swap mkdir /hive/data/genomes/droYak3/bed/blastz.dm6.swap cd /hive/data/genomes/droYak3/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroYak3.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 8m13.655s cat fb.droYak3.chainDm6Link.txt # 131647225 bases of 162598019 (80.965%) in intersection ########################################################################### # LASTZ A. mellifera ApiMel4 (DONE 2014-08-29 - Hiram) screen -S apiMel4-dm6 mkdir /hive/data/genomes/dm6/bed/lastzApiMel4.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzApiMel4.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. A. mellifera BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - A. mellifera SEQ2_DIR=/hive/data/genomes/apiMel4/apiMel4.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/apiMel4/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzApiMel4.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 42m58.632s cat fb.dm6.chainApiMel4Link.txt # 11667011 bases of 142573024 (8.183%) in intersection # running the swap mkdir /hive/data/genomes/apiMel4/bed/blastz.dm6.swap cd /hive/data/genomes/apiMel4/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzApiMel4.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 1m19.091s cat fb.apiMel4.chainDm6Link.txt # 12280818 bases of 229123650 (5.360%) in intersection ########################################################################### # LASTZ D. mojavensis DroMoj3 (DONE 2014-08-29 - Hiram) screen -S droMoj3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroMoj3.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroMoj3.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. mojavensis BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. mojavensis SEQ2_DIR=/hive/data/genomes/droMoj3/droMoj3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droMoj3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroMoj3.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 76m3.636s cat fb.dm6.chainDroMoj3Link.txt # 71474162 bases of 142573024 (50.132%) in intersection # running the swap mkdir /hive/data/genomes/droMoj3/bed/blastz.dm6.swap cd /hive/data/genomes/droMoj3/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroMoj3.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 5m40.109s cat fb.droMoj3.chainDm6Link.txt # 74578701 bases of 180207831 (41.385%) in intersection ########################################################################### # LASTZ D. pseudoobscura DP4 (Done 2014-08-29 - Hiram) screen -S dp4-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDP4.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDP4.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. pseudoobscura BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. pseudoobscura SEQ2_DIR=/hive/data/genomes/dp4/dp4.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/dp4/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDP4.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 46m17.831s cat fb.dm6.chainDp4Link.txt # 85549372 bases of 142573024 (60.004%) in intersection # running the swap mkdir /hive/data/genomes/dp4/bed/blastz.dm6.swap cd /hive/data/genomes/dp4/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDP4.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m45.065s cat fb.dp4.chainDm6Link.txt # 88406867 bases of 146066673 (60.525%) in intersection ########################################################################### # LASTZ D. albomicans DroAlb1 (DONE 2014-08-29 - Hiram) screen -S droAlb1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroAlb1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroAlb1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. albomicans BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. albomicans SEQ2_DIR=/hive/data/genomes/droAlb1/droAlb1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droAlb1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroAlb1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 86m58.011s cat fb.dm6.chainDroAlb1Link.txt # 63507363 bases of 142573024 (44.544%) in intersection # running the swap mkdir /hive/data/genomes/droAlb1/bed/blastz.dm6.swap cd /hive/data/genomes/droAlb1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroAlb1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m39.524s cat fb.droAlb1.chainDm6Link.txt # 82594337 bases of 216120500 (38.217%) in intersection ########################################################################### # LASTZ D. biarmipes DroBia2 (DONE 2014-08-29 - Hiram) screen -S droBia2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroBia2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroBia2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. biarmipes BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. biarmipes SEQ2_DIR=/hive/data/genomes/droBia2/droBia2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droBia2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroBia2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 67m19.563s cat fb.dm6.chainDroBia2Link.txt # 106989297 bases of 142573024 (75.042%) in intersection # running the swap mkdir /hive/data/genomes/droBia2/bed/blastz.dm6.swap cd /hive/data/genomes/droBia2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroBia2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m41.031s cat fb.droBia2.chainDm6Link.txt # 113609151 bases of 168601808 (67.383%) in intersection ########################################################################### # LASTZ D. bipectinata DroBip2 (DONE 2014-08-29 - Hiram) screen -S droBip2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroBip2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroBip2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. bipectinata BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. bipectinata SEQ2_DIR=/hive/data/genomes/droBip2/droBip2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droBip2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroBip2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 72m58.502s cat fb.dm6.chainDroBip2Link.txt # 93905122 bases of 142573024 (65.865%) in intersection # running the swap mkdir /hive/data/genomes/droBip2/bed/blastz.dm6.swap cd /hive/data/genomes/droBip2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroBip2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m57.737s cat fb.droBip2.chainDm6Link.txt # 100268467 bases of 166411051 (60.253%) in intersection ########################################################################### # LASTZ D. elegans DroEle2 (DONE 2014-08-29 - Hiram) screen -S droEle2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroEle2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroEle2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. elegans BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. elegans SEQ2_DIR=/hive/data/genomes/droEle2/droEle2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droEle2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroEle2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 52m32.697s cat fb.dm6.chainDroEle2Link.txt # 107563805 bases of 142573024 (75.445%) in intersection # running the swap mkdir /hive/data/genomes/droEle2/bed/blastz.dm6.swap cd /hive/data/genomes/droEle2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroEle2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m37.855s cat fb.droEle2.chainDm6Link.txt # 115222391 bases of 170533638 (67.566%) in intersection ########################################################################### # LASTZ D. eugracilis DroEug2 (DONE 2014-08-29 - Hiram) screen -S droEug2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroEug2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroEug2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. eugracilis BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. eugracilis SEQ2_DIR=/hive/data/genomes/droEug2/droEug2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droEug2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroEug2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 81m4.137s cat fb.dm6.chainDroEug2Link.txt # 107307060 bases of 142573024 (75.265%) in intersection # running the swap mkdir /hive/data/genomes/droEug2/bed/blastz.dm6.swap cd /hive/data/genomes/droEug2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroEug2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m6.555s cat fb.droEug2.chainDm6Link.txt # 114576241 bases of 156328338 (73.292%) in intersection ########################################################################### # LASTZ D. ficusphila DroFic2 (DONE 2014-08-29 - Hiram) screen -S droFic2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroFic2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroFic2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. ficusphila BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. ficusphila SEQ2_DIR=/hive/data/genomes/droFic2/droFic2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droFic2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroFic2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 62m0.114s cat fb.dm6.chainDroFic2Link.txt # 106559757 bases of 142573024 (74.740%) in intersection # running the swap mkdir /hive/data/genomes/droFic2/bed/blastz.dm6.swap cd /hive/data/genomes/droFic2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroFic2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 6m54.593s cat fb.droFic2.chainDm6Link.txt # 112386864 bases of 151050603 (74.403%) in intersection ########################################################################### # LASTZ D. kikkawai DroKik2 (DONE 2014-08-29 - Hiram) screen -S droKik2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroKik2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroKik2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. kikkawai BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. kikkawai SEQ2_DIR=/hive/data/genomes/droKik2/droKik2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droKik2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroKik2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 68m49.238s cat fb.dm6.chainDroKik2Link.txt # 100642052 bases of 142573024 (70.590%) in intersection # running the swap mkdir /hive/data/genomes/droKik2/bed/blastz.dm6.swap cd /hive/data/genomes/droKik2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroKik2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m5.278s cat fb.droKik2.chainDm6Link.txt # 105765944 bases of 163450645 (64.708%) in intersection ########################################################################### # LASTZ D. miranda DroMir2 (DONE 2014-08-29 - Hiram) screen -S droMir2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroMir2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroMir2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. miranda BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. miranda SEQ2_DIR=/hive/data/genomes/droMir2/droMir2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droMir2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroMir2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 46m36.289s cat fb.dm6.chainDroMir2Link.txt # 84557634 bases of 142573024 (59.308%) in intersection # running the swap mkdir /hive/data/genomes/droMir2/bed/blastz.dm6.swap cd /hive/data/genomes/droMir2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroMir2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real real 4m55.434s cat fb.droMir2.chainDm6Link.txt # 84386358 bases of 132586288 (63.646%) in intersection ########################################################################### # LASTZ D. suzukii DroSuz1 (DONE 2014-08-29 - Hiram) screen -S droSuz1-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroSuz1.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroSuz1.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. suzukii BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. suzukii SEQ2_DIR=/hive/data/genomes/droSuz1/droSuz1.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droSuz1/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroSuz1.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 69m12.803s cat fb.dm6.chainDroSuz1Link.txt # 106346119 bases of 142573024 (74.591%) in intersection # running the swap mkdir /hive/data/genomes/droSuz1/bed/blastz.dm6.swap cd /hive/data/genomes/droSuz1/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroSuz1.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 9m7.373s cat fb.droSuz1.chainDm6Link.txt # 129874262 bases of 202303443 (64.198%) in intersection ########################################################################### # LASTZ D. rhopaloa DroRho2 (DONE 2014-08-29 - Hiram) screen -S droRho2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroRho2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroRho2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. rhopaloa BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. rhopaloa SEQ2_DIR=/hive/data/genomes/droRho2/droRho2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droRho2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroRho2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 79m31.589s cat fb.dm6.chainDroRho2Link.txt # 106844545 bases of 142573024 (74.940%) in intersection # running the swap mkdir /hive/data/genomes/droRho2/bed/blastz.dm6.swap cd /hive/data/genomes/droRho2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroRho2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 10m15.734s cat fb.droRho2.chainDm6Link.txt # 126832194 bases of 193889799 (65.415%) in intersection ########################################################################### # LASTZ D. takahashii DroTak2 (DONE 2014-08-29 - Hiram) screen -S droTak2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroTak2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzDroTak2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. D. takahashii BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. takahashii SEQ2_DIR=/hive/data/genomes/droTak2/droTak2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droTak2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroTak2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # real 66m20.654s cat fb.dm6.chainDroTak2Link.txt # 107653248 bases of 142573024 (75.507%) in intersection # running the swap mkdir /hive/data/genomes/droTak2/bed/blastz.dm6.swap cd /hive/data/genomes/droTak2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroTak2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 7m15.787s cat fb.droTak2.chainDm6Link.txt # 114989522 bases of 181031891 (63.519%) in intersection ########################################################################### # LASTZ M. domestica MusDom2 (DONE 2014-08-29 - Hiram) screen -S musDom2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzMusDom2.2014-08-29 cd /hive/data/genomes/dm6/bed/lastzMusDom2.2014-08-29 cat << '_EOF_' > DEF # D. melanogaster vs. M. domestica BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - M. domestica SEQ2_DIR=/hive/data/genomes/musDom2/musDom2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/musDom2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzMusDom2.2014-08-29 TMPDIR=/dev/shm '_EOF_' # << happy emacs time doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > do.log 2>&1 # Elapsed time: 81m30s cat fb.dm6.chainMusDom2Link.txt # 27485718 bases of 142573024 (19.278%) in intersection # running the swap mkdir /hive/data/genomes/musDom2/bed/blastz.dm6.swap cd /hive/data/genomes/musDom2/bed/blastz.dm6.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzMusDom2.2014-08-29/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku > swap.log 2>&1 # real 3m35.469s cat fb.musDom2.chainDm6Link.txt # 33622643 bases of 691751600 (4.861%) in intersection ######################################################################### ## 27-Way Multiz (DONE - 2014-09-05 - Hiram) ssh hgwdev mkdir /hive/data/genomes/dm6/bed/multiz27way cd /hive/data/genomes/dm6/bed/multiz27way # worked up a phylo tree with the help of # http://phylot.biobyte.de/ # http://itol.embl.de/ # and editor: http://treegraph.bioinfweb.info/ cp -p ~/kent/src/hg/utils/phyloTrees/dm6.27way.nh ./ # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl dm6.27way.nh (((((((((((((((((dm6:0.1, (droSim1:0.1, droSec1:0.1):0.1):0.1, (droYak3:0.1, droEre2:0.1):0.1):0.1, (droBia2:0.1, droSuz1:0.1):0.1):0.1, (droAna3:0.1, droBip2:0.1):0.1):0.1, droEug2:0.1):0.1, droEle2:0.1):0.1, droKik2:0.1):0.1, droTak2:0.1):0.1, droRho2:0.1):0.1, droFic2:0.1):0.1, ((droPse3:0.1, droPer1:0.1):0.1, droMir2:0.1):0.1):0.1, droWil2:0.1):0.1, (((droVir3:0.1, droMoj3:0.1):0.1, droAlb1:0.1):0.1, droGri2:0.1):0.1):0.1, musDom2:0.1):0.1, anoGam1:0.1):0.1, apiMel4:0.1):0.1, triCas2:0.1); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ dm6.27way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to common name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ > db.to.name.txt # construct db to scientific name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,scientificName from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e 's/pseudoobscura_pseudoobscura/pseudoobscura/;' \ > db.to.sciName.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" dm6.27way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > dm6.27way.commonNames.nh # construct a scientific name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.sciName.txt`" dm6.27way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > dm6.27way.scientificNames.nh # construct a scientific name .nh file: cat dm6.27way.scientificNames.nh (((((((((((((((((Drosophila_melanogaster:0.1, (Drosophila_simulans:0.1, Drosophila_sechellia:0.1):0.1):0.1, (Drosophila_yakuba:0.1, Drosophila_erecta:0.1):0.1):0.1, (Drosophila_biarmipes:0.1, Drosophila_suzukii:0.1):0.1):0.1, (Drosophila_ananassae:0.1, Drosophila_bipectinata:0.1):0.1):0.1, Drosophila_eugracilis:0.1):0.1, Drosophila_elegans:0.1):0.1, Drosophila_kikkawai:0.1):0.1, Drosophila_takahashii:0.1):0.1, Drosophila_rhopaloa:0.1):0.1, Drosophila_ficusphila:0.1):0.1, ((Drosophila_pseudoobscura:0.1, Drosophila_persimilis:0.1):0.1, Drosophila_miranda:0.1):0.1):0.1, Drosophila_willistoni:0.1):0.1, (((Drosophila_virilis:0.1, Drosophila_mojavensis:0.1):0.1, Drosophila_albomicans:0.1):0.1, Drosophila_grimshawi:0.1):0.1):0.1, Musca_domestica:0.1):0.1, Anopheles_gambiae:0.1):0.1, Apis_mellifera:0.1):0.1, Tribolium_castaneum:0.1); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/dm6_27way.png /cluster/bin/phast/all_dists dm6.27way.nh | grep dm6 \ | sed -e "s/dm6.//" | sort -k2n > 27way.distances.txt # Use this output to create the table below head 27way.distances.txt droSec1 0.300000 droSim1 0.300000 droEre2 0.400000 droYak3 0.400000 droBia2 0.500000 droSuz1 0.500000 droAna3 0.600000 droBip2 0.600000 droEug2 0.600000 droEle2 0.700000 cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "<27way.distances.txt") or die "can not read 27way.distances.txt"; my $count = 0; while (my $line = <FH>) { chomp $line; my ($D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/dm6/bed/lastz.$D/fb.dm6." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.dm6/fb.${D}.chainDm6Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on dm6 on other other species # 01 0.3000 (% 82.561) (% 81.572) - D. sechellia droSec1 # 02 0.3000 (% 80.042) (% 89.329) - D. simulans droSim1 # 03 0.4000 (% 80.023) (% 84.386) - D. erecta droEre2 # 04 0.4000 (% 80.562) (% 80.965) - D. yakuba droYak3 # 05 0.5000 (% 75.042) (% 67.383) - D. biarmipes droBia2 # 06 0.5000 (% 74.591) (% 64.198) - D. suzukii droSuz1 # 07 0.6000 (% 67.282) (% 58.760) - D. ananassae droAna3 # 08 0.6000 (% 65.865) (% 60.253) - D. bipectinata droBip2 # 09 0.6000 (% 75.265) (% 73.292) - D. eugracilis droEug2 # 10 0.7000 (% 75.445) (% 67.566) - D. elegans droEle2 # 11 0.8000 (% 70.590) (% 64.708) - D. kikkawai droKik2 # 12 0.9000 (% 75.507) (% 63.519) - D. takahashii droTak2 # 13 1.0000 (% 74.940) (% 65.415) - D. rhopaloa droRho2 # 14 1.1000 (% 74.740) (% 74.403) - D. ficusphila droFic2 # 15 1.3000 (% 59.308) (% 63.646) - D. miranda droMir2 # 16 1.3000 (% 51.432) (% 40.432) - D. willistoni droWil2 # 17 1.4000 (% 59.049) (% 52.949) - D. persimilis droPer1 # 18 1.4000 (% 59.836) (% 59.839) - D. pseudoobscura droPse3 # 19 1.5000 (% 50.979) (% 42.814) - D. grimshawi droGri2 # 20 1.5000 (% 19.278) (% 04.861) - M. domestica musDom2 # 21 1.6000 (% 14.452) (% 07.962) - A. gambiae anoGam1 # 22 1.6000 (% 44.544) (% 38.217) - D. albomicans droAlb1 # 23 1.7000 (% 08.183) (% 05.360) - A. mellifera apiMel4 # 24 1.7000 (% 50.132) (% 41.385) - D. mojavensis droMoj3 # 25 1.7000 (% 51.937) (% 43.828) - D. virilis droVir3 # 26 1.8000 (% 14.092) (% 14.808) - T. castaneum triCas2 # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ dm6.27way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # dm6 droSim1 droSec1 droYak3 droEre2 droBia2 droSuz1 droAna3 droBip2 droEug2 # droEle2 droKik2 droTak2 droRho2 droFic2 droPse3 droPer1 droMir2 droWil2 # droVir3 droMoj3 droAlb1 droGri2 musDom2 anoGam1 apiMel4 triCas2 # bash shell syntax here ... cd /hive/data/genomes/dm6/bed/multiz27way export H=/hive/data/genomes/dm6/bed mkdir mafLinks # want syntenic net for all the Drosophilas for G in droAlb1 droAna3 droBia2 droBip2 droEle2 droEre2 droEug2 droFic2 droGri2 droKik2 droMir2 droMoj3 droPer1 droPse3 droRho2 droSec1 droSim1 droSuz1 droTak2 droVir3 droWil2 droYak3 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/axtChain/dm6.${G}.synNet.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/axtChain/dm6.${G}.synNet.maf.gz ./mafLinks/$G done # and basic maf net for the others: for G in anoGam1 apiMel4 musDom2 triCas2 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/mafNet/dm6.${G}.net.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafNet/dm6.${G}.net.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogrtL mafLinks/*/* # -rw-rw-r-- 1 57989657 Aug 29 11:52 mafLinks/droAna3/dm6.droAna3.synNet.maf.gz # -rw-rw-r-- 1 66653928 Aug 29 13:31 mafLinks/droEre2/dm6.droEre2.synNet.maf.gz # -rw-rw-r-- 1 55414479 Aug 29 13:48 mafLinks/droSim1/dm6.droSim1.synNet.maf.gz # -rw-rw-r-- 1 64261526 Aug 29 13:55 mafLinks/droSec1/dm6.droSec1.synNet.maf.gz # -rw-rw-r-- 1 43326252 Aug 29 13:58 mafLinks/droGri2/dm6.droGri2.synNet.maf.gz # -rw-rw-r-- 1 14470342 Aug 29 14:03 mafLinks/triCas2/dm6.triCas2.net.maf.gz # -rw-rw-r-- 1 53914019 Aug 29 14:06 mafLinks/droPer1/dm6.droPer1.synNet.maf.gz # -rw-rw-r-- 1 8871501 Aug 29 14:09 mafLinks/apiMel4/dm6.apiMel4.net.maf.gz # -rw-rw-r-- 1 44299779 Aug 29 14:12 mafLinks/droVir3/dm6.droVir3.synNet.maf.gz # -rw-rw-r-- 1 56305568 Aug 29 14:19 mafLinks/droPse3/dm6.droPse3.synNet.maf.gz # -rw-rw-r-- 1 58119972 Aug 29 14:31 mafLinks/droMir2/dm6.droMir2.synNet.maf.gz # -rw-rw-r-- 1 68523213 Aug 29 14:31 mafLinks/droEle2/dm6.droEle2.synNet.maf.gz # -rw-rw-r-- 1 68100446 Aug 29 14:35 mafLinks/droYak3/dm6.droYak3.synNet.maf.gz # -rw-rw-r-- 1 64321608 Aug 29 14:43 mafLinks/droBia2/dm6.droBia2.synNet.maf.gz # -rw-rw-r-- 1 44761222 Aug 29 14:43 mafLinks/droWil2/dm6.droWil2.synNet.maf.gz # -rw-rw-r-- 1 68869400 Aug 29 14:44 mafLinks/droFic2/dm6.droFic2.synNet.maf.gz # -rw-rw-r-- 1 42864649 Aug 29 14:45 mafLinks/droMoj3/dm6.droMoj3.synNet.maf.gz # -rw-rw-r-- 1 14470901 Aug 29 14:47 mafLinks/anoGam1/dm6.anoGam1.net.maf.gz # -rw-rw-r-- 1 61398493 Aug 29 14:51 mafLinks/droBip2/dm6.droBip2.synNet.maf.gz # -rw-rw-r-- 1 66734812 Aug 29 14:52 mafLinks/droKik2/dm6.droKik2.synNet.maf.gz # -rw-rw-r-- 1 67398540 Aug 29 14:54 mafLinks/droTak2/dm6.droTak2.synNet.maf.gz # -rw-rw-r-- 1 60781324 Aug 29 14:54 mafLinks/droSuz1/dm6.droSuz1.synNet.maf.gz # -rw-rw-r-- 1 28941891 Aug 29 15:01 mafLinks/droAlb1/dm6.droAlb1.synNet.maf.gz # -rw-rw-r-- 1 69660184 Aug 29 15:02 mafLinks/droEug2/dm6.droEug2.synNet.maf.gz # -rw-rw-r-- 1 61713626 Aug 29 15:06 mafLinks/droRho2/dm6.droRho2.synNet.maf.gz # -rw-rw-r-- 1 19994083 Aug 29 15:08 mafLinks/musDom2/dm6.musDom2.net.maf.gz # split the maf files into a set of hashed named files # this hash named split keeps the same chr/contig names in the same # named hash file. mkdir /hive/data/genomes/dm6/bed/multiz27way/mafSplit cd /hive/data/genomes/dm6/bed/multiz27way/mafSplit for D in `sed -e "s/dm6 //" ../species.list` do echo "${D}" mkdir $D cd $D echo "mafSplit -byTarget -useHashedName=8 /dev/null . ../../mafLinks/${D}/*.maf.gz" mafSplit -byTarget -useHashedName=8 /dev/null . \ ../../mafLinks/${D}/*.maf.gz cd .. done # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | wc -l # 678 find . -type f | grep ".maf$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 161 maf.list mkdir /hive/data/genomes/dm6/bed/multiz27way/splitRun cd /hive/data/genomes/dm6/bed/multiz27way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = dm6 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/dm6/bed/multiz27way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! -s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/dm6/bed/multiz27way/splitRun/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafSplit/maf.list maf.list ssh ku cd /hive/data/genomes/dm6/bed/multiz27way/splitRun/run gensub2 maf.list single template stdout > jobList para -ram=8g create jobList # Completed: 161 of 161 jobs # CPU time in finished jobs: 172027s 2867.12m 47.79h 1.99d 0.005 y # IO & Wait Time: 1065s 17.75m 0.30h 0.01d 0.000 y # Average job time: 1075s 17.92m 0.30h 0.01d # Longest finished job: 42777s 712.95m 11.88h 0.50d # Submission to last job: 42793s 713.22m 11.89h 0.50d # combine into one file (the 1>&2 redirect sends the echo to stderr) cd /hive/data/genomes/dm6/bed/multiz27way head -1 splitRun/maf/053.maf > multiz27way.maf for F in splitRun/maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} done >> multiz27way.maf tail -1 splitRun/maf/053.maf >> multiz27way.maf # -rw-rw-r-- 1 5808772759 Sep 5 15:14 multiz27way.maf # Load into database ssh hgwdev cd /hive/data/genomes/dm6/bed/multiz27way mkdir /gbdb/dm6/multiz27way ln -s `pwd`/multiz27way.maf /gbdb/dm6/multiz27way cd /dev/shm time hgLoadMaf dm6 multiz27way # Loaded 2110160 mafs in 1 files from /gbdb/dm6/multiz27way # real 5m42.300s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 dm6 multiz27waySummary \ /gbdb/dm6/multiz27way/multiz27way.maf # Created 157893 summary blocks from 34026658 components and 2110160 mafs from /gbdb/dm6/multiz27way/multiz27way.maf # real 2m44.856s wc -l multiz27way*.tab # 2110160 multiz27way.tab # 157893 multiz27waySummary.tab # 2268053 total -rw-rw-r-- 1 102915136 Sep 5 15:20 multiz27way.tab -rw-rw-r-- 1 7215723 Sep 5 15:35 multiz27waySummary.tab rm multiz27way*.tab ####################################################################### # GAP ANNOTATE MULTIZ27WAY MAF AND LOAD TABLES (DONE - 2014-11-13 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/dm6/bed/multiz27way/anno/mafSplit cd /hive/data/genomes/dm6/bed/multiz27way/anno/mafSplit time mafSplit -byTarget -useFullSequenceName \ /dev/null . ../../multiz27way.maf ls | wc # 277 277 6007 cd /hive/data/genomes/dm6/bed/multiz27way/anno # skipped making the N.bed files - all dbs should have one already # check for N.bed files everywhere: for DB in `cat ../species.list.txt` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi cd /hive/data/genomes/dm6/bed/multiz27way/anno done for DB in `cat ../species.list.txt` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL cd /hive/data/genomes/dm6/bed/multiz27way/anno mkdir result cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/dm6/dm6.2bit {check out exists+ result/$(file1)} #ENDLOOP '_EOF_' # << happy emacs ls mafSplit/*.maf > maf.list ssh ku cd /hive/data/genomes/dm6/bed/multiz27way/anno gensub2 maf.list single template jobList para create jobList para try ... check ... etc para push # Completed: 276 of 277 jobs # Other count: 1 jobs # CPU time in finished jobs: 285s 4.75m 0.08h 0.00d 0.000 y # IO & Wait Time: 4730s 78.84m 1.31h 0.05d 0.000 y # Average job time: 18s 0.30m 0.01h 0.00d # Longest finished job: 103s 1.72m 0.03h 0.00d # Submission to last job: 195s 3.25m 0.05h 0.00d # one job was completed manually due to ku problems # combine into one file head -q -n 1 result/chr2L.maf > dm6.27way.maf for F in result/*.maf do grep -h -v "^#" ${F} done >> dm6.27way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/chr2L.maf > dm6.27way.maf # tail -q -n 1 result/GL172637.maf >> dm6.27way.maf # How about an official end marker: echo "##eof maf" >> dm6.27way.maf # Load into database rm /gbdb/dm6/multiz27way/multiz27way.maf # remove old symlink ln -s `pwd`/dm6.27way.maf /gbdb/dm6/multiz27way/multiz27way.maf cd /dev/shm time nice -n +19 hgLoadMaf dm6 multiz27way # real 1m43.417s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 dm6 multiz27waySummary \ /gbdb/dm6/multiz27way/multiz27way.maf # Created 157893 summary blocks from 34026658 components and 2115903 mafs # from /gbdb/dm6/multiz27way/multiz27way.maf # real 2m41.434s wc -l multiz27way*.tab # 2115903 multiz27way.tab # 157893 multiz27waySummary.tab # 2273796 total rm multiz27way*.tab ######################################################################### # multiz27way MAF FRAMES (DONE - 2014-11-13 - Hiram) ssh hgwdev mkdir /hive/data/genomes/dm6/bed/multiz27way/frames cd /hive/data/genomes/dm6/bed/multiz27way/frames mkdir genes cat << '_EOF_' > showGenes.sh #!/bin/sh cat ../species.list.txt | while read db do echo -n "${db}: " hgsql $db -N -e "show tables like '%Gene%'" | while read table do if [ $table = "ensGene" -o $table = "refGene" -o \ $table = "mgcGenes" -o $table = "knownGene" -o \ $table = "xenoRefGene" -o $table = "flyBaseGene" ]; then count=`hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " else echo -n "${table}: n/a, " fi done orgName=`hgsql hgcentraltest -N -e "select scientificName from dbDb where name='$db'"` orgId=`hgsql dm6 -N -e "select id from organism where name='$orgName'"` if [ "x${orgId}y" = "xy" ]; then echo "Mrnas: 0" else count=`hgsql dm6 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" fi done '_EOF_' # << happy emacs chmod +x showGenes.sh ./showGenes.sh > survey.txt # Getting numbers of items in gene tracks for each of the assemblies # to pick which is best gene set to use: cat survey.txt dm6: flyBaseGene: 30264, flyBaseGenePep: n/a, refGene: 36039, xenoRefGene: 49470, Mrnas: 907097 droSim1: dmRefGene: n/a, xenoRefGene: 251522, Mrnas: 341 droSec1: xenoRefGene: 222725, Mrnas: 17 droYak3: Mrnas: 815 droEre2: ensGene: 15902, ensemblToGeneName: n/a, Mrnas: 42 droBia2: Mrnas: 0 droSuz1: Mrnas: 1 droAna3: ensGene: 16061, ensemblToGeneName: n/a, Mrnas: 30 droBip2: Mrnas: 3 droEug2: Mrnas: 5 droEle2: Mrnas: 1 droKik2: Mrnas: 2 droTak2: Mrnas: 1 droRho2: Mrnas: 4 droFic2: Mrnas: 1 droPse3: Mrnas: 18 droPer1: xenoRefGene: 220792, Mrnas: 34 droMir2: Mrnas: 2 droWil2: Mrnas: 20 droVir3: Mrnas: 84 droMoj3: Mrnas: 25 droAlb1: Mrnas: 1 droGri2: Mrnas: 10 musDom2: Mrnas: 6447 anoGam1: ensGene: 15802, xenoRefGene: 106775, Mrnas: 64135 apiMel4: Mrnas: 121235 triCas2: Mrnas: 1376 # therefore: # refGene - dm6 # ensGene - droEre2 droAna3 anoGam1 #------------------------------------------------------------------------ # get the genes for all genomes # mRNAs with CDS. single select to get cds+psl, then split that up and # create genePred # using ensGene for: cavPor3 panTro4 canFam3 ailMel1 monDom5 galGal4 melGal1 for qDB in droEre2 droAna3 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} | cut -f 2-11 \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # seems to be an older ensGene here, no 'bin' column: for qDB in anoGam1 do echo hgsql -N -e \"'select * from 'ensGene\;\" ${qDB} hgsql -N -e "select * from ensGene" ${qDB} \ | genePredSingleCover stdin stdout | gzip -2c > genes/$qDB.gp.gz done # Using refGene for: dm6 - want the full extended genePred: for qDB in dm6 do hgsql -N -e "select * from refGene" ${qDB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/$qDB.gp.gz echo "${qDB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/anoGam1.gp.gz: 14267 # genes/dm6.gp.gz: 13823 # genes/droAna3.gp.gz: 15022 # genes/droEre2.gp.gz: 14992 ssh hgwdev cd /hive/data/genomes/dm6/bed/multiz27way/frames time (cat ../multiz27way.maf \ | genePredToMafFrames dm6 stdin stdout \ dm6 genes/dm6.gp.gz droAna3 genes/droAna3.gp.gz \ droEre2 genes/droEre2.gp.gz anoGam1 genes/anoGam1.gp.gz \ | gzip > multiz27way.mafFrames.gz) # real 2m22.546s # verify there are frames on everything: zcat multiz27way.mafFrames.gz | awk '{print $4}' | sort | uniq -c # 164994 anoGam1 # 55050 dm6 # 118998 droAna3 # 77171 droEre2 ssh hgwdev cd /hive/data/genomes/dm6/bed/multiz27way/frames time hgLoadMafFrames dm6 multiz27wayFrames multiz27way.mafFrames.gz # real 0m4.457s ############################################################################## # construct downloads files and pushQ entry (DONE - 2014-10-14 - Hiram) # before starting downloads, the joinerCheck should be clean # after dm6 is added to all.joiner: joinerCheck -tableCoverage -database=dm6 all.joiner joinerCheck -keys -database=dm6 all.joiner cd /hive/data/genomes/dm6 makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev dm6 \ > downloads.log 2>&1 # real 28m50.409s # examine the goldenPath/*/README.txt files to verify the text # ready for pushQ entry (DONE - 2014-10-14 - Hiram) mkdir /hive/data/genomes/dm6/pushQ cd /hive/data/genomes/dm6/pushQ makePushQSql.pl dm6 > dm6.sql 2> stderr.out # real 1m52.294s # some errors are legitimate and OK: head stderr.out # WARNING: hgwdev does not have /gbdb/dm6/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/dm6/wib/quality.wib # WARNING: hgwdev does not have /gbdb/dm6/bbi/qualityBw/quality.bw # WARNING: dm6 does not have seq scp -p dm6.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta './bin/x86_64/hgsql qapushq < /tmp/dm6.sql' ######################################################################### # Phylogenetic tree from 27-way (DONE - 2014-11-13 - Hiram) mkdir /hive/data/genomes/dm6/bed/multiz27way/4d cd /hive/data/genomes/dm6/bed/multiz27way/4d # the split annotated maf's are in: ../anno/result/*.maf # using refGene for dm6, only transcribed genes and nothing # from the randoms and other misc. hgsql dm6 -Ne \ "select * from refGene WHERE cdsEnd > cdsStart;" | cut -f 2- \ | egrep -v "_random|chrUn" > refGene.gp # verify primary chroms only: cut -f2 refGene.gp | sort | uniq -c | sort -rn | less # 7213 chr3R # 6581 chr2L # 6167 chr2R # 5914 chr3L # 5577 chrX # 300 chr4 # 31 chrY genePredSingleCover refGene.gp stdout | sort > refGeneNP.gp wc -l *.gp # 31783 refGene.gp # 13955 refGeneNP.gp ssh ku mkdir /hive/data/genomes/dm6/bed/multiz27way/4d/run cd /hive/data/genomes/dm6/bed/multiz27way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/dm6/bed/multiz27way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/refGeneNR.gp | sed -e "s/\t$c\t/\tdm6.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/dm6/bed/multiz27way/anno/result/*.maf \ | egrep -v "_random|chrUn" | sed -e "s#.*multiz27way/anno/result/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # the tac puts the quick jobs at the front gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... check para -maxJob=5 push para time # Completed: 8 of 8 jobs # CPU time in finished jobs: 502s 8.37m 0.14h 0.01d 0.000 y # IO & Wait Time: 39s 0.64m 0.01h 0.00d 0.000 y # Average job time: 68s 1.13m 0.02h 0.00d # Longest finished job: 118s 1.97m 0.03h 0.00d # Submission to last job: 125s 2.08m 0.03h 0.00d # combine mfa files ssh hgwdev cd /hive/data/genomes/dm6/bed/multiz27way/4d # one file result is useless: # -rw-rw-r-- 1 1 Nov 13 14:13 chrM.mfa # -rw-rw-r-- 1 48002 Nov 13 14:13 chrY.mfa # -rw-rw-r-- 1 207788 Nov 13 14:13 chr4.mfa # -rw-rw-r-- 1 3658928 Nov 13 14:14 chr2R.mfa # -rw-rw-r-- 1 3532514 Nov 13 14:14 chr2L.mfa # -rw-rw-r-- 1 3198092 Nov 13 14:14 chrX.mfa # -rw-rw-r-- 1 3670700 Nov 13 14:14 chr3L.mfa # -rw-rw-r-- 1 4577630 Nov 13 14:14 chr3R.mfa rm -f mfa/chrM.mfa #want comma-less species.list.txt /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list.txt`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa # >dm6 # >droSim1 # >droSec1 # >droYak3 # >droEre2 .... # >droAlb1 # >droGri2 # >musDom2 # >anoGam1 # >apiMel4 # >triCas2 # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../dm6.27way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh # tree-commas.nh: # (((((((((((((((((dm6, (droSim1, droSec1)), (droYak3, droEre2)), # (droBia2, droSuz1)), (droAna3, droBip2)), droEug2), droEle2), droKik2), # droTak2), droRho2), droFic2), ((droPse3, droPer1), droMir2)), droWil2), # (((droVir3, droMoj3), droAlb1), droGri2)), musDom2), anoGam1), apiMel4), # triCas2) # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree-commas.nh 4d.all.mfa # real 22m8.563s mv phyloFit.mod all.mod grep TREE all.mod # TREE: (((((((((((((((((dm6:0.0542029,(droSim1:0.0213103, # droSec1:0.0227973):0.0242621):0.0585014,(droYak3:0.0889658, # droEre2:0.0771526):0.0254381):0.181996,(droBia2:0.126739, # droSuz1:0.0934222):0.0964756):0.0125386,(droAna3:0.132676, # droBip2:0.134892):0.531515):0.00264138,droEug2:0.318569):0.0369765, # droEle2:0.207426):0.0025605,droKik2:0.485329):0.00243957, # droTak2:0.226746):0.0143511,droRho2:0.195624):0.0495906, # droFic2:0.257704):0.290027,((droPse3:0.0138526,droPer1:0.014979):0.00969914, # droMir2:0.0172853):0.398344):0.150107,droWil2:0.550935):0.0418275, # (((droVir3:0.242786,droMoj3:0.321598):0.090819,droAlb1:0.436249):0.0143777, # droGri2:0.318782):0.162991):0.222902,musDom2:0.772795):0.268197, # anoGam1:1.14709):0.387299,apiMel4:0.865834):0.588712,triCas2:0.588712); ############################################################################## # phastCons 27-way (DONE - 2014-11-27 - Hiram) # split 27way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/dm6/bed/multiz27way/cons/SS cd /hive/data/genomes/dm6/bed/multiz27way/cons/SS mkdir result done # XXX beware missed jobs in this procedure. The first time through this, # the chr3R was missed. It was caught up manually 2014-12-04 cat << '_EOF_' > mkSS.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/dm6/bed/multiz27way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/dm6/bed/multiz27way/cons/SS/result/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x mkSS.csh cat << '_EOF_' > template #LOOP mkSS.csh $(root1) {check out line+ done/$(root1)} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//" > maf.list gensub2 maf.list single template jobList para create jobList para try ... check ... etc # Completed: 276 of 277 jobs # CPU time in finished jobs: 896s 14.93m 0.25h 0.01d 0.000 y # IO & Wait Time: 16749s 279.16m 4.65h 0.19d 0.001 y # Average job time: 64s 1.07m 0.02h 0.00d # Longest finished job: 283s 4.72m 0.08h 0.00d # Submission to last job: 1017s 16.95m 0.28h 0.01d # one job was finished manually due to ku problems # Not all files have a result due to low alignment wc -l jobList # 277 jobList find ./result -type f | wc # 158 158 8397 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/dm6/bed/multiz27way/cons/run.cons cd /hive/data/genomes/dm6/bed/multiz27way/cons/run.cons # there are going to be only one phastCons run using # this same script. It triggers off of the current working directory # $cwd:t which is the "grp" in this script. It is: # all cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set cX = $1:r set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/dm6/bed/multiz27way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons set useGrp = "$grp.mod" ln -s $ssSrc/SS/result/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs find ../SS -type f | grep ".ss$" | sed -e 's/.ss$//' > ss.list wc -l ss.list # 158 ss.list # Create parasol batch and run it # run for all species mkdir /hive/data/genomes/dm6/bed/multiz27way/cons/all cd /hive/data/genomes/dm6/bed/multiz27way/cons/all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... para push # Completed: 158 of 158 jobs # CPU time in finished jobs: 1356s 22.59m 0.38h 0.02d 0.000 y # IO & Wait Time: 1391s 23.19m 0.39h 0.02d 0.000 y # Average job time: 17s 0.29m 0.00h 0.00d # Longest finished job: 228s 3.80m 0.06h 0.00d # Submission to last job: 900s 15.00m 0.25h 0.01d cd /hive/data/genomes/dm6/bed/multiz27way/cons/all find ./bed -type f | grep ".bed$" | xargs cat | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \ > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # load into database time hgLoadBed dm6 phastConsElements27way mostConserved.bed # Read 1427898 elements of size 5 from mostConserved.bed # real 0m13.345s # Read 1078721 elements of size 5 from mostConserved.bed # real 0m9.166s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between featureBits dm6 -enrichment refGene:cds phastConsElements27way # refGene:cds 16.046%, phastConsElements27way 51.675%, both 13.124%, # cover 81.79%, enrich 1.58x # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/dm6/bed/multiz27way/cons/all mkdir downloads time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phastCons27way.wigFix.gz) # real 1m12.361s # check integrity of data with wigToBigWig time (zcat downloads/phastCons27way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/dm6/chrom.sizes \ phastCons27way.bw) > bigWig.log 2>&1 & tail bigWig.log # pid=34798: VmPeak: 1054096 kB # real 1m8.421s bigWigInfo phastCons27way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 172,458,601 # primaryIndexSize: 3,804,252 # zoomLevels: 10 # chromCount: 151 # basesCovered: 117,294,910 # mean: 0.627817 # min: 0.000000 # max: 1.000000 # std: 0.446656 # encode those files into wiggle data time (zcat downloads/phastCons27way.wigFix.gz \ | wigEncode stdin phastCons27way.wig phastCons27way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 0m25.328s du -hsc *.wi? # 112M phastCons27way.wib # 11M phastCons27way.wig # 123M total # Load gbdb and database with wiggle. ln -s `pwd`/phastCons27way.wib /gbdb/dm6/multiz27way/phastCons27way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/dm6/multiz27way \ dm6 phastCons27way phastCons27way.wig # real 0m0.854s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh dm6 phastCons27way # db.table min max mean count sumData stdDev viewLimits # dm6.phastCons27way 0 1 0.618592 89437277 5.53252e+07 0.448759 # viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=dm6 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons27way > histogram.data 2>&1 # real 0m5.630s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " D. melanogaster dm6 Histogram phastCons27way track" set xlabel " phastCons27way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ############################################################################# # phyloP for 27-way (DONE - 2014-11-13 - Hiram) # run phyloP with score=LRT ssh ku mkdir /cluster/data/dm6/bed/multiz27way/consPhyloP cd /cluster/data/dm6/bed/multiz27way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.662 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.662 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod # BACKGROUND: 0.169000 0.331000 0.331000 0.169000 # following the pattern from rn5 with grp: "all" (no other grp) cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set file1 = $f:t set out = $2 set cName = $f:t:r set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/dm6/bed/multiz27way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "/hive/data/genomes/dm6/bed/multiz27way/cons/SS/result/$f" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$file1.wigFix $out rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$cName rmdir --ignore-fail-on-non-empty $cons/tmp/$grp rmdir --ignore-fail-on-non-empty $cons/tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 162 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/dm6/bed/multiz27way/consPhyloP/all cd /hive/data/genomes/dm6/bed/multiz27way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # the -ram=8g will allow only one job per node to slow this down since # it would run too fast otherwise. Either run on one of the small # klusters or use the -ram=8g on the para create para -ram=8g create jobList para try ... check ... etc ... para -maxJob=64 push para time > run.time # Completed: 158 of 158 jobs # CPU time in finished jobs: 45635s 760.58m 12.68h 0.53d 0.001 y # IO & Wait Time: 2235s 37.26m 0.62h 0.03d 0.000 y # Average job time: 303s 5.05m 0.08h 0.00d # Longest finished job: 6072s 101.20m 1.69h 0.07d # Submission to last job: 6125s 102.08m 1.70h 0.07d # make downloads mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP27way.wigFix.gz) # real 2m6.787s # check integrity of data with wigToBigWig time (zcat downloads/phyloP27way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/dm6/chrom.sizes \ phyloP27way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=523: VmPeak: 1372692 kB # real 1m13.231s bigWigInfo phyloP27way.bw # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 328,237,985 # primaryIndexSize: 3,804,252 # zoomLevels: 10 # chromCount: 151 # basesCovered: 117,294,910 # mean: 1.206203 # min: -8.216000 # max: 7.073000 # std: 1.669809 # encode those files into wiggle data time (zcat downloads/phyloP27way.wigFix.gz \ | wigEncode stdin phyloP27way.wig phyloP27way.wib) # Converted stdin, upper limit 7.07, lower limit -8.22 # real 0m35.814s du -hsc *.wi? # 112M phyloP27way.wib # 12M phyloP27way.wig # 124M total # Load gbdb and database with wiggle. ln -s `pwd`/phyloP27way.wib /gbdb/dm6/multiz27way/phyloP27way.wib hgLoadWiggle -pathPrefix=/gbdb/dm6/multiz27way dm6 \ phyloP27way phyloP27way.wig # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh dm6 phyloP27way # db.table min max mean count sumData stdDev viewLimits # dm6.phyloP27way -8.216 7.073 1.2062 117294910 1.41482e+08 1.66981 # viewLimits=-7.14284:7.073 calc 8.216\+7.7073 # 8.216+7.7073 = 15.923300 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram -db=dm6 \ -hBinSize=0.0161 -hBinCount=1000 -hMinVal=-8.3 -verbose=2 \ phyloP27way > histogram.data 2>&1 # real 0m7.284s # find out the range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin # Q1 0.000012 # median 0.000255 # Q3 0.001728 # average 0.001134 # min 0.000000 # max 0.009261 # count 882 # total 0.999997 # standard deviation 0.001655 # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " D. melanogaster dm6 Histogram phyloP27way track" set xlabel " phyloP27way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.007] set xrange [-3:6] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # Construct download files for 27-way (DONE - 2014-11-20 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/multiz27way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/multiz27way/maf mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/multiz27way/alignments mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phastCons27way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phastCons27way/dm6.27way.phastCons mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phyloP27way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phyloP27way/dm6.27way.phyloP27way mkdir /hive/data/genomes/dm6/bed/multiz27way/downloads cd /hive/data/genomes/dm6/bed/multiz27way/downloads mkdir multiz27way phastCons27way phyloP27way cd multiz27way mkdir maf alignments cd maf time rsync -a -P ../../../anno/result/ ./ # real xxx time gzip *.maf # real real 8m46.494s time md5sum *.maf.gz > md5sum.txt # real 0m6.206s ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/multiz27way/maf cd .. du -hsc maf ../../anno/result/ # 1.1G maf # 7.4G ../../anno/result/ # 8.5G total grep TREE ../../4d/all.mod | sed -e 's/TREE: //' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > dm6.27way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh dm6.27way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > dm6.27way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh dm6.27way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > dm6.27way.scientificNames.nh md5sum *.gz *.nh > md5sum.txt ln -s `pwd`/*.nh `pwd`/*.txt `pwd`/*.maf.gz \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/multiz27way ##################################################################### cd /hive/data/genomes/dm6/bed/multiz27way/downloads/phastCons27way ln -s ../../cons/all/downloads/phastCons27way.wigFix.gz . ln -s ../../cons/all/all.mod dm6.27way.phastCons.mod ln -s ../../cons/all/phastCons27way.bw dm6.27way.phastCons.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m4.631s ln -s `pwd`/*.gz `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phastCons27way/ # obtain the README.txt from hg38/phastCons7way and update for this # situation, from: # /hive/data/genomes/hg38/bed/multiz7way/downloads/phastCons7way/README.txt ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phastCons27way ##################################################################### cd /hive/data/genomes/dm6/bed/multiz27way/downloads/phyloP27way mkdir dm6.27way.phyloP27way cd dm6.27way.phyloP27way ln -s ../../../consPhyloP/all/downloads/*.gz . time md5sum *.gz > md5sum.txt & # real 3m37.827s ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phyloP27way/dm6.27way.phyloP27way cd .. ln -s ../../consPhyloP/run.phyloP/all.mod dm6.27way.phyloP27way.mod ln -s ../../consPhyloP/all/phyloP27way.bw dm6.27way.phyloP27way.bw time md5sum *.mod *.bw > md5sum.txt & # real 4m44.724s # obtain the README.txt from hg38/phyloP7way and update for this # situation, from: # /hive/data/genomes/hg38/bed/multiz7way/downloads/phyloP7way/README.txt ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/dm6/phyloP27way ############################################################################ # creating upstream mafs (DONE - 2014-11-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/dm6/bed/multiz27way/downloads/multiz27way cd /hive/data/genomes/dm6/bed/multiz27way/downloads/multiz27way # run this bash script cat << '_EOF_' > mkUpstream.sh #!/bin/sh DB=dm6 GENE=refGene NWAY=multiz27way export DB GENE for S in 1000 2000 5000 do echo "making upstream${S}.${GENE}.maf" echo "featureBits ${DB} ${GENE}:upstream:${S} -fa=/dev/null -bed=stdout" featureBits ${DB} ${GENE}:upstreamAll:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | mafFrags ${DB} ${NWAY} stdin stdout \ -orgs=/hive/data/genomes/${DB}/bed/${NWAY}/species.list.txt \ | gzip -c > upstream${S}.${GENE}.maf.gz echo "done upstream${S}.${GENE}.maf.gz" done '_EOF_' # << happy emacs chmod +x mkUpstream.sh time ./mkUpstream.sh # real 69m25.852s # FIXUP/VERIFY the genbank.conf file to indicate this gene choice for the # upstream automatic generation ~/kent/src/hg/makeDb/genbank/etc/genbank.conf # dm6.upstreamGeneTbl = refGene # dm6.upstreamMaf = multiz27way /hive/data/genomes/dm6/bed/multiz27way/species.list.txt git commit -m "dm6 GeneTbl upstreamMaf set to 27-way refs #4049" genbank.conf git push make etc-update ############################################################################# # hgPal downloads (DONE - 2014-11-20 - Hiram) # FASTA from 27-way for refGene ssh hgwdev screen -S dm6HgPal mkdir /hive/data/genomes/dm6/bed/multiz27way/pal cd /hive/data/genomes/dm6/bed/multiz27way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz27way export gp=refGene export db=dm6 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time ./$gp.jobs > $gp.jobs.log 2>&1 # real 20m2.909s time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m32.720s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 5m53.612s # we're only distributing exons at this time export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # make refGenePfam table (Done - 2016-09-19 - Brian Raney) # Pfam in Refseq Genes export dir=/cluster/data/dm6/bed/pfam mkdir -p $dir cd $dir rm -rf splitProt mkdir splitProt refSeqGet -protSeqs=refSeq.faa dm6 #20090154 bases (15768554 N's 4321600 real 4321600 upper 0 lower) in 30430 sequences in 1 files refSeqGet -rnaSeqs=refSeq.fa dm6 hgsql dm6 -Ne "select * from refSeqAli" | cut -f 2- > refGene.psl faSplit sequence refSeq.faa 10000 splitProt/ mkdir -p result ls -1 splitProt > prot.list # /hive/data/outside/pfam/hmmpfam -E 0.1 /hive/data/outside/pfam/current/Pfam_fs \ cat << '_EOF_' > doPfam #!/bin/csh -ef /hive/data/outside/pfam/Pfam29.0/PfamScan/hmmer-3.1b2-linux-intel-x86_64/binaries/hmmsearch --domtblout /scratch/tmp/pfam.$2.pf --noali -o /dev/null -E 0.1 /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm splitProt/$1 mv /scratch/tmp/pfam.$2.pf $3 _EOF_ # << happy emacs chmod +x doPfam cat << '_EOF_' > template #LOOP doPfam $(path1) $(root1) {check out line+ result/$(root1).pf} #ENDLOOP _EOF_ gensub2 prot.list single template jobList ssh ku "cd $dir; para make jobList" ssh ku "cd $dir; para time > run.time" cat run.time # Completed: 9690 of 9690 jobs # CPU time in finished jobs: 2667749s 44462.49m 741.04h 30.88d 0.085 # y # IO & Wait Time: 148194s 2469.89m 41.16h 1.72d 0.005 # y # Average job time: 291s 4.84m 0.08h 0.00d # Longest finished job: 511s 8.52m 0.14h 0.01d # Submission to last job: 8704s 145.07m 2.42h 0.10d # # Make up pfamDesc.tab by converting pfam to a ra file first cat << '_EOF_' > makePfamRa.awk /^NAME/ {print} /^ACC/ {print} /^DESC/ {print} /^TC/ {print $1,$3; printf("\n");} _EOF_ awk -f makePfamRa.awk /hive/data/outside/pfam/Pfam29.0/Pfam-A.hmm > pfamDesc.ra raToTab -cols=ACC,NAME,DESC,TC pfamDesc.ra stdout | awk -F '\t' '{printf("%s\t%s\t%s\t%g\n", $1, $2, $3, $4);}' | sort > pfamDesc.tab # Convert output to tab-separated file. cd $dir catDir result | sed '/^#/d' > allResults.tab awk 'BEGIN {OFS="\t"} { print $5,$1,$18-1,$19,$4,$14}' allResults.tab | sort > allUcscPfam.tab join -t $'\t' -j 1 allUcscPfam.tab pfamDesc.tab | tawk '{if ($6 > $9) print $2, $3, $4, $5, $6, $1}' > ucscPfam.tab cd $dir cat refSeq.faa | sed 's/\.[0-9]*//' > refSeq.noVers.faa cat refSeq.fa | sed 's/\.[0-9]*//' > refSeq.noVers.fa sort ucscPfam.tab | sed 's/\.[0-9]*//' > sortPfam.tab faCount refSeq.noVers.faa | awk '{print $1, $2}' > protein.sizes hgsql dm6 -Ne "select protAcc, mrnaAcc from refLink, refGene where mrnaAcc=refGene.name" | awk '{if (NF == 2) print}' |sort > sortMap.txt while read prot mrna do echo ./doBlat $prot $mrna done > blat.jobs < sortMap.txt ssh ku "cd $dir; para make blat.jobs" cat results/* > proteinToMrna.psl tawk '{print $10, $16, $17}' proteinToMrna.psl | sort > sortCdsOut.tab awk '{print $10, $11}' refGene.psl > gene.sizes join sortCdsOut.tab sortPfam.tab | join /dev/stdin sortMap.txt | awk '{print $9, $2 + 3 * $4, $2 + 3 * $5, $6}' | bedToPsl gene.sizes stdin domainToGene.psl pslMap domainToGene.psl refGene.psl stdout | pslToBed stdin stdout | bedOrBlocks -useName stdin domainToGenome.bed hgLoadBed dm6 refGenePfam domainToGenome.bed ############################################################################## # Crispr track. See ../crisprTrack/README.txt (2016-09-15 max) # Command: doCrispr.sh dm6 ensGene ############################################################################## # LASTZ A. gambiae AnoGam3 (DONE 2017-12-20 - Hiram) screen -S anoGam3-dm6 mkdir /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20 cd /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20 printf '# D. melanogaster vs. A. gambiae BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - A. gambiae SEQ2_DIR=/hive/data/genomes/anoGam3/anoGam3.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/anoGam3/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 8m50.927s cat fb.dm6.chainAnoGam3Link.txt # 19641460 bases of 142573024 (13.776%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev \ -buildDir=`pwd` dm6 anoGam3) > rBest.log 2>&1 & # real 44m34.312s cat fb.dm6.chainRBestAnoGam3Link.txt # 15964959 bases of 142573024 (11.198%) in intersection # running the swap mkdir /hive/data/genomes/anoGam3/bed/blastz.dm6.swap cd /hive/data/genomes/anoGam3/bed/blastz.dm6.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzAnoGam3.2017-12-20/DEF \ -swap -syntenicNet -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > swap.log 2>&1 # real 2m0.901s cat fb.anoGam3.chainDm6Link.txt # 20168390 bases of 264424304 (7.627%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev \ -buildDir=`pwd` anoGam3 dm6) > rBest.log 2>&1 & # real 46m7.441s cat fb.anoGam3.chainRBestDm6Link.txt # 16040573 bases of 264424304 (6.066%) in intersection ########################################################################### # LASTZ D. simulans DroSim2 (DONE 2018-09-12 - Hiram) screen -S droSim2-dm6 mkdir /hive/data/genomes/dm6/bed/lastzDroSim2.2018-09-12 cd /hive/data/genomes/dm6/bed/lastzDroSim2.2018-09-12 cat << '_EOF_' > DEF printf '# D. melanogaster vs. D. simulans BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz BLASTZ_H=2000 BLASTZ_Y=3400 BLASTZ_L=4000 BLASTZ_K=2200 BLASTZ_Q=/cluster/data/blastz/HoxD55.q # TARGET - D. melanogaster SEQ1_DIR=/hive/data/genomes/dm6/dm6.2bit SEQ1_CHUNK=5000000 SEQ1_LAP=10000 SEQ1_LEN=/hive/data/genomes/dm6/chrom.sizes # QUERY - D. simulans SEQ2_DIR=/hive/data/genomes/droSim2/droSim2.2bit SEQ2_CHUNK=5000000 SEQ2_LAP=10000 SEQ2_LEN=/hive/data/genomes/droSim2/chrom.sizes BASE=/hive/data/genomes/dm6/bed/lastzDroSim2.2018-09-12 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=loose \ -syntenicNet -fileServer=hgwdev \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 39m35.348s cat fb.dm6.chainDroSim2Link.txt # 117723253 bases of 142573024 (82.570%) in intersection cat fb.dm6.chainSynDroSim2Link.txt # 108599452 bases of 142573024 (76.171%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` dm6 droSim2) > rbest.log 2>&1 & # real 6m47.792s cat fb.dm6.chainRBest.DroSim2.txt # 110510644 bases of 142573024 (77.512%) in intersection # running the swap mkdir /hive/data/genomes/droSim2/bed/blastz.dm6.swap cd /hive/data/genomes/droSim2/bed/blastz.dm6.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/dm6/bed/lastzDroSim2.2018-09-12/DEF \ -swap -chainMinScore=3000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 4m32.910s cat fb.droSim2.chainDm6Link.txt # 115776543 bases of 124963774 (92.648%) in intersection cat fb.droSim2.chainSynDm6Link.txt # 108004181 bases of 124963774 (86.428%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` droSim2 dm6) > rbest.log 2>&1 & # real 7m22.120s cat fb.droSim2.chainRBest.Dm6.txt # 110784245 bases of 124963774 (88.653%) in intersection ########################################################################### + +# ReMap refs #28960 (2022-04-13 Gerardo) +cd /hive/data/genomes/dm6/bed +mkdir reMap +cd reMap +wget https://remap.univ-amu.fr/storage/public/hubReMap2022/dm6/bigBed/remap2022_all_macs2_dm6_v1_0.bb +mv remap2022_all_macs2_dm6_v1_0.bb reMap2022.bb +wget https://remap.univ-amu.fr/storage/public/hubReMap2022/dm6/bigBed/dm6.bw +mv dm6.bw reMapDensity2022.bw +cd /gbdb/dm6 +mkdir reMap +cd reMap +ln -s /hive/data/genomes/dm6/bed/reMap/reMap2022.bb +ln -s /hive/data/genomes/dm6/bed/reMap/reMapDensity2022.bw +cd ~/kent/src/hg/makeDb/trackDb/drosophila/dm6/ +wget https://remap.univ-amu.fr/storage/public/hubReMap2022/remapHub2022.html +cp remapHub2022.html reMap.html +cd ~/kent/src/hg/makeDb/trackDb +curl https://remap.univ-amu.fr/storage/public/hubReMap2022/dm6/trackDb.txt > drosophila/dm6/reMap.ra +vi drosophila/dm6/reMap.ra +vi drosophila/dm6/trackDb.ra +vi drosophila/dm6/reMap.html