GRC information
# http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/mouse/
# Mitochondrial sequence:
# http://www.ncbi.nlm.nih.gov/bioproject/13767
# C57BL/6J sequence:
# http://www.ncbi.nlm.nih.gov/bioproject/51977
# Finishing project:
# http://www.ncbi.nlm.nih.gov/bioproject/20689
# Assembly ID: 327618
# http://www.ncbi.nlm.nih.gov/genome/assembly/327618/
# Celera Assembly
# http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAHY00

#############################################################################
# fetch sequence from genbank (DONE - 2012-01-30 - Hiram)
    mkdir -p /hive/data/genomes/mm10/genbank
    cd /hive/data/genomes/mm10/genbank

    rsync -a -P \
rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/ ./

    # measure sequence to be used here
    faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \
        Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \
        Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \
        non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz
    # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0 lower)
    #    in 66 sequences in 29 files
    # Total size: mean 41376845.1 sd 63617337.3 min 1976
    #    (gi|371559559|gb|JH584295.1|) max 195471971
    #    (gi|371561115|gb|CM000994.2|) median 184189

#############################################################################
# fixup names for UCSC standards (DONE - 2012-02-06 - Hiram)
    mkdir /hive/data/genomes/mm10/ucsc
    cd /hive/data/genomes/mm10/ucsc

    ######################## Assembled Chromosomes
    cat << '_EOF_' > toUcsc.pl
#!/bin/env perl
# toUcsc.pl - rename the GenBank assembled chromosomes to UCSC names.
# Reads the chr2acc table (chromosome name, accession), then for each
# accession copies chrN.agp.gz and chrN.fa.gz from ../genbank into
# ./chrN.agp and ./chrN.fa with the accession replaced by chrN.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/assembled_chromosomes";

my %accToChr;

open (my $mapFh, '<', "$srcDir/chr2acc")
        or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc: $!";
while (my $line = <$mapFh>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
}
close ($mapFh);

foreach my $acc (keys %accToChr) {
    my $chrN = $accToChr{$acc};
    print "$acc $accToChr{$acc}\n";

    # AGP: comment lines pass through, the object name becomes chrN.
    # List-form pipe open: the file name never passes through a shell.
    open (my $agpIn, '-|', 'zcat', "$srcDir/AGP/chr${chrN}.agp.gz")
        or die "can not read chr${chrN}.agp.gz: $!";
    open (my $agpOut, '>', "chr${chrN}.agp")
        or die "can not write to chr${chrN}.agp: $!";
    while (my $line = <$agpIn>) {
        if ($line !~ m/^#/) {
            # \Q..\E: the '.' in the accession must match literally
            $line =~ s/^\Q$acc\E/chr${chrN}/;
        }
        print $agpOut $line;
    }
    # close on a read pipe is where a failed zcat is reported
    close ($agpIn) or die "zcat failed on chr${chrN}.agp.gz";
    close ($agpOut) or die "can not write to chr${chrN}.agp: $!";

    # FASTA: every header line is replaced by >chrN
    open (my $faIn, '-|', 'zcat', "$srcDir/FASTA/chr${chrN}.fa.gz")
        or die "can not read chr${chrN}.fa.gz: $!";
    open (my $faOut, '>', "chr${chrN}.fa")
        or die "can not write to chr${chrN}.fa: $!";
    while (my $line = <$faIn>) {
        if ($line =~ m/^>/) {
            print $faOut ">chr${chrN}\n";
        } else {
            print $faOut $line;
        }
    }
    close ($faIn) or die "zcat failed on chr${chrN}.fa.gz";
    close ($faOut) or die "can not write to chr${chrN}.fa: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x toUcsc.pl
    time ./toUcsc.pl
    # real 0m53.256s
    faSize chr*.fa
    # 2725521370 bases (77999939 N's 2647521431 real 2647521431 upper 0
    #    lower) in 21 sequences in 21 files
    # Total size: mean 129786731.9 sd 33408399.1 min 61431566 (chr19)
    #    max 195471971 (chr1) median 124902244

    ######################## Unplaced scaffolds
    cat << '_EOF_' > unplaced.pl
#!/bin/env perl
# unplaced.pl - give the unplaced scaffolds UCSC names chrUn_<accession>,
# dropping the ".1" version suffix, in both the AGP and the FASTA file.
# Writes ./unplaced.agp and ./unplaced.fa.

use strict;
use warnings;

my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

open (my $agpIn, '-|', 'zcat', $agpFile) or die "can not read $agpFile: $!";
open (my $agpOut, '>', "unplaced.agp") or die "can not write to unplaced.agp: $!";
while (my $line = <$agpIn>) {
    if ($line =~ m/^#/) {
        print $agpOut $line;
    } else {
        # strip the version only from the object name in column 1; an
        # unanchored s/\.1// could instead hit an id in a later column
        $line =~ s/^([^\t]+)\.1\t/$1\t/;
        print $agpOut "chrUn_", $line;
    }
}
close ($agpIn) or die "zcat failed on $agpFile";
close ($agpOut) or die "can not write to unplaced.agp: $!";

open (my $faIn, '-|', 'zcat', $fastaFile) or die "can not read $fastaFile: $!";
open (my $faOut, '>', "unplaced.fa") or die "can not write to unplaced.fa: $!";
while (my $line = <$faIn>) {
    if ($line =~ m/^>/) {
        chomp $line;
        # header looks like >gi|NNN|gb|ACC.1|...; keep only ACC
        $line =~ s/.*gb\|//;
        $line =~ s/\.1\|.*//;
        print $faOut ">chrUn_$line\n";
    } else {
        print $faOut $line;
    }
}
close ($faIn) or die "zcat failed on $fastaFile";
close ($faOut) or die "can not write to unplaced.fa: $!";
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    # real 0m0.119s
    # make sure none of the names got to be over 31 characters
long:
    grep -v "^#" unplaced.agp | cut -f1 | sort | uniq -c | sort -rn
    # not much in that sequence:
    faSize unplaced.fa
    # 803895 bases (62411 N's 741484 real 741484 upper 0 lower)
    #    in 22 sequences in 1 files
    # Total size: mean 36540.7 sd 21518.0 min 20208 (chrUn_GL456368)
    #    max 114452 (chrUn_JH584304) median 28772

    ########## chrM
    zcat ../genbank/non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz \
        | sed -e "s/^>.*/>chrM/" > chrM.fa
    zcat ../genbank/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \
        | sed -e "s/^AY172335.1/chrM/" > chrM.agp

    ######################## Unlocalized scaffolds
    cat << '_EOF_' > unlocalized.pl
#!/bin/env perl
# unlocalized.pl - rename unlocalized scaffolds to chrN_<accession>_random.
# Reads unlocalized.chr2scaf (chromosome name, accession), then for each
# chromosome rewrites its AGP and FASTA files into ./chrN_random.agp and
# ./chrN_random.fa.  Dies on any accession that is not version .1 or that
# is not mapped to the chromosome whose file it was found in.

use strict;
use warnings;

my $srcDir = "../genbank/Primary_Assembly/unlocalized_scaffolds";

my %accToChr;
my %chrNames;

# Build the UCSC name chrN_ACC_random for accession $acc found in the
# files of chromosome $chrN.
sub ucscRandomName {
    my ($acc, $chrN) = @_;
    my $accNo1 = $acc;
    # the dot must be escaped: s/.1$// would strip any character + "1"
    $accNo1 =~ s/\.1$//;
    die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
    die "ERROR: chrN $chrN not correct for $acc"
        if (!defined($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
    return "chr${chrN}_${accNo1}_random";
}

open (my $mapFh, '<', "$srcDir/unlocalized.chr2scaf")
        or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf: $!";
while (my $line = <$mapFh>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close ($mapFh);

foreach my $chrN (keys %chrNames) {
    my $agpFile = "$srcDir/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile = "$srcDir/FASTA/chr$chrN.unlocalized.scaf.fa.gz";

    # AGP: comment lines pass through, column 1 is renamed
    open (my $agpIn, '-|', 'zcat', $agpFile) or die "can not read $agpFile: $!";
    open (my $agpOut, '>', "chr${chrN}_random.agp")
        or die "can not write to chr${chrN}_random.agp: $!";
    while (my $line = <$agpIn>) {
        if ($line =~ m/^#/) {
            print $agpOut $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $ucscName = ucscRandomName($a[0], $chrN);
            print $agpOut join("\t", $ucscName, @a[1 .. $#a]), "\n";
        }
    }
    # close on a read pipe is where a failed zcat is reported
    close ($agpIn) or die "zcat failed on $agpFile";
    close ($agpOut) or die "can not write to chr${chrN}_random.agp: $!";

    printf "chr%s\n", $chrN;

    # FASTA: header >gi|NNN|gb|ACC.1|... becomes >chrN_ACC_random
    open (my $faIn, '-|', 'zcat', $fastaFile) or die "can not read $fastaFile: $!";
    open (my $faOut, '>', "chr${chrN}_random.fa")
        or die "can not write to chr${chrN}_random.fa: $!";
    while (my $line = <$faIn>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;
            $acc =~ s/\|.*//;
            my $ucscName = ucscRandomName($acc, $chrN);
            print $faOut ">$ucscName\n";
        } else {
            print $faOut $line;
        }
    }
    close ($faIn) or die "zcat failed on $fastaFile";
    close ($faOut) or die "can not write to chr${chrN}_random.fa: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl
    time ./unlocalized.pl
    # real 0m0.430s
    faSize chr*_random.fa
    # 4530210 bases (25924 N's 4504286 real 4504286 upper 0 lower)
    #    in 22 sequences in 6 files
    # Total size: mean 205918.6 sd 184688.0 min 1976 (chr4_JH584295_random)
    #    max 953012 (chr5_JH584299_random) median 191905
    # verify none of the names are longer than 31 characters:
    grep -h -v "^#" chr*_random.agp | cut -f1 | sort | uniq -c | sort -nr

    # compress all these fasta and agp files:
    gzip *.fa *.agp

    # verify all the sequence is still here after all this rigamarole:
    time faSize *.fa.gz
    # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0
    #    lower) in 66 sequences in 29 files
    # Total size: mean 41376845.1 sd 63617337.3 min 1976
    #    (chr4_JH584295_random) max 195471971 (chr1) median 184189

#############################################################################
# Initial browser build (DONE - 2012-01-06 - Hiram)
    cd /hive/data/genomes/mm10
    cat << '_EOF_' > mm10.config.ra
# Config parameters for makeGenomeDb.pl:
db mm10
clade mammal
genomeCladePriority 40
scientificName Mus musculus
commonName Mouse
assemblyDate Dec.
2011 assemblyLabel Genome Reference Consortium Mouse Build 38 (GCA_000001635.2) assemblyShortLabel GRCm38 orderKey 1209 mitoAcc none fastaFiles /hive/data/genomes/mm10/ucsc/*.fa.gz agpFiles /hive/data/genomes/mm10/ucsc/*.agp.gz dbDbSpeciesDir mouse taxId 10090 ncbiAssemblyId 327618 ncbiAssemblyName GRCm38 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp mm10.config.ra > agp.log 2>&1 # real 3m4.568s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db mm10.config.ra > db.log 2>&1 # real 20m51.374s # verify the end of db.log indicates successful ############################################################################# # running repeat masker (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/repeatMasker cd /hive/data/genomes/mm10/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek mm10 > do.log 2>&1 & # real 609m48.767s cat faSize.rmsk.txt # 2730871774 bases (78088274 N's 2652783500 real 1456094545 upper # 1196688955 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %43.82 masked total, %45.11 masked real grep -i versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker time featureBits -countGaps mm10 rmsk # 1196694219 bases of 2730871774 (43.821%) in intersection # real 0m30.460s # why is it different than the faSize above ? 
# because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/simpleRepeat cd /hive/data/genomes/mm10/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ mm10 > do.log 2>&1 & # real 16m35.603s # batch failed, one job failed: # ./TrfRun.csh /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # which is the chrM sequence - it has no simple repeats # create an empty output file result: touch /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # go to encodek and create the run.time file to signal this step is done cd /hive/data/genomes/mm10/bed/simpleRepeat/run.cluster para time > run.time # Completed: 70 of 71 jobs # Crashed: 1 jobs # CPU time in finished jobs: 13103s 218.38m 3.64h 0.15d 0.000 y # IO & Wait Time: 163s 2.72m 0.05h 0.00d 0.000 y # Average job time: 190s 3.16m 0.05h 0.00d # Longest finished job: 392s 6.53m 0.11h 0.00d # Submission to last job: 894s 14.90m 0.25h 0.01d # continue procedure: time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ -continue=filter mm10 > filter.log 2>&1 & # real 1m20.021s cat fb.simpleRepeat # 92161833 bases of 2652783500 (3.474%) in intersection # when RepeatMasker is done, add this mask to the sequence: cd /hive/data/genomes/mm10 twoBitMask mm10.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa mm10.2bit stdout | faSize stdin > faSize.mm10.2bit.txt cat faSize.mm10.2bit.txt # 2730871774 bases (78088274 N's 2652783500 real 1454267808 upper # 1198515692 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 
(chr1) median 184189
    # %43.89 masked total, %45.18 masked real

    # set SymLink in gbdb to this masked sequence
    rm /gbdb/mm10/mm10.2bit
    ln -s `pwd`/mm10.2bit /gbdb/mm10/mm10.2bit

#########################################################################
# Verify all gaps are marked, add any N's not in gap as type 'other'
# (DONE - 2012-02-06 - Hiram)
    mkdir /hive/data/genomes/mm10/bed/gap
    cd /hive/data/genomes/mm10/bed/gap
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
        -strand=+ ../../mm10.unmasked.2bit > findMotif.txt 2>&1
    # real 1m0.372s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    time featureBits -countGaps mm10 -not gap -bed=notGap.bed
    # 2658879040 bases of 2730871774 (97.364%) in intersection
    # real 0m13.067s
    time featureBits -countGaps mm10 allGaps.bed notGap.bed -bed=new.gaps.bed
    # 6095540 bases of 2730871774 (0.223%) in intersection
    # real 0m15.177s

    # what is the highest index in the existing gap table:
    hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1
    # 54
    cat << '_EOF_' > mkGap.pl
#!/bin/env perl
# mkGap.pl - turn each interval in new.gaps.bed into a gap-table row of
# type 'other' (bridged 'yes'), written to stdout.  Row indexes continue
# from the highest ix currently in the mm10 gap table.

use strict;
use warnings;

my $ix=`hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1`;
chomp $ix;
# a failed query yields an empty or non-numeric string; do not number from it
die "can not determine highest ix in mm10 gap table" if ($ix !~ m/^\d+$/);

open (my $bedFh, '<', "new.gaps.bed") or die "can not read new.gaps.bed: $!";
while (my $line = <$bedFh>) {
    next if ($line !~ m/\S/);   # a blank line is not a gap
    my ($chrom, $chromStart, $chromEnd) = split('\s+', $line);
    ++$ix;
    # columns: chrom, start, end, ix, N, size, type, bridge
    printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom,
        $chromStart, $chromEnd, $ix, $chromEnd-$chromStart;
}
close ($bedFh);
'_EOF_'
    # << happy emacs
    chmod +x ./mkGap.pl
    ./mkGap.pl > other.bed
    wc -l other.bed
    # 384
    featureBits -countGaps mm10 other.bed
    # 6095540 bases of 2730871774 (0.223%) in intersection
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \
        -noLoad mm10 otherGap other.bed
    # verify no overlap with gap table:
    time featureBits -countGaps mm10 gap other.bed
    # 0 bases of 2730871774 (0.000%) in intersection
    # real 0m1.281s
    # verify no errors before adding to the table:
    time gapToLift -minGap=1 mm10 nonBridged.before.lift \
        -bedFile=nonBridged.before.bed >
before.gapToLift.txt 2>&1 & # real 0m7.205s # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 1633 Jan 6 15:20 before.gapToLift.txt # it indicates that there are telomere's adjacent to centromere's # and heterochromatin # starting with this many: hgsql -e "select count(*) from gap;" mm10 # 302 hgsql mm10 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" mm10 # 686 # == 302 + 384 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 mm10 nonBridged.lift -bedFile=nonBridged.bed #same set of warnings as before, telomere's centromere's and heterochromatin # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/windowMasker cd /hive/data/genomes/mm10/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev mm10 > do.log 2>&1 & # real 167m12.012s # Masking statistics twoBitToFa mm10.wmsk.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1686407708 upper # 966375792 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.39 masked total, %36.43 masked real twoBitToFa mm10.wmsk.sdust.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1670424648 upper # 982358852 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.97 masked total, %37.03 masked real hgLoadBed mm10 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 12655947 elements of size 3 featureBits -countGaps mm10 windowmaskerSdust 
# 1060447084 bases of 2730871774 (38.832%) in intersection

    # eliminate the gaps from the masking
    featureBits mm10 -not gap -bed=notGap.bed
    # 2652783500 bases of 2652783500 (100.000%) in intersection
    time nice -n +19 featureBits mm10 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    # 982358852 bases of 2652783500 (37.031%) in intersection
    # real 1m42.449s
    # reload track to get it clean
    hgLoadBed mm10 windowmaskerSdust cleanWMask.bed.gz
    # Loaded 12655987 elements of size 4
    time featureBits -countGaps mm10 windowmaskerSdust
    # 982358852 bases of 2730871774 (35.972%) in intersection
    # real 1m13.889s

    # do *not* need to mask with this clean result since RepeatMasker
    # does a very good job here. Using RM masking instead.
    zcat cleanWMask.bed.gz \
        | twoBitMask ../../mm10.unmasked.2bit stdin \
            -type=.bed mm10.cleanWMSdust.2bit
    twoBitToFa mm10.cleanWMSdust.2bit stdout | faSize stdin \
        > mm10.cleanWMSdust.faSize.txt
    cat mm10.cleanWMSdust.faSize.txt

    # how much does this window masker and repeat masker overlap:
    time featureBits -countGaps mm10 rmsk windowmaskerSdust
    # 753614881 bases of 2730871774 (27.596%) in intersection
    # real 1m42.691s
    # RM by itself:
    time featureBits -countGaps mm10 rmsk
    # 1196694219 bases of 2730871774 (43.821%) in intersection
    # real 0m30.460s

#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2012-02-07 - Hiram)
    ssh encodek
    mkdir /hive/data/genomes/mm10/bed/linSpecRep
    cd /hive/data/genomes/mm10/bed/linSpecRep
    # split the RM output by chromosome name into separate files
    mkdir rmsk dateRepeats
    head -3 ../repeatMasker/mm10.sorted.fa.out > rmsk.header.txt
    headRest 3 ../repeatMasker/mm10.sorted.fa.out \
        | splitFileByColumn -ending=.out -col=5 -head=rmsk.header.txt stdin rmsk

    ls -1S rmsk/* > rmOut.list

    cat << '_EOF_' > mkLSR
#!/bin/csh -fe
# mkLSR <rmsk/chrN.out> - run DateRepeats on one RepeatMasker .out file
# and move its result into dateRepeats/.

# Remove any stale result first.  The name must be the one the mv below
# leaves in dateRepeats/: the basename of $1 (${1:t}) plus the
# canis-lupus-familiaris suffix.
rm -f dateRepeats/${1:t}_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus
/scratch/data/genomes/RepeatMasker/DateRepeats \
    $1 -query mouse -comp human -comp rat -comp dog -comp cow
mv $1_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus dateRepeats
'_EOF_'
    # << happy emacs
    chmod +x mkLSR

    cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ dateRepeats/$(file1)_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 rmOut.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
    para time
# Completed: 66 of 66 jobs
# CPU time in finished jobs: 1743s 29.05m 0.48h 0.02d 0.000 y
# IO & Wait Time: 190s 3.16m 0.05h 0.00d 0.000 y
# Average job time: 29s 0.49m 0.01h 0.00d
# Longest finished job: 65s 1.08m 0.02h 0.00d
# Submission to last job: 160s 2.67m 0.04h 0.00d

    mkdir notInHuman notInRat notInDog notInCow
    for F in dateRepeats/chr*.out_homo-sapiens*
    do
        B=`basename ${F}`
        B=${B/.out*/}
        echo $B
        /cluster/bin/scripts/extractRepeats 1 ${F} > \
            notInHuman/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 2 ${F} > \
            notInRat/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 3 ${F} > \
            notInDog/${B}.out.spec
        /cluster/bin/scripts/extractRepeats 4 ${F} > \
            notInCow/${B}.out.spec
    done

    # notInDog, and notInCow ended up being identical.
    # The notInRat and notInHuman are different
    # To check identical
    find .
-name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 > check.same # this produces a count of 2 for the sums for Cow and Dog, all the same egrep "Cow|Dog" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # this does not produce a count of 2 for the sums for Cow and Human egrep "Cow|Human" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # Copy to data/genomes staging for cluster replication mkdir /hive/data/genomes/staging/data/genomes/mm10 rsync -a -P ./notInRat/ /hive/data/genomes/staging/data/genomes/mm10/notInRat/ rsync -a -P ./notInHuman/ /hive/data/genomes/staging/data/genomes/mm10/notInHuman/ rsync -a -P ./notInCow/ /hive/data/genomes/staging/data/genomes/mm10/notInOthers/ # We also need the nibs for the lastz runs with lineage specific repeats mkdir /hive/data/genomes/mm10/nib cd /hive/data/genomes/mm10 cut -f1 chrom.sizes | while read C do twoBitToFa -seq=${C} mm10.2bit stdout | faToNib -softMask stdin nib/${C}.nib ls -og nib/$C.nib done # verify one is properly masked: nibFrag -masked nib/chrM.nib 0 16299 + stdout | less # compare to: twoBitToFa -seq=chrM mm10.fa stdout | less # Copy to data/genomes staging for cluster replication rsync -a -P ./nib/ /hive/data/genomes/staging/data/genomes/mm10/nib/ ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-02-08 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2652783500 / 2897316137 \) \* 1024 # ( 2652783500 / 2897316137 ) * 1024 = 937.574699 # round up to 1000 (mm9 used 912) cd /hive/data/genomes/mm10 time blat mm10.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/mm10.11.ooc -repMatch=1000 # Wrote 27208 overused 11-mers to jkStuff/mm10.11.ooc # real 2m9.568s # at repMatch=900: # Wrote 31822 overused 11-mers to 
jkStuff/mm10.11.ooc # there are non-bridged gaps, make lift file for genbank hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes cd /hive/data/genomes/mm10/jkStuff gapToLift mm10 mm10.nonBridged.lift -bedFile=mm10.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' mm10.nonBridged.bed | sort -nr | head 116378660 chr2 59120641 175499301 chr2.02 # copy all of this stuff to the klusters: cd /hive/data/genomes/mm10 mkdir /hive/data/genomes/staging/data/genomes/mm10 cp -p jkStuff/mm10.11.ooc jkStuff/mm10.nonBridged.lift chrom.sizes \ mm10.2bit /hive/data/genomes/staging/data/genomes/mm10 # request rsync copy from cluster admin ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram) # examine the file: /cluster/data/genomes/genbank/data/genomes/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mus musculus 334577 4853663 26288 # to decide which "native" mrna or ests you want to specify in genbank.conf # of course, mm10 has plenty of everything ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add mm10 just after mm9 and commit to GIT # mm10 mm10.serverGenome = /hive/data/genomes/mm10/mm10.2bit mm10.clusterGenome = /scratch/data/genomes/mm10/mm10.2bit mm10.ooc = /scratch/data/genomes/mm10/mm10.11.ooc mm10.align.unplacedChroms = chr* mm10.lift = /scratch/data/genomes/mm10/mm10.nonBridged.lift mm10.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} mm10.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} mm10.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} mm10.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} mm10.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} mm10.downloadDir = mm10 mm10.refseq.mrna.xeno.load = yes 
mm10.refseq.mrna.xeno.loadDesc = yes mm10.mgc = yes mm10.genbank.mrna.blatTargetDb = yes # mm10.ccds.ncbiBuild = 37.2 # mm10.upstreamGeneTbl = refGene # mm10.upstreamMaf = multiz30way # /hive/data/genomes/mm10/bed/multiz30way/species.list # end of section added to etc/genbank.conf git commit -m "adding mm10 definitions" genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbAlignStep -initial mm10 & # var/build/logs/2012.02.08-11:38:50.mm10.initalign.log # real 795m52.388s # load database when finished ssh hgwdev cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm10 & # logFile: var/dbload/hgwdev/logs/2012.02.09-10:05:25.dbload.log # real 114m56.461s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add mm10 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added mm10." etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # running cpgIsland business (DONE - 2012-02-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/cpgIsland cd /hive/data/genomes/mm10/bed/cpgIsland # use a previous binary for this program ln -s ../../../mm9/bed/cpgIsland/hg3rdParty/cpgIslands/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/mm10/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck. 
cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | egrep -v "^#seq|^total" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... etc para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 191s 3.19m 0.05h 0.00d 0.000 y # IO & Wait Time: 189s 3.14m 0.05h 0.00d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 51s 0.85m 0.01h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | awk '{print length($0)}' | sort -rn | head -1 # 20 # update the length 14 in the template to be 20: sed -e "s/14/20/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/mm10/bed/cpgIsland hgLoadBed mm10 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Loaded 16023 elements of size 10 featureBits mm10 cpgIslandExt # 10495450 bases of 2652783500 (0.396%) in intersection # compare to previous: featureBits mm9 cpgIslandExt # 10496250 bases of 2620346127 (0.401%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt mm10 # cleanup, unless you want to move them to the genscan procedure below rm -fr hardMaskedFa ######################################################################### # GENSCAN GENE PREDICTIONS (DONE - 2012-02-09,10 - Hiram) mkdir /hive/data/genomes/mm10/bed/genscan cd /hive/data/genomes/mm10/bed/genscan # use a previously existing genscan binary ln -s ../../../mm9/bed/genscan/hg3rdParty . 
# create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 66 genome.list # Run on small cluster (more mem than big cluster). ssh encodek cd /hive/data/genomes/mm10/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Crashed: 2 jobs # CPU time in finished jobs: 171336s 2855.60m 47.59h 1.98d 0.005 y # IO & Wait Time: 261s 4.35m 0.07h 0.00d 0.000 y # Average job time: 2640s 44.00m 0.73h 0.03d # Longest finished job: 22618s 376.97m 6.28h 0.26d # Submission to last job: 28682s 478.03m 7.97h 0.33d # one of the two crashed jobs was just a stray line in the jobList, # somehow a line with the string: '_EOF_' got in there. # as with mm9, chr7 did not work. Break it up into pieces mkdir /hive/data/genomes/mm10/bed/genscan/chr7Split cd /hive/data/genomes/mm10/bed/genscan/chr7Split grep chr7 ../../../jkStuff/mm10.nonBridged.lift | grep -v random \ > chr7.nonBridged.lift faToTwoBit ../hardMaskedFa/chr7.fa chr7.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr7.2bit chr7.nonBridged.lift \ | sed -e "s/chr7./chr7_/" > chr7.nonBridged.fa faSplit sequence chr7.nonBridged.fa 100 split7/chr7_ ln -s ../../../../mm9/bed/genscan/hg3rdParty . 
echo '#!/bin/sh' > cmdList.sh
    # Build cmdList.sh: one background gsBig job per split7 piece, then a
    # final "wait".  The output names must be plain gtf/NAME.gtf and
    # pep/NAME.pep (no trailing '}' from the gensub2 template syntax), or
    # the chr7_*.gtf / chr7_*.pep globs below will not find them.
    ls split7 | while read F
    do
        echo "/cluster/bin/x86_64/gsBig split7/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 &"
    done >> cmdList.sh
    echo "wait" >> cmdList.sh
    chmod +x cmdList.sh
    mkdir gtf pep subopt
    time ./cmdList.sh > run.log 2>&1
    # about 20 minutes

    # fix the names in the lift file
    cat chr7.nonBridged.lift | sed -e "s/chr7./chr7_/" > chr7.lift
    # the sed mangling will provide unique names for them all, but they
    # will not be in the strict numerical order that genscan usually produces
    cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout chr7.lift error stdin \
        | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.gtf
    cat subopt/chr7_*.bed | liftUp -type=.bed stdout chr7.lift error stdin \
        | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.subopt.bed
    cat pep/chr7_*.pep | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.pep
    cp -p chr7.pep ../pep
    cp -p chr7.gtf ../gtf
    cp -p chr7.subopt.bed ../subopt/chr7.bed

    find ./gtf -type f | xargs -n 256 endsInLf -zeroOk

    # Concatenate results:
    cd /hive/data/genomes/mm10/bed/genscan
    find ./gtf -type f | xargs cat > genscan.gtf
    find ./pep -type f | xargs cat > genscan.pep
    find ./subopt -type f | xargs cat > genscanSubopt.bed

    # Load into the database (without -genePredExt because no frame info):
    # Don't load the Pep anymore -- redundant since it's from genomic.
ssh hgwdev cd /hive/data/genomes/mm10/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp # this produces exactly the same thing and loads the table: ldHgGene -gtf mm10 genscan genscan.gtf # Read 45012 transcripts in 323529 lines in 1 files # 45012 groups 59 seqs 1 sources 1 feature types # 45012 gene predictions hgLoadBed mm10 genscanSubopt genscanSubopt.bed # Read 526572 elements of size 6 from genscanSubopt.bed featureBits mm10 genscan # 55743040 bases of 2652783500 (2.101%) in intersection # previously: featureBits mm9 genscan # 55293837 bases of 2620346127 (2.110%) in intersection ######################################################################### # CREATE MICROSAT TRACK (DONE - 2012-02-09 - Hiram ssh hgwdev mkdir /cluster/data/genomes/mm10/bed/microsat cd /cluster/data/genomes/mm10/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed mm10 microsat microsat.bed # Read 197237 elements of size 4 from microsat.bed ######################################################################### # BLATSERVERS ENTRY (DONE - 2012-02-09 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm10", "blat13", "17832", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm10", "blat13", "17833", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # set default position the same as was mm9 via blat # (DONE - 2012-02-09 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr12:56694976-56714605" where name="mm10";' \ hgcentraltest ############################################################################ # constructing downloads (DONE - 2012-02-09 - Hiram) cd 
/hive/data/genomes/mm10 # some of the smaller bits are missing the simple repeat results time makeDownloads.pl -allowMissedTrfs -workhorse=hgwdev mm10 # real 41m42.408s # edit the README files in goldenPath/*/README.txt ######################################################################### # create pushQ entry (DONE - 2012-02-09 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=mm10 -keys all.joiner mkdir /hive/data/genomes/mm10/pushQ cd /hive/data/genomes/mm10/pushQ makePushQSql.pl mm10 > mm10.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/mm10/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/mm10/wib/quality.wib # WARNING: hgwdev does not have /gbdb/mm10/bbi/quality.bw # WARNING: mm10 does not have seq # WARNING: mm10 does not have extFile # *** All done! # which are not real problems # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # list of tables will be in the output # put them in manually after loading the pushQ entry scp -p mm10.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < mm10.sql ######################################################################### # lifting ensGene track from mm9 (DONE - 2012-02-22 - Hiram) # no gene tracks yet on mm10. 
liftUp mm9 ensGenes to mm10 # history of mm9 ensGene indicates it is the same as v64 release # with v65 being identical mkdir /hive/data/genomes/mm10/bed/ensGene cd /hive/data/genomes/mm10/bed/ensGene hgsql -N -e "select * from ensGene;" mm9 | cut -f2- > mm9.ensGene.gp liftOver -genePred mm9.ensGene.gp \ /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz \ mm10.lifted.ensGene.gp unmapped.ensGene.gp wc -l *.gp # 95651 mm10.lifted.ensGene.gp # 95883 mm9.ensGene.gp # 464 unmapped.ensGene.gp hgLoadGenePred -skipInvalid -genePredExt mm10 ensGene mm10.lifted.ensGene.gp # Warning: skipping 118 invalid genePreds # make a list of what did get loaded: hgsql -N -e "select name from ensGene;" mm10 \ | sort -u > mm10.name.ensGene.txt wc -l mm10.name.ensGene.txt # 95533 mm10.name.ensGene.txt hgsql -N -e "select * from ensPep;" mm9 | sort > mm9.ensPep.tab hgsql -N -e "select * from ensGtp;" mm9 | sort -k2,2 > mm9.ensGtp.tab hgsql -N -e "select * from ensemblToGeneName;" mm9 | sort -k1,1 \ > mm9.ensemblToGeneName.tab hgsql -N -e "select * from ensemblSource;" mm9 | sort -k1,1 \ > mm9.ensemblSource.tab # select out ensGtp records that match with the names in mm10 ensGene: join -1 2 -2 1 -o "1.1,1.2,1.3" mm9.ensGtp.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensGtp.tab wc -l *.ensGtp.tab # 95533 mm10.ensGtp.tab # 95883 mm9.ensGtp.tab # select out ensPep records that match with the names in mm10 ensGene: join -1 1 -2 2 -o "1.1,1.2" mm9.ensPep.tab mm10.ensGtp.tab \ | tr '[ ]' '[\t]' > mm10.ensPep.tab wc -l mm9.ensPep.tab mm10.ensPep.tab # 55798 mm9.ensPep.tab # 55485 mm10.ensPep.tab # select out ensemblSource records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2" mm9.ensemblSource.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensemblSource.tab wc -l mm9.ensemblSource.tab mm10.ensemblSource.tab # 95883 mm9.ensemblSource.tab # 95533 mm10.ensemblSource.tab # select out ensemblToGeneName records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2"
mm9.ensemblToGeneName.tab \ mm10.name.ensGene.txt | tr '[ ]' '[\t]' > mm10.ensemblToGeneName.tab wc -l mm9.ensemblToGeneName.tab mm10.ensemblToGeneName.tab # 95883 mm9.ensemblToGeneName.tab # 95533 mm10.ensemblToGeneName.tab hgPepPred mm10 tab ensPep mm10.ensPep.tab hgLoadSqlTab mm10 ensGtp ~/kent/src/hg/lib/ensGtp.sql mm10.ensGtp.tab sed -e "s/15/18/" ~/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql hgLoadSqlTab mm10 ensemblSource ensemblSource.sql mm10.ensemblSource.tab # find sizes for indexes NL=`awk '{print length($1)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` VL=`awk '{print length($2)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` # construct sql definition with appropriate index sizes sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \ ~/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql hgLoadSqlTab mm10 ensemblToGeneName ensemblToGeneName.sql \ mm10.ensemblToGeneName.tab hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("mm10", "ensGene", "hiram", "65", now(), \ "lifted from mm9 ensGene 65", \ "lifted from mm9 ensGene 65", \ "dec2011" );' hgFixed ######################################################################### # Swap lastz Human hg19 (DONE - 2012-03-08 - Hiram) # original alignment to hg19 cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07 cat fb.hg19.chainMm10Link.txt # 1021265143 bases of 2897316137 (35.249%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.hg19.swap cd /hive/data/genomes/mm10/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m32.794s cat fb.mm10.chainHg19Link.txt # 1014045890 bases of 2652783500 
(38.226%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.hg19.swap lastz.hg19 ######################################################################### # LASTZ RAT Rn4 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # Specially tuned blastz parameters from Webb Miller BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 SEQ2_DIR=/scratch/data/rn4/rn4.2bit SEQ2_LEN=/scratch/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn4Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # real 129m48.444s cat fb.mm10.chainRn4Link.txt # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn4.2012-03-08 lastz.rn4 # and the swap mkdir /hive/data/genomes/rn4/bed/blastz.mm10.swap cd /hive/data/genomes/rn4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 71m10.645s cat fb.rn4.chainMm10Link.txt # 1449012636 bases of 2571531505 (56.348%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Gorilla gorGor3 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cat << '_EOF_' > DEF # gorilla vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gorilla GorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10GorGor3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 625m17.180s cat fb.mm10.chainGorGor3Link.txt # 901610588 bases of 2652783500 (33.987%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGorGor3.2012-03-08 lastz.gorGor3 mkdir /hive/data/genomes/gorGor3/bed/blastz.mm10.swap cd /hive/data/genomes/gorGor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 91m3.616s cat fb.gorGor3.chainMm10Link.txt # 969595533 bases of 2822760080 (34.349%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gorGor3/bed ln -s blastz.mm10.swap lastz.mm10 
############################################################################## # LASTZ Gibbon nomLeu1 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu1 SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 556m26.589s cat fb.mm10.chainNomLeu1Link.txt # 905455766 bases of 2652783500 (34.132%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu1.2012-03-08 lastz.nomLeu1 mkdir /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 66m50.839s cat fb.nomLeu1.chainMm10Link.txt # 892362811 bases of 2756591777 (32.372%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Rhesus rheMac3 (DONE - 2012-03-08 - Hiram) mkdir 
/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cat << '_EOF_' > DEF # rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac3 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10RheMac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 596m55.622s cat fb.mm10.chainRheMac3Link.txt # 900117108 bases of 2652783500 (33.931%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRheMac3.2012-03-08 lastz.rheMac3 mkdir /hive/data/genomes/rheMac3/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m5.839s cat fb.rheMac3.chainMm10Link.txt # 883164992 bases of 2639145830 (33.464%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Baboon papHam1 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cat << '_EOF_' > DEF # baboon vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Baboon PapHam1 SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit SEQ2_LEN=/scratch/data/papHam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10PapHam1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1138m52.716s cat fb.mm10.chainPapHam1Link.txt # 890718423 bases of 2652783500 (33.577%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPapHam1.2012-03-09 lastz.papHam1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 time doRecipBest.pl mm10 papHam1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 899m48.908s mkdir /hive/data/genomes/papHam1/bed/blastz.mm10.swap cd /hive/data/genomes/papHam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 548m15.438s cat fb.papHam1.chainMm10Link.txt # 878016290 bases of 2741867288 (32.023%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/papHam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # Swap ponAbe2 lastz (DONE - 2012-03-09 - Hiram) # original alignment result: cd
/hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08 cat fb.ponAbe2.chainMm10Link.txt # 946932454 bases of 3093572278 (30.610%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap cd /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m38.550s cat fb.mm10.chainPonAbe2Link.txt # 915093866 bases of 2652783500 (34.496%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.ponAbe2.swap lastz.ponAbe2 ############################################################################## # LASTZ Squirrel monkey saiBol1 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 cat << '_EOF_' > DEF # squirrel monkey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Squirrel monkey SaiBol1 SEQ2_DIR=/hive/data/genomes/saiBol1/saiBol1.2bit SEQ2_LEN=/hive/data/genomes/saiBol1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10SaiBol1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 538m42.643s cat fb.mm10.chainSaiBol1Link.txt # 857872391 bases of 2652783500 (32.339%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln 
-s lastzSaiBol1.2012-03-09 lastz.saiBol1 mkdir /hive/data/genomes/saiBol1/bed/blastz.mm10.swap cd /hive/data/genomes/saiBol1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 59m36.306s cat fb.saiBol1.chainMm10Link.txt # 838457857 bases of 2477131095 (33.848%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/saiBol1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Marmoset calJac3 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 cat << '_EOF_' > DEF # marmoset vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Marmoset monkey CalJac3 SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10CalJac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 529m39.657s cat fb.mm10.chainCalJac3Link.txt # 860830771 bases of 2652783500 (32.450%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCalJac3.2012-03-09 lastz.calJac3 mkdir /hive/data/genomes/calJac3/bed/blastz.mm10.swap cd 
/hive/data/genomes/calJac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 67m21.635s cat fb.calJac3.chainMm10Link.txt # 861565545 bases of 2752505800 (31.301%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/calJac3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Chimp PanTro4 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 cat << '_EOF_' > DEF # chimp vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro4 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10PanTro4 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 682m53.046s cat fb.mm10.chainPanTro4Link.txt # 919836299 bases of 2652783500 (34.674%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPanTro4.2012-03-09 lastz.panTro4 mkdir /hive/data/genomes/panTro4/bed/blastz.mm10.swap cd /hive/data/genomes/panTro4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \
/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 73m23.855s cat fb.panTro4.chainMm10Link.txt # 926540065 bases of 2902338967 (31.924%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/panTro4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tarsier tarSyr1 (DONE - 2012-03-10 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cat << '_EOF_' > DEF # tarsier vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tarsier TarSyr1 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10TarSyr1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2457m45.759s cat fb.mm10.chainTarSyr1Link.txt # 651517559 bases of 2652783500 (24.560%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTarSyr1.2012-03-10 lastz.tarSyr1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 time doRecipBest.pl mm10 tarSyr1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1176m19.336s mkdir /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tarSyr1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 746m30.852s cat fb.tarSyr1.chainMm10Link.txt # 691746721 bases of 2768536343 (24.986%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tarSyr1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # Swap chimp panTro3 to Mm10 (DONE - 2012-03-12 - Hiram) # original alignment on panTro3 cd /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 cat fb.panTro3.chainMm10Link.txt # 929073028 bases of 2900529764 (32.031%) in intersection # and this swap: mkdir /hive/data/genomes/mm10/bed/blastz.panTro3.swap cd /hive/data/genomes/mm10/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 68m46.408s cat fb.mm10.chainPanTro3Link.txt # 922491113 bases of 2652783500 (34.774%) in intersection ############################################################################## # LASTZ bushbaby otoGar3 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # bushbaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: bushbaby OtoGar3 
SEQ2_DIR=/hive/data/genomes/otoGar3/otoGar3.2bit SEQ2_LEN=/hive/data/genomes/otoGar3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10OtoGar3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 757m32.438s cat fb.mm10.chainOtoGar3Link.txt # 790408953 bases of 2652783500 (29.795%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOtoGar3.2012-03-13 lastz.otoGar3 mkdir /hive/data/genomes/otoGar3/bed/blastz.mm10.swap cd /hive/data/genomes/otoGar3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 61m18.952s cat fb.otoGar3.chainMm10Link.txt # 776907989 bases of 2359530453 (32.926%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/otoGar3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ mouse lemur micMur1 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # mouse lemur vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: mouse lemur MicMur1 
SEQ2_DIR=/scratch/data/micMur1/micMur1.2bit SEQ2_LEN=/scratch/data/micMur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10MicMur1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 687m41.863s cat fb.mm10.chainMicMur1Link.txt # 706607444 bases of 2652783500 (26.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMicMur1.2012-03-13 lastz.micMur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 time doRecipBest.pl mm10 micMur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 877m18.105s mkdir /hive/data/genomes/micMur1/bed/blastz.mm10.swap cd /hive/data/genomes/micMur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 116m54.411s cat fb.micMur1.chainMm10Link.txt # 696025630 bases of 1852394361 (37.574%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/micMur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ squirrel speTri2 (DONE - 2012-03-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # squirrel vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: squirrel SpeTri2 SEQ2_DIR=/hive/data/genomes/speTri2/speTri2.2bit SEQ2_LEN=/hive/data/genomes/speTri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10SpeTri2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 935m27.893s cat fb.mm10.chainSpeTri2Link.txt # 907715417 bases of 2652783500 (34.217%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSpeTri2.2012-03-15 lastz.speTri2 mkdir /hive/data/genomes/speTri2/bed/blastz.mm10.swap cd /hive/data/genomes/speTri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m41.819s # real 116m54.411s cat fb.speTri2.chainMm10Link.txt # 906956512 bases of 2311060300 (39.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/speTri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ kangaroo rat dipOrd1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S mm10DipOrd1 mkdir /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to 
something under 100,000 cat << '_EOF_' > DEF # kangaroo rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: kangaroo rat DipOrd1 SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 867m19.972s cat fb.mm10.chainDipOrd1Link.txt # 516232678 bases of 2652783500 (19.460%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDipOrd1.2012-03-15 lastz.dipOrd1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 time doRecipBest.pl mm10 dipOrd1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 914m20.405s mkdir /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap cd /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 115m1.497s cat fb.dipOrd1.chainMm10Link.txt # 507580668 bases of 1844961421 (27.512%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dipOrd1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Naked mole-rat hetGla1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S 
mm10HetGla1 mkdir /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Naked mole-rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Naked mole-rat HetGla1 SEQ2_DIR=/scratch/data/hetGla1/hetGla1.2bit SEQ2_LEN=/scratch/data/hetGla1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 745m15.097s cat fb.mm10.chainHetGla1Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla1.2012-03-15 lastz.hetGla1 mkdir /hive/data/genomes/hetGla1/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m26.471s cat fb.hetGla1.chainMm10Link.txt # 885195861 bases of 2430064805 (36.427%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ horse equCab2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it 
is screen -S mm10EquCab2 mkdir /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # horse vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: horse EquCab2 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 566m34.024s cat fb.mm10.chainEquCab2Link.txt # 912967841 bases of 2652783500 (34.415%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEquCab2.2012-03-16 lastz.equCab2 mkdir /hive/data/genomes/equCab2/bed/blastz.mm10.swap cd /hive/data/genomes/equCab2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 87m2.261s cat fb.equCab2.chainMm10Link.txt # 901995882 bases of 2428790173 (37.138%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/equCab2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ guinea pig cavPor3 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is 
screen -S mm10CavPor3 mkdir /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # guinea pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: guinea pig CavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1523m35.729s cat fb.mm10.chainCavPor3Link.txt # 754642254 bases of 2652783500 (28.447%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCavPor3.2012-03-16 lastz.cavPor3 mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 80m23.870s cat fb.cavPor3.chainMm10Link.txt # 775452752 bases of 2663369733 (29.115%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what 
it is screen -S mm10VicPac1 mkdir /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # alpaca vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac1 SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2049m38.674s cat fb.mm10.chainVicPac1Link.txt # 600477253 bases of 2652783500 (22.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzVicPac1.2012-03-16 lastz.vicPac1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 time doRecipBest.pl mm10 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 824m37.107s mkdir /hive/data/genomes/vicPac1/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 159m21.952s cat fb.vicPac1.chainMm10Link.txt # 610885692 bases of 1922910435 (31.769%) in intersection # set sym link to indicate this is the lastz for this genome: cd
/hive/data/genomes/vicPac1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru1 mkdir /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru1 SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit SEQ2_LEN=/scratch/data/turTru1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1484m14.609s cat fb.mm10.chainTurTru1Link.txt # 762961671 bases of 2652783500 (28.761%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru1.2012-03-16 lastz.turTru1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 time doRecipBest.pl mm10 turTru1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 733m37.272s mkdir /hive/data/genomes/turTru1/bed/blastz.mm10.swap cd /hive/data/genomes/turTru1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 79m38.703s cat fb.turTru1.chainMm10Link.txt # 744359707 bases of 2298444090 (32.385%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tree shrew tupBel1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TupBel1 mkdir /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tree shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tree shrew TupBel1 SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1731m30.449s cat fb.mm10.chainTupBel1Link.txt # 524337666 bases of 2652783500 (19.766%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTupBel1.2012-03-16 lastz.tupBel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 time doRecipBest.pl mm10 tupBel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1090m30.429s mkdir /hive/data/genomes/tupBel1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tupBel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 136m7.163s cat fb.tupBel1.chainMm10Link.txt # 537379661 bases of 2137225476 (25.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tupBel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pig susScr2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr2 mkdir /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1272m57.727s cat fb.mm10.chainSusScr2Link.txt # 616716602 bases of 2652783500 (23.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr2.2012-03-16 lastz.susScr2 mkdir /hive/data/genomes/susScr2/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m47.465s cat fb.susScr2.chainMm10Link.txt # 656498040 bases of 2231298548 (29.422%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rabbit oryCun2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryCun2 mkdir /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rabbit vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rabbit OryCun2 SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1412m58.641s cat fb.mm10.chainOryCun2Link.txt # 669778489 bases of 2652783500 (25.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryCun2.2012-03-16 lastz.oryCun2 mkdir /hive/data/genomes/oryCun2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oryCun2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 64m40.959s cat fb.oryCun2.chainMm10Link.txt # 668643668 bases of 2604023284 (25.677%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryCun2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sloth choHof1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChoHof1 mkdir /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sloth vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sloth ChoHof1 SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit SEQ2_LEN=/scratch/data/choHof1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # rebooted hgwdev during first swarm run, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # Elapsed time: 65m26s cat 
fb.mm10.chainChoHof1Link.txt # 477994856 bases of 2652783500 (18.019%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChoHof1.2012-03-19 lastz.choHof1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 time doRecipBest.pl mm10 choHof1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m56.481s mkdir /hive/data/genomes/choHof1/bed/blastz.mm10.swap cd /hive/data/genomes/choHof1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 1613m3.348s cat fb.choHof1.chainMm10Link.txt # 488047499 bases of 2060419685 (23.687%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/choHof1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ megabat pteVam1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10PteVam1 mkdir /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # megabat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: megabat PteVam1 SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << 
happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1843m33.186s cat fb.mm10.chainPteVam1Link.txt # 725414059 bases of 2652783500 (27.345%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPteVam1.2012-03-19 lastz.pteVam1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 time doRecipBest.pl mm10 pteVam1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 743m57.901s mkdir /hive/data/genomes/pteVam1/bed/blastz.mm10.swap cd /hive/data/genomes/pteVam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 75m35s cat fb.pteVam1.chainMm10Link.txt # 710519911 bases of 1839436660 (38.627%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/pteVam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ elephant loxAfr3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LoxAfr3 mkdir /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # elephant vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: elephant LoxAfr3 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1848m11.111s cat fb.mm10.chainLoxAfr3Link.txt # 685029753 bases of 2652783500 (25.823%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLoxAfr3.2012-03-19 lastz.loxAfr3 mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap cd /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 73m14s cat fb.loxAfr3.chainMm10Link.txt # 674108752 bases of 3118565340 (21.616%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/loxAfr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat4 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat4 mkdir /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: cat FelCat4 SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2010m48.963s cat fb.mm10.chainFelCat4Link.txt # 637531191 bases of 2652783500 (24.033%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat4.2012-03-19 lastz.felCat4 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 time doRecipBest.pl mm10 felCat4 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1135m12.207s mkdir /hive/data/genomes/felCat4/bed/blastz.mm10.swap cd /hive/data/genomes/felCat4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 88m12s cat fb.felCat4.chainMm10Link.txt # 616167655 bases of 1990635005 (30.953%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ panda ailMel1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AilMel1 mkdir /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 
cat << '_EOF_' > DEF # panda vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: panda AilMel1 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium # real 1914m15.921s cat fb.mm10.chainAilMel1Link.txt # 821806974 bases of 2652783500 (30.979%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAilMel1.2012-03-19 lastz.ailMel1 mkdir /hive/data/genomes/ailMel1/bed/blastz.mm10.swap cd /hive/data/genomes/ailMel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 65m50s cat fb.ailMel1.chainMm10Link.txt # 798482731 bases of 2245312831 (35.562%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ailMel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dog canFam3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10CanFam3 mkdir /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something 
under 100,000 cat << '_EOF_' > DEF # dog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dog CanFam3 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1883m21.850s cat fb.mm10.chainCanFam3Link.txt # 773114990 bases of 2652783500 (29.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCanFam3.2012-03-19 lastz.canFam3 mkdir /hive/data/genomes/canFam3/bed/blastz.mm10.swap cd /hive/data/genomes/canFam3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 63m22s cat fb.canFam3.chainMm10Link.txt # 756678903 bases of 2392715236 (31.624%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov2 mkdir /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable #
number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov2 SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2655m49.904s cat fb.mm10.chainDasNov2Link.txt # 451070039 bases of 2652783500 (17.004%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov2.2012-03-21 lastz.dasNov2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 time doRecipBest.pl mm10 dasNov2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1163m1.023s mkdir /hive/data/genomes/dasNov2/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 201m9.701s cat fb.dasNov2.chainMm10Link.txt # 461142417 bases of 2371493872 (19.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ microbat myoLuc2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this 
job with a name to indicate what it is screen -S mm10MyoLuc2 mkdir /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # microbat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: microbat MyoLuc2 SEQ2_DIR=/scratch/data/myoLuc2/myoLuc2.2bit SEQ2_LEN=/scratch/data/myoLuc2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1033m38.184s cat fb.mm10.chainMyoLuc2Link.txt # 646292112 bases of 2652783500 (24.363%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMyoLuc2.2012-03-21 lastz.myoLuc2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 time doRecipBest.pl mm10 myoLuc2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 29m16.249s mkdir /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap cd /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 54m5.607s cat fb.myoLuc2.chainMm10Link.txt # 661704053 bases of 1966419868 (33.650%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/myoLuc2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cow bosTau7 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10BosTau7 mkdir /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1151m20.445s cat fb.mm10.chainBosTau7Link.txt # 696498363 bases of 2652783500 (26.255%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau7.2012-03-21 lastz.bosTau7 mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 77m58.759s cat fb.bosTau7.chainMm10Link.txt # 711923052 bases of 2804673174 (25.383%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/bosTau7/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sheep oviAri1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OviAri1 mkdir /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sheep vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 892m33.068s cat fb.mm10.chainOviAri1Link.txt # 406955832 bases of 2652783500 (15.341%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOviAri1.2012-03-21 lastz.oviAri1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 time doRecipBest.pl mm10 oviAri1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1183m43.488s mkdir /hive/data/genomes/oviAri1/bed/blastz.mm10.swap cd /hive/data/genomes/oviAri1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 30m5.554s cat fb.oviAri1.chainMm10Link.txt # 383499897 bases of 1201271277 (31.925%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oviAri1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rock hyrax proCap1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ProCap1 mkdir /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rock hyrax vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rock hyrax ProCap1 SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit SEQ2_LEN=/scratch/data/proCap1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=600 BASE=/hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2859m51.317s cat fb.mm10.chainProCap1Link.txt # 401804601 bases of 2652783500 (15.147%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzProCap1.2012-03-21 lastz.proCap1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 time doRecipBest.pl mm10 proCap1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1083m57.139s mkdir /hive/data/genomes/proCap1/bed/blastz.mm10.swap cd 
/hive/data/genomes/proCap1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 165m10.285s cat fb.proCap1.chainMm10Link.txt # 390409777 bases of 2407847681 (16.214%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/proCap1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pika ochPri2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OchPri2 mkdir /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pika vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pika OchPri2 SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2578m43.648s cat fb.mm10.chainOchPri2Link.txt # 385766335 bases of 2652783500 (14.542%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOchPri2.2012-03-22 lastz.ochPri2 # better to have reciprocal best for this one since it is low 
coverage: cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 time doRecipBest.pl mm10 ochPri2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1036m29.080s mkdir /hive/data/genomes/ochPri2/bed/blastz.mm10.swap cd /hive/data/genomes/ochPri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m34.369s cat fb.ochPri2.chainMm10Link.txt # 382959642 bases of 1923624051 (19.908%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ochPri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ hedgehog eriEur1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EriEur1 mkdir /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # hedgehog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: hedgehog EriEur1 SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3006m41.470s cat fb.mm10.chainEriEur1Link.txt # 261447061 bases 
of 2652783500 (9.856%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEriEur1.2012-03-22 lastz.eriEur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 time doRecipBest.pl mm10 eriEur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m41.349s mkdir /hive/data/genomes/eriEur1/bed/blastz.mm10.swap cd /hive/data/genomes/eriEur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 241m24.183s cat fb.eriEur1.chainMm10Link.txt # 261605017 bases of 2133134836 (12.264%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/eriEur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tenrec echTel1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EchTel1 mkdir /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tenrec vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tenrec EchTel1 SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit SEQ2_LEN=/scratch/data/echTel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl 
-verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3047m28.723s cat fb.mm10.chainEchTel1Link.txt # 290413150 bases of 2652783500 (10.947%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEchTel1.2012-03-22 lastz.echTel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 time doRecipBest.pl mm10 echTel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1201m39.275s mkdir /hive/data/genomes/echTel1/bed/blastz.mm10.swap cd /hive/data/genomes/echTel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 269m52.619s cat fb.echTel1.chainMm10Link.txt # 298082139 bases of 2111581369 (14.117%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ shrew sorAra1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SorAra1 mkdir /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: shrew SorAra1 
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2600m22.528s cat fb.mm10.chainSorAra1Link.txt # 248874412 bases of 2652783500 (9.382%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSorAra1.2012-03-22 lastz.sorAra1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 time doRecipBest.pl mm10 sorAra1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1074m22.651s mkdir /hive/data/genomes/sorAra1/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 141m38.806s cat fb.sorAra1.chainMm10Link.txt # 248692550 bases of 1832864697 (13.569%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ wallaby macEug2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MacEug2 mkdir /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF 
# wallaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: wallaby MacEug2 SEQ2_DIR=/scratch/data/macEug2/macEug2.2bit SEQ2_LEN=/scratch/data/macEug2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2893m50.341s cat fb.mm10.chainMacEug2Link.txt # 115481931 bases of 2652783500 (4.353%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMacEug2.2012-03-22 lastz.macEug2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 time doRecipBest.pl mm10 macEug2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1032m58.798s mkdir /hive/data/genomes/macEug2/bed/blastz.mm10.swap cd /hive/data/genomes/macEug2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 130m7.404s cat fb.macEug2.chainMm10Link.txt # 112811810 bases of 2536076957 (4.448%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/macEug2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ RAT Rn5 (DONE - 2012-03-23 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Rn5 mkdir 
/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cd /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # From tuning experiment between mouse chr12:15000000-25000000 and # rat chr6:38000000-48000000 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=5000 BLASTZ_T=2 BLASTZ_K=3000 BLASTZ_L=3000 BLASTZ_Q=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/mouse_rat_2.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_SMSK=/scratch/data/mm10/notInRat SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn5 SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ2_SMSK=/hive/data/genomes/rn5/bed/linSpecRep/notInMouse SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn5Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # the lastz run broke because SMSK files did not exist for some of the # Rn5 contigs - made empty files for those, completed the lastz run, # and continued from the cat step: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > cat.log 2>&1 & # real 285m28.458s cat fb.mm10.chainRn5Link.txt # 1786721927 bases of 2652783500 (67.353%) in intersection # FYI: rn4 was: # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn5.2012-03-23 lastz.rn5 # and the swap mkdir /hive/data/genomes/rn5/bed/blastz.mm10.swap cd
/hive/data/genomes/rn5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 121m21.029s cat fb.rn5.chainMm10Link.txt # 1808154679 bases of 2572853723 (70.278%) in intersection # FYI, rn4 was: # 1449012636 bases of 2571531505 (56.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Manatee triMan1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TriMan1 mkdir /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # manatee vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: manatee TriMan1 SEQ2_DIR=/hive/data/genomes/triMan1/triMan1.2bit SEQ2_LEN=/hive/data/genomes/triMan1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1455m24.772s cat fb.mm10.chainTriMan1Link.txt # 704207702 bases of 2652783500 (26.546%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s 
lastzTriMan1.2012-03-29 lastz.triMan1 mkdir /hive/data/genomes/triMan1/bed/blastz.mm10.swap cd /hive/data/genomes/triMan1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m33.530s cat fb.triMan1.chainMm10Link.txt # 682557025 bases of 2769099677 (24.649%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/triMan1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Opossum monDom5 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MonDom5 mkdir /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. 
opossum BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Opossum monDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit SEQ2_LEN=/scratch/data/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # Can't do this when there is only a single small set of chroms time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1792m40.071s cat fb.mm10.chainMonDom5Link.txt # 254245903 bases of 2652783500 (9.584%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMonDom5.2012-03-29 lastz.monDom5 # and for the swap mkdir /hive/data/genomes/monDom5/bed/blastz.mm10.swap cd /hive/data/genomes/monDom5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 73m49.230s cat fb.monDom5.chainMm10Link.txt # 252291401 bases of 3501660299 (7.205%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Tasmanian Devil sarHar1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SarHar1 mkdir
/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. tasmanian devil BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tasmanian devil sarHar1 SEQ2_DIR=/scratch/data/sarHar1/sarHar1.2bit SEQ2_LEN=/scratch/data/sarHar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1208m55.866s cat fb.mm10.chainSarHar1Link.txt # 224935746 bases of 2652783500 (8.479%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSarHar1.2012-03-29 lastz.sarHar1 # and for the swap mkdir /hive/data/genomes/sarHar1/bed/blastz.mm10.swap cd /hive/data/genomes/sarHar1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 45m53.015s cat fb.sarHar1.chainMm10Link.txt # 231249436 bases of 2931539702 (7.888%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sarHar1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz budgerigar melUnd1 (DONE - 2012-03-29 
- Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelUnd1 mkdir /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. budgerigar BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: budgerigar melUnd1 SEQ2_DIR=/hive/data/genomes/melUnd1/melUnd1.2bit SEQ2_LEN=/hive/data/genomes/melUnd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 883m58.198s cat fb.mm10.chainMelUnd1Link.txt # 95217653 bases of 2652783500 (3.589%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelUnd1.2012-03-29 lastz.melUnd1 # and for the swap mkdir /hive/data/genomes/melUnd1/bed/blastz.mm10.swap cd /hive/data/genomes/melUnd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 9m9.260s cat fb.melUnd1.chainMm10Link.txt # 79867911 bases of 1086614815 (7.350%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melUnd1/bed ln -s blastz.mm10.swap 
lastz.mm10 ######################################################################### # lastz platypus ornAna1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OrnAna1 mkdir /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. platypus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1264m1.056s cat fb.mm10.chainOrnAna1Link.txt # 141873792 bases of 2652783500 (5.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOrnAna1.2012-03-29 lastz.ornAna1 # and for the swap mkdir /hive/data/genomes/ornAna1/bed/blastz.mm10.swap cd /hive/data/genomes/ornAna1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 49m45.308s cat fb.ornAna1.chainMm10Link.txt # 135101083 bases of 1842236818 (7.334%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ornAna1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz turtle chrPic1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChrPic1 mkdir /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turtle chrPic1 SEQ2_DIR=/hive/data/genomes/chrPic1/chrPic1.2bit SEQ2_LEN=/hive/data/genomes/chrPic1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1243m2.518s cat fb.mm10.chainChrPic1Link.txt # 125499965 bases of 2652783500 (4.731%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChrPic1.2012-03-29 lastz.chrPic1 # and for the swap mkdir /hive/data/genomes/chrPic1/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 
& # real 19m26.835s cat fb.chrPic1.chainMm10Link.txt # 118436838 bases of 2158289746 (5.488%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/chrPic1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz chicken galGal4 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GalGal4 mkdir /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal4 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 109m21.068s # broken swarm cluster, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 57m24.155s cat fb.mm10.chainGalGal4Link.txt # 97510773 bases of 2652783500 (3.676%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGalGal4.2012-04-02 
lastz.galGal4 # and for the swap mkdir /hive/data/genomes/galGal4/bed/blastz.mm10.swap cd /hive/data/genomes/galGal4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 95m50.996s cat fb.galGal4.chainMm10Link.txt # 83660034 bases of 1032854810 (8.100%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebra finch taeGut1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TaeGut1 mkdir /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebra finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebra finch taeGut1 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=5 BASE=/hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 106m11.612s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat 
`pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 29m11.090s cat fb.mm10.chainTaeGut1Link.txt # 95469341 bases of 2652783500 (3.599%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTaeGut1.2012-04-02 lastz.taeGut1 # and for the swap mkdir /hive/data/genomes/taeGut1/bed/blastz.mm10.swap cd /hive/data/genomes/taeGut1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 37m17.483s cat fb.taeGut1.chainMm10Link.txt # 89312133 bases of 1222864691 (7.304%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/taeGut1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz lizard anoCar2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AnoCar2 mkdir /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
lizard BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: lizard anoCar2 SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 103m17.133s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 43m2.183s cat fb.mm10.chainAnoCar2Link.txt # 88356459 bases of 2652783500 (3.331%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAnoCar2.2012-04-02 lastz.anoCar2 # and for the swap mkdir /hive/data/genomes/anoCar2/bed/blastz.mm10.swap cd /hive/data/genomes/anoCar2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 97m50.599s cat fb.anoCar2.chainMm10Link.txt # 84865552 bases of 1701353770 (4.988%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/anoCar2/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz turkey melGal1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelGal1 mkdir /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turkey melGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 101m17.902s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 20m47.771s cat fb.mm10.chainMelGal1Link.txt # 93132953 bases of 2652783500 (3.511%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelGal1.2012-04-02 lastz.melGal1 # and for the swap mkdir /hive/data/genomes/melGal1/bed/blastz.mm10.swap cd /hive/data/genomes/melGal1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02/DEF \ -workhorse=hgwdev 
-smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 88m39.591s cat fb.melGal1.chainMm10Link.txt # 76848161 bases of 935922386 (8.211%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melGal1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz frog xenTro3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenTro3 mkdir /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 99m10.611s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 37m52.678s cat fb.mm10.chainXenTro3Link.txt # 82900338 bases of 2652783500 (3.125%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzXenTro3.2012-04-02 lastz.xenTro3 # and for the swap mkdir /hive/data/genomes/xenTro3/bed/blastz.mm10.swap cd /hive/data/genomes/xenTro3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 53m19.485s cat fb.xenTro3.chainMm10Link.txt # 90345130 bases of 1358334882 (6.651%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/xenTro3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz coelacanth latCha1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LatCha1 mkdir /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
coelacanth BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: coelacanth latCha1 SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 95m34.477s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 214m7.324s cat fb.mm10.chainLatCha1Link.txt # 72036116 bases of 2652783500 (2.715%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLatCha1.2012-04-02 lastz.latCha1 # and for the swap mkdir /hive/data/genomes/latCha1/bed/blastz.mm10.swap cd /hive/data/genomes/latCha1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 14m44.600s cat fb.latCha1.chainMm10Link.txt # 73798131 bases of 2183592768 (3.380%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/latCha1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz atlantic cod gadMor1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GadMor1 mkdir /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. atlantic cod BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: atlantic cod gadMor1 SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 91m23.642s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 39m41.194s cat fb.mm10.chainGadMor1Link.txt # 45795692 bases of 2652783500 (1.726%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGadMor1.2012-04-02 lastz.gadMor1 # and for the swap mkdir /hive/data/genomes/gadMor1/bed/blastz.mm10.swap cd /hive/data/genomes/gadMor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 62m58.963s cat fb.gadMor1.chainMm10Link.txt # 41406507 bases of 608038597 (6.810%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gadMor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz nile tilapia oreNil1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil1 mkdir /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil1 SEQ2_DIR=/scratch/data/oreNil1/oreNil1.2bit SEQ2_LEN=/scratch/data/oreNil1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 89m6.727s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 24m3.960s cat fb.mm10.chainOreNil1Link.txt # 
51915568 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil1.2012-04-02 lastz.oreNil1 # and for the swap mkdir /hive/data/genomes/oreNil1/bed/blastz.mm10.swap cd /hive/data/genomes/oreNil1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 90m55.298s cat fb.oreNil1.chainMm10Link.txt # 49709461 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz stickleback gasAcu1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GasAcu1 mkdir /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
stickleback BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: stickleback gasAcu1 SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 87m5.963s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 9m49.199s cat fb.mm10.chainGasAcu1Link.txt # 53469711 bases of 2652783500 (2.016%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGasAcu1.2012-04-02 lastz.gasAcu1 # and for the swap mkdir /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap cd /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 12m58.072s cat fb.gasAcu1.chainMm10Link.txt # 48802831 bases of 446627861 (10.927%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gasAcu1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz fugu fr3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Fr3 mkdir /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. fugu BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: fugu fr3 SEQ2_DIR=/scratch/data/fr3/fr3.2bit SEQ2_LEN=/scratch/data/fr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 84m37.070s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 171m16.627s cat fb.mm10.chainFr3Link.txt # 47460021 bases of 2652783500 (1.789%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFr3.2012-04-02 lastz.fr3 # and for the swap mkdir /hive/data/genomes/fr3/bed/blastz.mm10.swap cd /hive/data/genomes/fr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap 
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m13.151s cat fb.fr3.chainMm10Link.txt # 42586058 bases of 350961831 (12.134%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/fr3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz tetraodon tetNig2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TetNig2 mkdir /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. tetraodon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tetraodon tetNig2 SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # (timing of first run not recorded) # first run did not finish, presumably the broken swarm seen in the other # 2012-04-02 lastz runs; continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 13m21.638s cat fb.mm10.chainTetNig2Link.txt # 46035322 bases of 2652783500 (1.735%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTetNig2.2012-04-02 lastz.tetNig2 
# and for the swap mkdir /hive/data/genomes/tetNig2/bed/blastz.mm10.swap cd /hive/data/genomes/tetNig2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m24.115s cat fb.tetNig2.chainMm10Link.txt # 41242926 bases of 302314788 (13.642%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tetNig2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebrafish danRer7 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DanRer7 mkdir /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 80m32.118s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ 
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 40m27.762s cat fb.mm10.chainDanRer7Link.txt # 69028912 bases of 2652783500 (2.602%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDanRer7.2012-04-02 lastz.danRer7 # and for the swap mkdir /hive/data/genomes/danRer7/bed/blastz.mm10.swap cd /hive/data/genomes/danRer7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 109m49.939s cat fb.danRer7.chainMm10Link.txt # 72001768 bases of 1409770109 (5.107%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/danRer7/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz medaka oryLat2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryLat2 mkdir /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
medaka BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: medaka oryLat2 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 78m53.408s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 113m29.462s cat fb.mm10.chainOryLat2Link.txt # 51344841 bases of 2652783500 (1.936%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryLat2.2012-04-02 lastz.oryLat2 # and for the swap mkdir /hive/data/genomes/oryLat2/bed/blastz.mm10.swap cd /hive/data/genomes/oryLat2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m52.846s cat fb.oryLat2.chainMm10Link.txt # 45954178 bases of 700386597 (6.561%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryLat2/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz lamprey petMar1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10PetMar1 mkdir /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: lamprey petMar1 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/scratch/data/petMar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 77m3.923s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # missing qRepeats specification rm axtChain/mm10.petMar1.net time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust -continue=load `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > load.log 2>&1 & # real 6m31.527s cat fb.mm10.chainPetMar1Link.txt # 29205053 bases of 2652783500 (1.101%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/mm10/bed ln -s lastzPetMar1.2012-04-02 lastz.petMar1 # and for the swap mkdir /hive/data/genomes/petMar1/bed/blastz.mm10.swap cd /hive/data/genomes/petMar1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02/DEF \ -qRepeats=windowmaskerSdust -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 17m40.196s cat fb.petMar1.chainMm10Link.txt # 26274715 bases of 831696438 (3.159%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/petMar1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### ## 60-Way Multiz (DONE - 2011-09-28 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way cd /hive/data/genomes/mm10/bed/multiz60way # from the 62-way in the source tree, do not need ailMis1 and croPor1: /cluster/bin/phast/tree_doctor --prune ailMis1,croPor1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/62way.nh > 60way.nh # note, newer assemblies: susScr3, dasNov3, felCat5, hetGla2, turTru2, # nomLeu2, oreNil2 # what that looks like: cat 60way.nh # (((((((((((((((((((hg19:0.006550,panTro4:0.006840):0.002220, # gorGor3:0.008964):0.009693,ponAbe2:0.018940):0.003471, # nomLeu2:0.022270):0.012040,(rheMac3:0.007991, # papHam1:0.008042):0.029610):0.021830,(calJac3:0.030000, # saiBol1:0.040000):0.039650):0.052090,tarSyr1:0.111400):0.020520, # (micMur1:0.085600,otoGar3:0.119400):0.020520):0.015494, # tupBel1:0.186203):0.004937,(((((mm10:0.084509,rn5:0.091589):0.197773, # dipOrd1:0.211609):0.022992,(hetGla2:0.100000, # cavPor3:0.125629):0.100000):0.010150,speTri2:0.148468):0.025746, # (oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313):0.020593, # (((susScr3:0.120000,(vicPac1:0.087275,(turTru2:0.064688, # (oviAri1:0.100000,bosTau7:0.100000):0.023592):0.025153):0.020335):0.020000, # 
((equCab2:0.109397,(felCat5:0.098612, # (canFam3:0.052458,ailMel1:0.050000):0.050000):0.049845):0.006219, # (myoLuc2:0.142540,pteVam1:0.113399):0.033706):0.004508):0.011671, # (eriEur1:0.221785,sorAra1:0.269562):0.056393):0.021227):0.023664, # ((((loxAfr3:0.082242,proCap1:0.155358):0.026990,echTel1:0.245936):0.010000, # triMan1:0.100000):0.049697,(dasNov3:0.116664, # choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686, # (sarHar1:0.100000,macEug2:0.072008):0.050000):0.215100):0.071664, # ornAna1:0.456592):0.109504,(((((melGal1:0.100000,galGal4:0.065536):0.100000, # taeGut1:0.171542):0.199223,melUnd1:0.100000):0.155143, # anoCar2:0.539241):0.122371,chrPic1:0.200000):0.010000):0.050000, # xenTro3:0.855573):0.100000,latCha1:0.855573):0.311354, # ((((((tetNig2:0.224159,fr3:0.203847):0.097590,oreNil2:0.200000):0.097590, # gasAcu1:0.316413):0.030000,oryLat2:0.511970):0.030000, # gadMor1:0.350000):0.225640,danRer7:0.730752):0.147949):0.526688, # petMar1:0.526688); # rearrange to get mm10 on top: cat << '_EOF_' > mm10.60way.nh (((((((((((((((mm10:0.084509,rn5:0.091589):0.197773,dipOrd1:0.211609):0.022992, (hetGla2:0.1,cavPor3:0.125629):0.1):0.01015,speTri2:0.148468):0.025746,(oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313, (((((((((hg19:0.00655,panTro4:0.00684):0.00222,gorGor3:0.008964):0.009693,ponAbe2:0.01894):0.003471, nomLeu2:0.02227):0.01204,(rheMac3:0.007991,papHam1:0.008042):0.02961):0.02183, (calJac3:0.03,saiBol1:0.04):0.03965):0.05209,tarSyr1:0.1114):0.02052,(micMur1:0.0856,otoGar3:0.1194):0.02052):0.015494, tupBel1:0.186203):0.004937):0.020593, ((susScr3:0.12,(vicPac1:0.087275,(turTru2:0.064688, (oviAri1:0.1,bosTau7:0.1):0.023592):0.025153):0.020335):0.01, ((((felCat5:0.098612, (canFam3:0.052458,ailMel1:0.05):0.05):0.049845,equCab2:0.109397):0.006219, (myoLuc2:0.14254,pteVam1:0.113399):0.033706):0.004508,(eriEur1:0.221785, sorAra1:0.269562):0.056393):0.021227):0.01):0.013664,((((loxAfr3:0.082242,proCap1:0.155358):0.02699, 
echTel1:0.245936):0.01,triMan1:0.1):0.049697,(dasNov3:0.116664, choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686,(sarHar1:0.1, macEug2:0.072008):0.05):0.2151):0.071664,ornAna1:0.456592):0.109504, (((((melGal1:0.1,galGal4:0.065536):0.1,taeGut1:0.171542):0.199223,melUnd1:0.1):0.155143,anoCar2:0.539241):0.122371, chrPic1:0.2):0.01):0.05,xenTro3:0.855573):0.1,latCha1:0.855573):0.311354, ((((((tetNig2:0.224159,fr3:0.203847):0.09759,oreNil2:0.2):0.09759,gasAcu1:0.316413):0.03, oryLat2:0.51197):0.03,gadMor1:0.35):0.22564,danRer7:0.730752):0.147949):0.526688,petMar1:0.526688); '_EOF_' # << happy emacs # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.60way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" mm10.60way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | sed -e 's/X__trop/X._trop/' > mm10.60way.commonNames.nh # (((((((((((((((Mouse:0.084509,Rat:0.091589):0.197773, # Kangaroo_rat:0.211609):0.022992,(Naked_mole:0.1, # Guinea_pig:0.125629):0.1):0.01015,Squirrel:0.148468):0.025746, # (Rabbit:0.114227,Pika:0.201069):0.101463):0.015313, # (((((((((Human:0.00655,Chimp:0.00684):0.00222,Gorilla:0.008964):0.009693, # Orangutan:0.01894):0.003471,Gibbon:0.02227):0.01204, # (Chinese_rhesus:0.007991,Baboon:0.008042):0.02961):0.02183, # (Marmoset:0.03,Squirrel_monkey:0.04):0.03965):0.05209, # Tarsier:0.1114):0.02052,(Mouse_lemur:0.0856, # Bushbaby:0.1194):0.02052):0.015494,Tree_shrew:0.186203):0.004937):0.020593, # ((Pig:0.12,(Alpaca:0.087275,(Dolphin:0.064688, # 
(Sheep:0.1,Cow:0.1):0.023592):0.025153):0.020335):0.01,
# ((((Cat:0.098612,(Dog:0.052458,Panda:0.05):0.05):0.049845,
# Horse:0.109397):0.006219,(Microbat:0.14254,
# Megabat:0.113399):0.033706):0.004508,(Hedgehog:0.221785,
# Shrew:0.269562):0.056393):0.021227):0.01):0.013664,
# ((((Elephant:0.082242,Rock_hyrax:0.155358):0.02699,
# Tenrec:0.245936):0.01,Manatee:0.1):0.049697,
# (Armadillo:0.116664,Sloth:0.096357):0.053145):0.006717):0.234728,
# (Opossum:0.125686,(Tasmanian_devil:0.1,
# Wallaby:0.072008):0.05):0.2151):0.071664,Platypus:0.456592):0.109504,
# (((((Turkey:0.1,Chicken:0.065536):0.1,Zebra_finch:0.171542):0.199223,
# Budgerigar:0.1):0.155143,Lizard:0.539241):0.122371,
# Painted_turtle:0.2):0.01):0.05,X._tropicalis:0.855573):0.1,
# Coelacanth:0.855573):0.311354,((((((Tetraodon:0.224159,
# Fugu:0.203847):0.09759,Nile_tilapia:0.2):0.09759,
# Stickleback:0.316413):0.03,Medaka:0.51197):0.03,
# Atlantic_cod:0.35):0.22564,Zebrafish:0.730752):0.147949):0.526688,
# Lamprey:0.526688);

# Use this specification in the phyloGif tool:
# http://genome.ucsc.edu/cgi-bin/phyloGif
# to obtain a png image for src/hg/htdocs/images/phylo/mm10_60way.png

# remove the "mm10<tab>" prefix from each line so that the distance is
# field 2 for the numeric sort; \t is used here (as in the db.to.name.txt
# sed above) because a literal ^I is not a tab when this line is pasted
/cluster/bin/phast/all_dists mm10.60way.nh | grep mm10 \
    | sed -e "s/mm10\t//" | sort -k2n > 60way.distances.txt
# Use this output to create the table below
head 60way.distances.txt
# rn5 0.176098
# speTri2 0.463892
# micMur1 0.483034
# dipOrd1 0.493891
# vicPac1 0.504686
# hetGla2 0.505274
# hg19 0.505328
# gorGor3 0.505522
# panTro4 0.505618
# nomLeu2 0.505664

cat << '_EOF_' > sizeStats.pl
#!/usr/bin/env perl

use strict;
use warnings;

# Reads "db distance" lines from 60way.distances.txt and prints one table
# row per db: row number, distance, featureBits chainLink percent on mm10,
# the same percent from the swapped alignment on the other genome, the
# organism name from dbDb, and the db name.

open (my $fh, "<", "60way.distances.txt")
    or die "can not read 60way.distances.txt";

my $count = 0;
while (my $line = <$fh>) {
    chomp $line;
    my ($D, $dist) = split('\s+', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." .
        $chain .
"Link.txt";
    # field 5 of the fb.*.txt summary is the "(N.NNN%)" figure; the sed
    # drops the parentheses and the substitution below drops the percent sign
    my $chainLinkMeasure =
        `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`;
    chomp $chainLinkMeasure;
    $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1);
    $chainLinkMeasure =~ s/\%//;
    my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm10/fb.${D}.chainMm10Link.txt";
    my $swapMeasure = "N/A";
    if ( -s $swapFile ) {
        $swapMeasure =
            `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`;
        chomp $swapMeasure;
        $swapMeasure = 0.0 if (length($swapMeasure) < 1);
        $swapMeasure =~ s/\%//;
    }
    my $orgName=
        `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`;
    chomp $orgName;
    if (length($orgName) < 1) {
        $orgName="N/A";
    }
    ++$count;
    # "N/A" is not a number: format the swap column separately so a missing
    # swap file prints N/A instead of a warning and a misleading 00.000
    my $swapText = $swapMeasure;
    $swapText = sprintf("%06.3f", $swapMeasure) if ($swapMeasure ne "N/A");
    printf "# %02d %.4f (%% %06.3f) (%% %s) - %s %s\n", $count, $dist,
        $chainLinkMeasure, $swapText, $orgName, $D;
}
close ($fh);
'_EOF_'
  # << happy emacs
chmod +x ./sizeStats.pl
./sizeStats.pl
#
# If you can fill in all the numbers in this table, you are ready for
# the multiple alignment procedure
# featureBits chainLink measures
# chainAnoCar2Link
# N distance on mm10 on other other species
# 01 0.1761 (% 67.353) (% 70.278) - Rat rn5
# 02 0.4639 (% 34.217) (% 39.244) - Squirrel speTri2
# 03 0.4830 (% 26.636) (% 37.574) - Mouse lemur micMur1
# 04 0.4939 (% 19.460) (% 27.512) - Kangaroo rat dipOrd1
# 05 0.5047 (% 22.636) (% 31.769) - Alpaca vicPac1
# 06 0.5053 (% 32.753) (% 37.989) - Naked mole rat hetGla2
# 07 0.5053 (% 38.226) (% 35.249) - Human hg19
# 08 0.5055 (% 33.987) (% 34.349) - Gorilla gorGor3
# 09 0.5056 (% 34.674) (% 31.924) - Chimp panTro4
# 10 0.5057 (% 34.031) (% 32.274) - Gibbon nomLeu2
# 11 0.5058 (% 34.496) (% 30.610) - Orangutan ponAbe2
# 12 0.5073 (% 30.267) (% 33.492) - Dolphin turTru2
# 13 0.5088 (% 24.560) (% 24.986) - Tarsier tarSyr1
# 14 0.5090 (% 33.931) (% 33.464) - Chinese rhesus rheMac3
# 15 0.5090 (% 33.577) (% 32.023) - Baboon papHam1
# 16 0.5168 (% 29.795) (% 32.926) - Bushbaby otoGar3
# 17 0.5171 (% 25.685) (% 29.445) - Pig susScr3
# 18 0.5192 (% 32.450) (%
31.301) - Marmoset calJac3 # 19 0.5284 (% 34.415) (% 37.138) - Horse equCab2 # 20 0.5292 (% 32.339) (% 33.848) - Squirrel monkey saiBol1 # 21 0.5309 (% 28.447) (% 29.115) - Guinea pig cavPor3 # 22 0.5470 (% 18.019) (% 23.687) - Sloth choHof1 # 23 0.5472 (% 26.546) (% 24.649) - Manatee triMan1 # 24 0.5476 (% 19.766) (% 25.144) - Tree shrew tupBel1 # 25 0.5569 (% 25.248) (% 25.677) - Rabbit oryCun2 # 26 0.5599 (% 27.345) (% 38.627) - Megabat pteVam1 # 27 0.5662 (% 26.255) (% 25.383) - Cow bosTau7 # 28 0.5662 (% 15.341) (% 31.925) - Sheep oviAri1 # 29 0.5664 (% 25.823) (% 21.616) - Elephant loxAfr3 # 30 0.5673 (% 25.201) (% 21.066) - Armadillo dasNov3 # 31 0.5675 (% 29.725) (% 32.244) - Cat felCat5 # 32 0.5689 (% 30.979) (% 35.562) - Panda ailMel1 # 33 0.5713 (% 29.144) (% 31.624) - Dog canFam3 # 34 0.5891 (% 24.363) (% 33.650) - Microbat myoLuc2 # 35 0.6395 (% 15.147) (% 16.214) - Rock hyrax proCap1 # 36 0.6437 (% 14.542) (% 19.908) - Pika ochPri2 # 37 0.6865 (% 09.856) (% 12.264) - Hedgehog eriEur1 # 38 0.7031 (% 10.947) (% 14.117) - Tenrec echTel1 # 39 0.7343 (% 09.382) (% 13.569) - Shrew sorAra1 # 40 0.9626 (% 04.353) (% 04.448) - Wallaby macEug2 # 41 0.9663 (% 09.584) (% 07.205) - Opossum monDom5 # 42 0.9906 (% 08.479) (% 07.888) - Tasmanian devil sarHar1 # 43 1.0166 (% 04.731) (% 05.488) - Painted turtle chrPic1 # 44 1.1537 (% 05.348) (% 07.334) - Platypus ornAna1 # 45 1.1942 (% 03.589) (% 07.350) - Budgerigar melUnd1 # 46 1.4589 (% 03.676) (% 08.100) - Chicken galGal4 # 47 1.4649 (% 03.599) (% 07.304) - Zebra finch taeGut1 # 48 1.4782 (% 03.331) (% 04.988) - Lizard anoCar2 # 49 1.4934 (% 03.511) (% 08.211) - Turkey melGal1 # 50 1.7122 (% 03.125) (% 06.651) - X. 
tropicalis xenTro3 # 51 1.8122 (% 02.715) (% 03.380) - Coelacanth latCha1 # 52 1.9916 (% 01.726) (% 06.810) - Atlantic cod gadMor1 # 53 1.9992 (% 01.957) (% 06.091) - Nile tilapia oreNil2 # 54 2.0180 (% 02.016) (% 10.927) - Stickleback gasAcu1 # 55 2.1006 (% 01.789) (% 12.134) - Fugu fr3 # 56 2.1209 (% 01.735) (% 13.642) - Tetraodon tetNig2 # 57 2.1467 (% 02.602) (% 05.107) - Zebrafish danRer7 # 58 2.1835 (% 01.936) (% 06.561) - Medaka oryLat2 # 59 2.3214 (% 01.101) (% 03.159) - Lamprey petMar1 # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.60way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # bash shell syntax here ... cd /hive/data/genomes/mm10/bed/multiz60way export H=/hive/data/genomes/mm10/bed mkdir mafLinks for G in `sed -e "s/mm10 //" species.list` do mkdir mafLinks/$G if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then echo "$G - recipBest" ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then echo "$G - synNet" ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then echo "$G - mafNet" ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G else echo "missing directory lastz.${G}/*Net" fi fi fi done # verify the alignment type is correct: for D in `grep -v mm10 /hive/users/hiram/bigWays/mm10.60way/ordered.list` do ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}' done # compare to the list at: # http://genomewiki.ucsc.edu/index.php/Mm10_Genome_size_statistics # need to split these things up into smaller pieces for # efficient kluster run. 
cd /hive/data/genomes/mm10/bed/multiz60way mkdir mafSplit cd mafSplit # mafSplitPos splits on gaps or repeat areas that will not have # any chains, approx 5 Mbp intervals, gaps at least 10,000 mafSplitPos -minGap=10000 mm10 5 stdout | sort -u \ | sort -k1,1 -k2,2n > mafSplit.bed # There is a splitRegions.pl script here (copied from previous hg19 46way) # that can create a custom track from this mafSplit.bed file. # Take a look at that in the browser and see if it looks OK, # check the number of sections on each chrom to verify none are # too large. Despite the claim above, it does appear that some # areas are split where actual chains exist. ./splitRegions.pl mafSplit.bed > splitRegions.ct # to see the sizes of the regions: grep "^chr" splitRegions.ct | awk '{print $3-$2,$0}' | sort -rn | less # run a kluster job to split them all ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/mafSplit cat << '_EOF_' > runOne #!/bin/csh -ef set G = $1 set C = $2 mkdir -p $G pushd $G > /dev/null if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then if ( -s mm10_${C}.00.maf ) then /bin/rm -f mm10_${C}.*.maf endif /cluster/bin/x86_64/mafSplit ../mafSplit.bed mm10_ ../../mafLinks/${G}/${C}.maf.gz /bin/gzip mm10_${C}.*.maf else /bin/touch mm10_${C}.00.maf /bin/gzip mm10_${C}.00.maf endif popd > /dev/null '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ $(root1)/mm10_$(root2).00.maf.gz} #ENDLOOP '_EOF_' # << happy emacs for G in `sed -e "s/mm10 //" ../species.list` do echo $G done > species.list cut -f 1 ../../../chrom.sizes > chr.list gensub2 species.list chr.list template jobList para -ram=8g create jobList para try ... check ... push ... etc... 
# Completed: 3894 of 3894 jobs # CPU time in finished jobs: 18929s 315.49m 5.26h 0.22d 0.001 y # IO & Wait Time: 62908s 1048.46m 17.47h 0.73d 0.002 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 346s 5.77m 0.10h 0.00d # Submission to last job: 471s 7.85m 0.13h 0.01d # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | grep "maf.gz" | wc -l # 19733 find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 336 maf.list mkdir /hive/data/genomes/mm10/bed/multiz60way/splitRun cd /hive/data/genomes/mm10/bed/multiz60way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = mm10 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/mm10/bed/multiz60way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! 
-s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \
    > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
'_EOF_'
  # << happy emacs
chmod +x autoMultiz.csh

cat << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/mm10/bed/multiz60way/splitRun/maf/$(root1)}
#ENDLOOP
'_EOF_'
  # << happy emacs

ln -s ../../mafSplit/maf.list maf.list
ssh swarm
cd /hive/data/genomes/mm10/bed/multiz60way/splitRun/run
# the tac reverses the list to get the small jobs first
gensub2 maf.list single template stdout | tac > jobList
para -ram=8g create jobList
# Completed: 336 of 336 jobs
# CPU time in finished jobs: 2828651s 47144.19m 785.74h 32.74d 0.090 y
# IO & Wait Time: 200533s 3342.21m 55.70h 2.32d 0.006 y
# Average job time: 9015s 150.26m 2.50h 0.10d
# Longest finished job: 47029s 783.82m 13.06h 0.54d
# Submission to last job: 48982s 816.37m 13.61h 0.57d

# put the split maf results back together into a single maf file
# eliminate duplicate comments
ssh hgwdev
cd /hive/data/genomes/mm10/bed/multiz60way/splitRun
mkdir ../maf
# the sed edits take out partitioning name information from the comments
# so the multiple parts will condense to smaller number of lines
# this takes almost 2 hours of time, resulting in a bit over 150 Gb,
# almost all chrom files over 1 Gb, up to almost 10 Gb for chr2
# HOWEVER, this is actually not necessary to maintain these comments,
# they are lost during the mafAddIRows

# write runOne with > (not >>), like every other heredoc in this file:
# appending would leave two copies of the script in runOne if this step
# is ever repeated in the same directory
cat << '_EOF_' > runOne
#!/bin/csh -fe
set C = $1
if ( -s ../maf/${C}.maf.gz ) then
    rm -f ../maf/${C}.maf.gz
endif
head -q -n 1 maf/mm10_${C}.*.maf | sort -u > ../maf/${C}.maf
grep -h "^#" maf/mm10_${C}.*.maf | egrep -v "maf version=1|eof maf" | \
    sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \
        | sort -u >> ../maf/${C}.maf
grep -h -v "^#" `ls maf/mm10_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf
tail -q -n 1 maf/mm10_${C}.*.maf | sort -u >> ../maf/${C}.maf
'_EOF_'
  # << happy emacs
chmod +x runOne

# same reason for > here: an appended template would carry a second
# #LOOP section and gensub2 would emit every job twice
cat << '_EOF_' > template
#LOOP
runOne $(root1) {check out exists+ ../maf/$(root1).maf}
#ENDLOOP
'_EOF_'
  # << happy emacs

cut -f1 ../../../chrom.sizes > chr.list
ssh encodek
cd /hive/data/genomes/mm10/bed/multiz60way/splitRun
gensub2 chr.list single template jobList
para -ram=8g create jobList
para try ... check ... push ... etc ...
# Completed: 62 of 66 jobs
# Crashed: 4 jobs
# CPU time in finished jobs: 461s 7.68m 0.13h 0.01d 0.000 y
# IO & Wait Time: 17863s 297.72m 4.96h 0.21d 0.001 y
# Average job time: 296s 4.93m 0.08h 0.00d
# Longest finished job: 1144s 19.07m 0.32h 0.01d
# Submission to last job: 1156s 19.27m 0.32h 0.01d

# these four have empty results:
# chrUn_GL456383
# chrUn_GL456389
# chrUn_GL456390
# chrUn_GL456396

# Load into database
ssh hgwdev
mkdir -p /gbdb/mm10/multiz60way
cd /hive/data/genomes/mm10/bed/multiz60way/maf
ln -s `pwd`/*.maf /gbdb/mm10/multiz60way
# this generates an immense multiz60way.tab file in the directory
# where it is running. Best to run this over in scratch.
# This is going to take all day.
cd /scratch/tmp time nice -n +19 hgLoadMaf mm10 multiz60way # Loaded 56185270 mafs in 66 files from /gbdb/mm10/multiz60way # real 72m45.513s # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab time cat /gbdb/mm10/multiz60way/*.maf \ | nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin # Created 12012784 summary blocks from 1074134156 components and # 56185270 mafs from stdin # real 104m2.107s wc -l multiz60way*.tab # 56185270 multiz60way.tab # 12012784 multiz60waySummary.tab # 68198054 total # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab # -rw-rw-r-- 1 567210414 Apr 18 17:28 multiz60waySummary.tab rm multiz60way*.tab ####################################################################### # GAP ANNOTATE MULTIZ60WAY MAF AND LOAD TABLES (DONE - 2012-05-31 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # is in a single file. mkdir -p /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno # check for N.bed files everywhere: for DB in `cat ../species.list` do if [ !
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/mm10/bed/multiz60way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S mm10 # use a screen to control this longish job ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/anno mkdir result # NEXT TIME: this template should have a check out exists+ statement cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/mm10/mm10.2bit {check out line+ result/$(file1)} #ENDLOOP '_EOF_' # << happy emacs ls ../maf/*.maf > maf.list # the tac puts the short jobs first gensub2 maf.list single template stdout | tac > jobList # limit jobs to one per node with the ram=8g requirement para -ram=8g create jobList para try ... check ... push ... # Completed: 46 of 66 jobs # CPU time in finished jobs: 350s 5.83m 0.10h 0.00d 0.000 y # IO & Wait Time: 603s 10.06m 0.17h 0.01d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 54s 0.90m 0.01h 0.00d # Submission to last job: 113s 1.88m 0.03h 0.00d # a number of these jobs did not finish due to memory limitations. # The jobs would sit on the nodes appearing to occupy 8 Gb of memory, # but did not see any swapping or CPU time accumulation. 
Stop the # batch and run the rest manually on hgwdev: #!/bin/sh export maxMem=188743680 ulimit -S -m $maxMem -v $maxMem mafAddIRows -nBeds=nBeds ../maf/chrX.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chrX.maf & mafAddIRows -nBeds=nBeds ../maf/chr9.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr9.maf & mafAddIRows -nBeds=nBeds ../maf/chr8.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr8.maf & mafAddIRows -nBeds=nBeds ../maf/chr7.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr7.maf & wait mafAddIRows -nBeds=nBeds ../maf/chr6.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr6.maf & mafAddIRows -nBeds=nBeds ../maf/chr5.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr5.maf & mafAddIRows -nBeds=nBeds ../maf/chr4.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr4.maf & mafAddIRows -nBeds=nBeds ../maf/chr3.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr3.maf & wait ... etc ... # the run time for those 20 jobs: # real 159m49.217s # verify all result files have some content, look for 0 size files: find . -type f -size 0 # should see none # combine into one file (realized after this, that we do *not* need # this single file. Individual files are OK.) head -q -n 1 result/chrM.maf > mm10.60way.maf time for F in hgwdev/*.maf result/*.maf do grep -h -v "^#" ${F} done >> mm10.60way.maf # real 1082m47.484s -> 18 hours !
# -rw-rw-r-- 1 261567878241 Jun 8 10:30 mm10.60way.maf
du -hsc mm10.60way.maf
# 244G mm10.60way.maf

# these maf files do not have the end marker, this does nothing:
# tail -q -n 1 result/chrM.maf >> mm10.60way.maf
# How about an official end marker:
echo "##eof maf" >> mm10.60way.maf

# construct symlinks to get the individual maf files into gbdb:
mkdir /gbdb/mm10/multiz60way/maf
ln -s `pwd`/result/*.maf `pwd`/hgwdev/*.maf /gbdb/mm10/multiz60way/maf/

# Load into database
rm /gbdb/mm10/multiz60way/*.maf # remove previous results
cd /scratch/tmp
time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm10/multiz60way/maf \
    mm10 multiz60way
# Loaded 58087742 mafs in 66 files from /gbdb/mm10/multiz60way/maf
# real 868m28.108s
time (cat /gbdb/mm10/multiz60way/maf/*.maf \
    | hgLoadMafSummary -verbose=2 -minSize=30000 \
        -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin)
# -rw-rw-r-- 1 3009209972 Jun 9 03:23 multiz60way.tab
# -rw-rw-r-- 1 591235982 Jun 11 18:34 multiz60waySummary.tab
rm multiz60way*.tab

#######################################################################
# MULTIZ60WAY MAF FRAMES (DONE - 2012-05-30 - Hiram)
ssh hgwdev
mkdir /hive/data/genomes/mm10/bed/multiz60way/frames
cd /hive/data/genomes/mm10/bed/multiz60way/frames
# survey all the genomes to find out what kinds of gene tracks they have
cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
# for each db in ../species.list: print the row count of each gene table
# of interest, then the count of hg19 gbCdnaInfo rows for its organism
foreach db (`cat ../species.list`)
    echo -n "${db}: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
        if ($table == "ensGene" || $table == "refGene" || \
            $table == "mgcGenes" || $table == "knownGene" || \
            $table == "xenoRefGene" ) then
            set count = `hgsql $db -N -e "select count(*) from $table"`
            echo -n "${table}: ${count}, "
        endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
        "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql hg19 -N -e \
        "select id from organism where name='$orgName'"`
    # quote $orgId: when the lookup returns nothing the unquoted form
    # expands to "if ( == "")", an expression syntax error that stops
    # this -e script exactly in the case this test is meant to handle
    if ("$orgId" == "") then
        echo "Mrnas: 0"
    else
        set count = `hgsql hg19 -N -e "select
count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh > showGenes.txt # real 9m11.678s # rearrange that output to create four sections, and place these names # in .list files here: # 1. knownGene: hg19 # 2. refGene: bosTau7 danRer7 galGal4 mm10 rheMac3 rn5 susScr3 xenTro3 # 3. ensGene: ailMel1 anoCar2 calJac3 cavPor3 choHof1 dipOrd1 echTel1 # equCab2 eriEur1 fr3 gasAcu1 gorGor3 loxAfr3 melGal1 # micMur1 monDom5 myoLuc2 ochPri2 ornAna1 oryCun2 oryLat2 # panTro4 ponAbe2 proCap1 pteVam1 sorAra1 taeGut1 tarSyr1 # tetNig2 tupBel1 vicPac1 # 4. xenoRefGene: canFam3 chrPic1 dasNov3 felCat5 hetGla2 latCha1 macEug2 # nomLeu2 otoGar3 oviAri1 papHam1 petMar1 saiBol1 sarHar1 # triMan1 # 5. genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 mkdir genes # 1. knownGene: hg19 hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg19 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/hg19.gp.gz # 2. refGene, want the full extended genePred: for DB in `cat refGene.list` do hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 3. ensGene, want the full extended genePred: for DB in `cat ensGene.list` do hgsql -N -e "select * from ensGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 4. xenoRefGene, want the full extended genePred: for DB in `cat xenoRG.list` do hgsql -N -e "select * from xenoRefGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 5. 
genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 # this was done in error the first time, mistakenly using # the xenoRefGene table instead of genscan for DB in `cat genscan.list` do hgsql -N -e "select * from genscan" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ailMel1.gp.gz: 19204 # genes/anoCar2.gp.gz: 17766 # genes/bosTau7.gp.gz: 12958 # genes/calJac3.gp.gz: 20843 # genes/canFam3.gp.gz: 20652 # genes/cavPor3.gp.gz: 18631 # genes/choHof1.gp.gz: 12403 # genes/chrPic1.gp.gz: 19433 # genes/danRer7.gp.gz: 13902 # genes/dasNov3.gp.gz: 29551 # genes/dipOrd1.gp.gz: 15784 # genes/echTel1.gp.gz: 16499 # genes/equCab2.gp.gz: 20403 # genes/eriEur1.gp.gz: 11712 # genes/felCat5.gp.gz: 19512 # genes/fr3.gp.gz: 18014 # genes/gadMor1.gp.gz: 27572 # genes/galGal4.gp.gz: 4892 # genes/gasAcu1.gp.gz: 20631 # genes/gorGor3.gp.gz: 20759 # genes/hetGla2.gp.gz: 25749 # genes/hg19.gp.gz: 20718 # genes/latCha1.gp.gz: 18786 # genes/loxAfr3.gp.gz: 19986 # genes/macEug2.gp.gz: 26006 # genes/melGal1.gp.gz: 14050 # genes/melUnd1.gp.gz: 15296 # genes/micMur1.gp.gz: 16240 # genes/mm10.gp.gz: 20985 # genes/monDom5.gp.gz: 19188 # genes/myoLuc2.gp.gz: 19685 # genes/nomLeu2.gp.gz: 22996 # genes/ochPri2.gp.gz: 15970 # genes/oreNil2.gp.gz: 18636 # genes/ornAna1.gp.gz: 17728 # genes/oryCun2.gp.gz: 18921 # genes/oryLat2.gp.gz: 19576 # genes/otoGar3.gp.gz: 24061 # genes/oviAri1.gp.gz: 17890 # genes/panTro4.gp.gz: 18647 # genes/papHam1.gp.gz: 27842 # genes/petMar1.gp.gz: 11089 # genes/ponAbe2.gp.gz: 19895 # genes/proCap1.gp.gz: 16043 # genes/pteVam1.gp.gz: 16966 # genes/rheMac3.gp.gz: 5580 # genes/rn5.gp.gz: 16393 # genes/saiBol1.gp.gz: 23419 # genes/sarHar1.gp.gz: 20694 # genes/sorAra1.gp.gz: 13156 # genes/speTri2.gp.gz: 22377 # genes/susScr3.gp.gz: 
3771 # genes/taeGut1.gp.gz: 17354 # genes/tarSyr1.gp.gz: 13615 # genes/tetNig2.gp.gz: 19539 # genes/triMan1.gp.gz: 19514 # genes/tupBel1.gp.gz: 15407 # genes/turTru2.gp.gz: 28375 # genes/vicPac1.gp.gz: 11754 # genes/xenTro3.gp.gz: 8447 # kluster job to annotate each maf file screen -S mm10 # manage long running procedure with screen ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/frames cat << '_EOF_' > runOne #!/bin/csh -fe set C = $1 set G = $2 cat ../maf/${C}.maf | genePredToMafFrames mm10 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz '_EOF_' # << happy emacs chmod +x runOne # older instructions excluded mm10 from the gene.list # this was a mistake. mm10 can be annotated too. # Mistakenly did this the first run through, had to manually # do the mm10 genes separately on hgwdev after this was done ls ../maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' # << happy emacs mkdir parts gensub2 chr.list gene.list template jobList para -ram=8g create jobList para try ... check ... 
push # Completed: 3960 of 3960 jobs # CPU time in finished jobs: 85610s 1426.83m 23.78h 0.99d 0.003 y # IO & Wait Time: 2030956s 33849.27m 564.15h 23.51d 0.064 y # Average job time: 534s 8.91m 0.15h 0.01d # Longest finished job: 3877s 64.62m 1.08h 0.04d # Submission to last job: 12974s 216.23m 3.60h 0.15d # collect all results into one file: cd /hive/data/genomes/mm10/bed/multiz60way/frames find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n > multiz60wayFrames.bed # -rw-rw-r-- 1 1164299719 May 30 11:28 multiz60wayFrames.bed # verify there are frames on everything: cut -f4 multiz60wayFrames.bed | sort | uniq -c | sort -n \ > annotation.survey.txt # should be 60 species: wc -l annotation.survey.txt # 60 annotation.survey.txt # and the minimum numbers: head annotation.survey.txt # 43900 susScr3 # 59839 rheMac3 # 153246 petMar1 # 162501 choHof1 # ... etc ... # load the resulting file ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/frames time gzip multiz60wayFrames.bed # real 0m51.826s # reloading this table 2012-10-11 with more accurate frames: time hgLoadMafFrames mm10 multiz60wayFrames multiz60wayFrames.bed.gz # real 3m2.449s time featureBits -countGaps mm10 multiz60wayFrames # 57707702 bases of 2730871774 (2.113%) in intersection # real 1m45.141s # reload table to fix frames problems 2014-03-19 - Hiram time featureBits -countGaps mm10 multiz60wayFrames # 79955378 bases of 2730871774 (2.928%) in intersection # enable the trackDb entries: # frames multiz60wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 60-way (DONE - 2012-05-31 - 2012-06-12 - Hiram) mkdir /hive/data/genomes/mm10/bed/multiz60way/4d cd /hive/data/genomes/mm10/bed/multiz60way/4d # the annotated maf's are in: ../anno/result/*.maf # using ensGene for mm10, only transcribed genes and nothing # from the randoms and other misc. 
hgsql mm10 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \ | egrep -E -v "chrM|chrUn|random|_hap" > ensGene.gp wc -l *.gp # 55423 ensGene.gp genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 22457 ensGeneNR.gp ssh encodek mkdir /hive/data/genomes/mm10/bed/multiz60way/4d/run cd /hive/data/genomes/mm10/bed/multiz60way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/mm10/bed/multiz60way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tmm10.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/mm10/bed/multiz60way/anno/result/*.maf \ | sed -e "s#.*multiz60way/anno/result/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # the tac puts the quick jobs at the front gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... 
check para -maxJob=5 push para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 13176s 219.60m 3.66h 0.15d 0.000 y # IO & Wait Time: 31790s 529.84m 8.83h 0.37d 0.001 y # Average job time: 681s 11.36m 0.19h 0.01d # Longest finished job: 2883s 48.05m 0.80h 0.03d # Submission to last job: 2925s 48.75m 0.81h 0.03d # combine mfa files ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 ls -og mfa | awk '$3 == 1' | awk '{print $NF}' > empty.list sed -e "s#^#mfa/##" empty.list | xargs rm -f #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 60 # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 98m59.203s mv phyloFit.mod all.mod grep TREE all.mod #TREE: 
(((((((((((((((mm10:0.0855383,rn5:0.0922719):0.202381,dipOrd1:0.210819):0.0258471,(hetGla2:0.0917322,cavPor3:0.136876):0.0994271):0.00910944,speTri2:0.145483):0.0274969,(oryCun2:0.109639,ochPri2:0.200966):0.102067):0.0141654,(((((((((hg19:0.00674057,panTro4:0.00692231):0.00309904,gorGor3:0.00918625):0.00954082,ponAbe2:0.0191843):0.00356049,nomLeu2:0.0218207):0.0116848,(rheMac3:0.00814945,papHam1:0.0079848):0.0289473):0.0208338,(calJac3:0.0342405,saiBol1:0.0333221):0.0359171):0.0594469,tarSyr1:0.137467):0.011091,(micMur1:0.0918138,otoGar3:0.127231):0.0351527):0.0153171,tupBel1:0.18879):0.0042463):0.0214646,((susScr3:0.121641,(vicPac1:0.109818,(turTru2:0.0635753,(oviAri1:0.0392493,bosTau7:0.0315816):0.0939861):0.0203711):0.00368417):0.0444758,((((felCat5:0.0897448,(canFam3:0.0888602,ailMel1:0.0767935):0.021837):0.05011,equCab2:0.109367):0.00605998,(myoLuc2:0.137144,pteVam1:0.114013):0.0339604):0.00395001,(eriEur1:0.226934,sorAra1:0.270619):0.0628319):0.00292667):0.0291403):0.0231397,((((loxAfr3:0.078841,proCap1:0.160295):0.00825096,echTel1:0.266786):0.0031636,triMan1:0.0685675):0.0736043,(dasNov3:0.112086,choHof1:0.0974658):0.0535724):0.00739115):0.245967,(monDom5:0.139913,(sarHar1:0.132596,macEug2:0.111778):0.0294309):0.21273):0.0770867,ornAna1:0.50425):0.135096,(((((melGal1:0.067697,galGal4:0.05253):0.13729,taeGut1:0.202681):0.00899388,melUnd1:0.127774):0.216078,anoCar2:0.575186):0.0128221,chrPic1:0.201659):0.137011):0.113527,xenTro3:0.943162):0.0646458,latCha1:0.596956):0.463611,((((((tetNig2:0.223213,fr3:0.198755):0.263107,oreNil2:0.33649):0.0139699,gasAcu1:0.314841):0.0573697,oryLat2:0.430105):0.185668,gadMor1:0.562778):0.169352,danRer7:0.753326):0.117017):0.501088,petMar1:0.501088); # four different subset lists: paste glire.list euarchontoglires.list placental.list all.list # mm10 mm10 mm10 mm10 # rn5 rn5 rn5 rn5 # dipOrd1 dipOrd1 dipOrd1 dipOrd1 # hetGla2 hetGla2 hetGla2 hetGla2 # cavPor3 cavPor3 cavPor3 cavPor3 # speTri2 speTri2 speTri2 speTri2 # oryCun2 
oryCun2 oryCun2 oryCun2 # ochPri2 ochPri2 ochPri2 ochPri2 # tupBel1 tupBel1 tupBel1 # hg19 hg19 hg19 # gorGor3 gorGor3 gorGor3 # panTro4 panTro4 panTro4 # nomLeu2 nomLeu2 nomLeu2 # ponAbe2 ponAbe2 ponAbe2 # tarSyr1 tarSyr1 tarSyr1 # rheMac3 rheMac3 rheMac3 # papHam1 papHam1 papHam1 # otoGar3 otoGar3 otoGar3 # calJac3 calJac3 calJac3 # micMur1 micMur1 micMur1 # saiBol1 saiBol1 saiBol1 # equCab2 equCab2 # vicPac1 vicPac1 # turTru2 turTru2 # susScr3 susScr3 # bosTau7 bosTau7 # oviAri1 oviAri1 # pteVam1 pteVam1 # myoLuc2 myoLuc2 # felCat5 felCat5 # canFam3 canFam3 # ailMel1 ailMel1 # eriEur1 eriEur1 # sorAra1 sorAra1 # choHof1 choHof1 # dasNov3 dasNov3 # proCap1 proCap1 # echTel1 echTel1 # triMan1 triMan1 # loxAfr3 loxAfr3 # macEug2 # sarHar1 # monDom5 # ornAna1 # galGal4 # taeGut1 # melGal1 # melUnd1 # anoCar2 # chrPic1 # xenTro3 # latCha1 # gadMor1 # gasAcu1 # fr3 # oreNil2 # tetNig2 # danRer7 # oryLat2 # petMar1 # on organisms that do not have all species in all files, the file names # need to be filtered. 
# Using this perl script to extract from
# the full mfa files, only the subset of species from the four lists:

    cat << '_EOF_' > filterMfa.pl
#!/usr/bin/env perl
#
# filterMfa.pl - for every mfa/*.mfa file, write a copy into <subset>Mfa/
#   keeping only the sequences whose "> dbName" header names a db that
#   is listed (one per line) in <subset>.list
#
# usage: filterMfa.pl <subset.list>
#   e.g. glire.list reads mfa/*.mfa and writes glireMfa/*.mfa
#   (the output directory must already exist)

use strict;
use warnings;

if (scalar(@ARGV) != 1) {
    print STDERR "usage: filterMfa.pl <subset.list>\n";
    exit 255;
}

my $listFile = shift;

# db names to keep, one per line in the list file
my %dbList;
open (my $listFh, '<', $listFile) or die "can not read $listFile: $!";
print STDERR "using list: $listFile\n";
while (my $db = <$listFh>) {
    chomp $db;
    next if ($db eq '');    # a blank line is not a db name
    $dbList{$db} = 1;
}
close ($listFh);

# output directory name: glire.list -> glireMfa
# strip only a literal trailing ".list", nothing else in the name
my $dirName = $listFile;
$dirName =~ s/\.list$//;
$dirName .= "Mfa";

# glob instead of `ls` avoids a sub-shell and parsing its output
foreach my $mfaFile (glob("mfa/*.mfa")) {
    my $chr = $mfaFile;
    $chr =~ s#^mfa/##;
    my $outFile = "$dirName/$chr";
    open (my $in, '<', $mfaFile) or die "can not read $mfaFile: $!";
    open (my $out, '>', $outFile) or die "can not write to $outFile: $!";
    my $inGroup = 0;    # true while inside a sequence that is being kept
    while (my $line = <$in>) {
        if ($line =~ m/^> /) {
            # header line "> dbName": decide if this sequence is kept
            chomp $line;
            my (undef, $faDbName) = split('\s+', $line);
            $inGroup = (defined($faDbName) && exists($dbList{$faDbName}))
                ? 1 : 0;
            print $out "> $faDbName\n" if ($inGroup);
        } elsif ($inGroup) {
            print $out $line;
        }
    }
    close ($in);
    # buffered write errors (disk full, quota) only show up at close
    close ($out) or die "can not finish writing $outFile: $!";
}
'_EOF_'
    # << happy emacs
    chmod +x filterMfa.pl
    mkdir glireMfa euarchontogliresMfa placentalMfa vertebrateMfa

    # extract each set from the full mfa files, run msa_view on
    # each subset and construct .nh tree for that subset
    for N in glire euarchontoglires placental vertebrate
    do
      ./filterMfa.pl ${N}.list
      /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \
        --aggregate "`cat ${N}.list|xargs echo`" ${N}Mfa/*.mfa \
          | sed s/"> "/">"/ > 4d.${N}.mfa
      /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \
        --no-branchlen --prune-all-but="`cat ${N}.list|xargs echo`" \
          ../tree-commas.nh > tree-commas.${N}.nh
    done

### XXX
### MOST INTERESTING, this phyloFit operation was repeated
### to verify that the full 60 species vertebrate operation produced the
### same result as the
### original "all" subset.  This phyloFit appears to
### produce a different result each time ?

    # use phyloFit to create tree model (output is phyloFit.mod)
    for N in glire euarchontoglires placental vertebrate
    do
      time nice -n +19 \
        /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \
          --EM --precision MED --msa-format FASTA --subst-mod REV \
            --tree ./tree-commas.${N}.nh 4d.${N}.mfa
      mv phyloFit.mod ${N}.mod
      grep TREE ${N}.mod | sed 's/TREE\:\ //' > ${N}.Nway.nh
    done
    # real 0m15.747s
    # real 4m5.526s
    # real 20m45.982s
    # real 141m21.248s

#######################################################################
# phastCons 60-way (DONE - 2012-06-12, 2012-08-21 - Hiram)
    # was unable to split the full chrom MAF files, now working on the
    # maf files as they were split up during multiz

    # split 60way mafs into 10M chunks and generate sufficient statistics
    # files for # phastCons
    ssh encodek
    mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/ss
    mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split
    cd /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
# doSplit.csh <chrom> <doneFile>
#   run msa_split on anno/result/<chrom>.maf, writing the SS windows
#   into cons/ss/<chrom>/ ; <doneFile> is stamped when finished and
#   <doneFile>.running marks a split in progress
set c = $1
set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf
set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/cons/ss/$c
# already finished or currently running: nothing to do.  Test this
# before the line counts below, they each read the entire MAF file
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif
# WC: total lines, NL: "#" comment lines; equal counts mean the MAF
# has no alignment blocks and msa_split is skipped
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # do the easy ones first to see some immediate results
    # (strip only the literal trailing .maf suffix from each file name)
    ls -1S -r ../../anno/result | sed -e "s/\.maf$//" > maf.list

    gensub2 maf.list single template
jobList para -ram=8g create jobList para try ... check ... etc # Completed: 64 of 66 jobs # Crashed: 2 jobs # CPU time in finished jobs: 347730s 5795.49m 96.59h 4.02d 0.011 y # IO & Wait Time: 102813s 1713.56m 28.56h 1.19d 0.003 y # Average job time: 7040s 117.33m 1.96h 0.08d # Longest finished job: 42666s 711.10m 11.85h 0.49d # Submission to last job: 150336s 2505.60m 41.76h 1.74d # finish the last two on hgwdev with more memory. # linux data memory, in 1024-byte units export M=188743680 ulimit -S -m $M -v $M ./doSplit.csh chr1 chr1.done & ./doSplit.csh chr2 chr2.done wait # real 864m53.235s # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons cd /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
# It is one of:
    # all glire euarchontoglires primate placental
    # (the directories created under cons/ for the runs in this file)

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
# doPhast.csh <chrom> <ssFile> <expected-length> <target-coverage> <rho>
#   run phastCons on one SS window using the model of the group named
#   by the current working directory; results land in pp/<chrom>/ and
#   bed/<chrom>/ of that directory
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set cons = /hive/data/genomes/mm10/bed/multiz60way/cons
set ssSrc = $cons/ss
set useGrp = "$grp.mod"
set tmp = $cons/tmp/$f
mkdir -p $tmp
# every run needs the SS window and the group tree model
ln -s $ssSrc/$c/$f.ss $tmp
ln -s $cons/$grp/$grp.mod $tmp
# a group with a non-empty .non-inf file also gets that list linked in
if (-s $cons/$grp/$grp.non-inf) then
  ln -s $cons/$grp/$grp.non-inf $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $useGrp \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
# move the results out of the temporary directory, replacing any
# earlier result for this window
mkdir -p pp/$c bed/$c
sleep 4
touch pp/$c bed/$c
rm -f pp/$c/$f.pp
rm -f bed/$c/$f.bed
mv $tmp/$f.pp pp/$c
mv $tmp/$f.bed bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    # this template will serve for all runs
    # root1 == chrom name, file1 == ss file name without .ss suffix
    # arguments 45 0.3 0.3 are: expected-length target-coverage rho
    cat << '_EOF_' > template
#LOOP
../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list

    # Create parasol batch and run it

############################ run for all species
    cd /hive/data/genomes/mm10/bed/multiz60way/cons
    mkdir all
    cd all
    cp -p ../../4d/all.mod ./all.mod
    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 36286s 604.77m 10.08h 0.42d 0.001 y # IO & Wait Time: 10101s 168.35m 2.81h 0.12d 0.000 y # Average job time: 148s 2.46m 0.04h 0.00d # Longest finished job: 219s 3.65m 0.06h 0.00d # Submission to last job: 4383s 73.05m 1.22h 0.05d # create Most Conserved track cd /hive/data/genomes/mm10/bed/multiz60way/cons/all cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # -rw-rw-r-- 1 230642249 Jun 15 11:48 tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # -rw-rw-r-- 1 236425914 Jun 15 11:52 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgLoadBed mm10 phastConsElements60way mostConserved.bed # Read 6748481 elements of size 5 from mostConserved.bed # real 2m20.950s # Try for 5% overall cov, and 70% CDS cov featureBits mm10 -enrichment refGene:cds phastConsElements60way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.281%, phastConsElements60way 6.517%, # both 0.913%, cover 71.29%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60way # ensGene:cds 1.357%, phastConsElements60way 6.517%, both 0.942%, cover # 69.39%, enrich 10.65x # real 0m54.109s time featureBits mm10 -enrichment knownGene:cds phastConsElements60way # knownGene:cds 1.325%, phastConsElements60way 6.517%, both 0.930%, # cover 70.18%, enrich 10.77x # real 0m50.472s # Create merged posterier probability file and wiggle track data files cd /hive/data/genomes/mm10/bed/multiz60way/cons/all mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | 
gzip -c > downloads/${D}.phastCons60way.wigFix.gz done # real 102m58.496s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons60way.wig phastCons60way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m32.980s du -hsc *.wi? # 1.8G phastCons60way.wib # 298M phastCons60way.wig # 2.1G total # encode into a bigWig file: # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: sizeG=188743680 export sizeG ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60way.bw) # real 27m1.039s # -rw-rw-r-- 1 4671685725 Jun 18 10:24 phastCons60way.bw bigWigInfo phastCons60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,510,917 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149660 min: 0.000000 max: 1.000000 std: 0.282516 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file mkdir /gbdb/mm10/bbi ln -s `pwd`/phastCons60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60way; \ create table phastCons60way (fileName varchar(255) not null); \ insert into phastCons60way values ("/gbdb/mm10/bbi/phastCons60way.bw");' # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all ln -s `pwd`/phastCons60way.wib /gbdb/mm10/multiz60way/phastCons60way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60way phastCons60way.wig # real 0m54.546s wigTableStats.sh mm10 phastCons60way # db.table min max mean count sumData # mm10.phastCons60way 0 1 0.14966 1929686275 2.88797e+08 # stdDev viewLimits # 0.282516 viewLimits=0:1 # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgWiggle -doHistogram -db=mm10 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons60way > histogram.data 2>&1 # real 7m37.212s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60way track" set xlabel " phastCons60way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Glires # setup glire-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/glire cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # glire-only: get the glire only tree from the 4d directory cp -p ../../4d/glire.mod ./glire.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/glire.list > glire.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 glire.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > glire.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g 
create jobList para try ... check ... push ... etc. # Completed: 314 of 314 jobs # CPU time in finished jobs: 12411s 206.85m 3.45h 0.14d 0.000 y # IO & Wait Time: 117850s 1964.16m 32.74h 1.36d 0.004 y # Average job time: 415s 6.91m 0.12h 0.00d # Longest finished job: 658s 10.97m 0.18h 0.01d # Submission to last job: 796s 13.27m 0.22h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire time nice -n +19 hgLoadBed mm10 phastConsElements60wayGlire \ mostConserved.bed # Loaded 1336504 elements of size 6 # real 0m13.672s # verify coverage time featureBits mm10 phastConsElements60wayGlire # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m15.041s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayGlire # refGene:cds 1.282%, phastConsElements60wayGlire 4.413%, # both 0.944%, cover 73.60%, enrich 16.68x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayGlire # knownGene:cds 1.325%, phastConsElements60wayGlire 4.413%, # both 0.957%, cover 72.22%, enrich 16.37x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; 
s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz done # Create merged posterier probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayGlire.wig phastCons60wayGlire.wib) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 10m26.712s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin ../../../../chrom.sizes \ phastCons60wayGlire.bw > bigWig.log 2>&1) & # real 52m17.108s grep VmPeak bigWig.log # pid=5552: VmPeak: 20926360 kB bigWigInfo phastCons60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,631,413,425 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.142675 min: 0.000000 max: 1.000000 std: 0.252347 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayGlire; \ create table phastCons60wayGlire \ (fileName varchar(255) not null); \ insert into phastCons60wayGlire values ("/gbdb/mm10/bbi/phastCons60wayGlire.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire ln -s `pwd`/phastCons60wayGlire.wib \ /gbdb/mm10/multiz60way/phastCons60wayGlire.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayGlire phastCons60wayGlire.wig # real 0m56.786s wigTableStats.sh mm10 phastCons60wayGlire # db.table min max mean count sumData mm10.phastCons60wayGlire 0 1 0.142675 1929686275 2.75318e+08 # stdDev viewLimits # 0.252347 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 
-hMinVal=0.0 -verbose=2 \
        -db=mm10 phastCons60wayGlire > histogram.data 2>&1
    # real 4m28.743s

    # create plot of histogram:
    # columns 2:5 of histogram.data are drawn as the relative frequency
    # impulses, columns 2:7 as the cumulative curve on the second y axis
    # (the title names the Mm10 assembly this track belongs to, it
    # previously said Hg19)

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm10 Histogram phastCons60wayGlire track"
set xlabel " phastCons60wayGlire score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    # << happy emacs

    display histo.png &

########################################################################
### Create a phastCons data set for Euarchontoglires

    # setup euarchontoglires-only run
    ssh swarm
    mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires
    cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires

    # euarchontoglires-only: get the euarchontoglires only tree from the 4d directory
    cp -p ../../4d/euarchontoglires.mod ./euarchontoglires.mod
    # and all the others become the non-informative list for phastCons to ignore
    sort ../../4d/euarchontoglires.list > euarchontoglires.list
    sort ../../4d/vertebrate.list > vertebrate.list
    comm -13 euarchontoglires.list vertebrate.list | xargs echo \
        | sed -e "s/ /,/g" > euarchontoglires.non-inf
    gensub2 ../run.cons/ss.list single ../run.cons/template jobList
    para -ram=8g create jobList
    para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 17421s 290.36m 4.84h 0.20d 0.001 y # IO & Wait Time: 37430s 623.83m 10.40h 0.43d 0.001 y # Average job time: 175s 2.91m 0.05h 0.00d # Longest finished job: 343s 5.72m 0.10h 0.00d # Submission to last job: 2403s 40.05m 0.67h 0.03d cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires time nice -n +19 hgLoadBed mm10 phastConsElements60wayEuarchontoGlires \ mostConserved.bed # Loaded 2327130 elements of size 6 # real 0m24.591s # verify coverage time featureBits mm10 phastConsElements60wayEuarchontoGlires # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m18.857s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayEuarchontoGlires # refGene:cds 1.282%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.929%, cover 72.46%, enrich 15.12x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayEuarchontoGlires # knownGene:cds 1.325%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.943%, cover 71.16%, enrich 14.85x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 
find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz done # Create merged posterier probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayEuarchontoGlires.wig phastCons60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m49.080s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayEuarchontoGlires.bw \ > bigWig.log 2>&1 ) & # real 26m0.111s bigWigInfo phastCons60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,411,704,465 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.133253 min: 0.000000 max: 1.000000 std: 0.256320 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayEuarchontoGlires; \ create table phastCons60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phastCons60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phastCons60wayEuarchontoGlires.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires ln -s `pwd`/phastCons60wayEuarchontoGlires.wib \ /gbdb/mm10/multiz60way/phastCons60wayEuarchontoGlires.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayEuarchontoGlires phastCons60wayEuarchontoGlires.wig # real 0m50.676s time wigTableStats.sh mm10 phastCons60wayEuarchontoGlires # db.table min max mean count sumData mm10.phastCons60wayEuarchontoGlires 0 1 0.133253 1929686275 
2.57137e+08 # stdDev viewLimits # 0.25632 viewLimits=0:1 # real 0m21.964s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayEuarchontoGlires > histogram.data 2>&1 # real 3m31.112s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayEuarchontoGlires track" set xlabel " phastCons60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for primate ***### This was constructed ### and examined, but not used in the release # setup primate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/primate cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # primate-only: get the primate only tree from the 4d directory cp -p ../../4d/primate.mod ./primate.mod # and all the others become the non-informative list for phastCons to ignore cat ../../4d/glire.list ../../4d/placental.list ../../4d/vertebrate.list \ | grep -v mm10 | sort | xargs echo | sed -e "s/ /,/g" \ > primate.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 13884s 231.39m 3.86h 0.16d 0.000 y # IO & Wait Time: 130791s 2179.86m 36.33h 1.51d 0.004 y # Average job time: 461s 7.68m 0.13h 0.01d # Longest finished job: 741s 12.35m 0.21h 0.01d # Submission to last job: 910s 15.17m 0.25h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m27.199s featureBits mm10 mostConserved.bed # 112908553 bases of 2652783500 (4.256%) in intersection # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate time nice -n +19 hgLoadBed mm10 phastConsElements60wayPrimate \ mostConserved.bed # Loaded 1119924 elements of size 6 # real 0m17.423s # verify coverage featureBits mm10 phastConsElements60wayPrimate # 112908553 bases of 2652783500 (4.256%) in intersection # real 0m13.684s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayPrimate # refGene:cds 1.281%, phastConsElements60wayPrimate 4.256%, # both 0.897%, cover 69.98%, enrich 16.44x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPrimate # knownGene:cds 1.325%, phastConsElements60wayPrimate 4.256%, # both 0.909%, cover 68.64%, enrich 16.13x featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPrimate # ensGene:cds 1.357%, phastConsElements60wayPrimate 4.256%, both 0.913%, # cover 67.30%, enrich 15.81x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate mkdir downloads for D in `ls -d pp/chr* | 
sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz done # Create merged posterier probability file and wiggle track data files zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPrimate.wig phastCons60wayPrimate.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 12m22.465s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayPrimate.bw # real 31m44.517s bigWigInfo phastCons60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,431,379,060 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.093847 min: 0.000000 max: 1.000000 std: 0.233892 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPrimate; \ create table phastCons60wayPrimate \ (fileName varchar(255) not null); \ insert into phastCons60wayPrimate values ("/gbdb/mm10/bbi/phastCons60wayPrimate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate ln -s `pwd`/phastCons60wayPrimate.wib \ /gbdb/mm10/multiz60way/phastCons60wayPrimate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPrimate phastCons60wayPrimate.wig # real 1m24.188s wigTableStats.sh mm10 phastCons60wayPrimate # db.table min max mean count sumData # mm10.phastCons60wayPrimate 0 1 0.0938475 1929686275 1.81096e+08 # 0.233892 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 
-verbose=2 \ -db=mm10 phastCons60wayPrimate > histogram.data 2>&1 # real 7m3.198s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPrimate track" set xlabel " phastCons60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Placental # setup placental-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/placental cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # placental-only: get the placental only tree from the 4d directory cp -p ../../4d/placental.mod ./placental.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/placental.list > placental.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 placental.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > placental.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 27853s 464.21m 7.74h 0.32d 0.001 y # IO & Wait Time: 128981s 2149.69m 35.83h 1.49d 0.004 y # Average job time: 499s 8.32m 0.14h 0.01d # Longest finished job: 785s 13.08m 0.22h 0.01d # Submission to last job: 5970s 99.50m 1.66h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m44.170s featureBits mm10 mostConserved.bed # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m54.927s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental time nice -n +19 hgLoadBed mm10 phastConsElements60wayPlacental \ mostConserved.bed # Loaded 5257437 elements of size 6 # real 0m56.788s # verify coverage, should be the same as the file measured above time featureBits mm10 phastConsElements60wayPlacental # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m39.537s # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits mm10 -enrichment refGene:cds phastConsElements60wayPlacental # refGene:cds 1.282%, phastConsElements60wayPlacental 5.430%, # both 0.920%, cover 71.73%, enrich 13.21x # real 0m39.833s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPlacental # knownGene:cds 1.325%, phastConsElements60wayPlacental 5.430%, # both 0.934%, cover 70.47%, enrich 12.98x # real 0m44.567s time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPlacental # ensGene:cds 1.357%, phastConsElements60wayPlacental 5.430%, # both 0.941%, cover 69.32%, enrich 12.77x # real 0m43.093s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPlacental.wig \ phastCons60wayPlacental.wib > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.237s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayPlacental.bw > bigWig.log 2>&1) & # real 25m18.556s bigWigInfo phastCons60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,271,676,156 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.135703 min: 0.000000 max: 1.000000 std: 0.266432 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPlacental; \ create table phastCons60wayPlacental \ (fileName varchar(255) not null); \ insert into phastCons60wayPlacental values ("/gbdb/mm10/bbi/phastCons60wayPlacental.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental ln -s `pwd`/phastCons60wayPlacental.wib \ /gbdb/mm10/multiz60way/phastCons60wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPlacental phastCons60wayPlacental.wig # real 0m41.999s time wigTableStats.sh mm10 
phastCons60wayPlacental # db.table min max mean count sumData # mm10.phastCons60wayPlacental 0 1 0.135703 1929686275 2.61864e+08 # stdDev viewLimits # 0.266432 # viewLimits=0:1 # real 0m21.723s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayPlacental > histogram.data 2>&1 # real 2m39.659s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPlacental track" set xlabel " phastCons60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Vertebrate # setup vertebrate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # vertebrate-only: get the vertebrate only tree from the 4d directory cp -p ../../4d/vertebrate.mod ./vertebrate.mod # they are all in this one, no need for non-informative list gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 313 of 314 jobs # Crashed: 1 jobs # CPU time in finished jobs: 36058s 600.97m 10.02h 0.42d 0.001 y # IO & Wait Time: 125496s 2091.59m 34.86h 1.45d 0.004 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 912s 15.20m 0.25h 0.01d # Submission to last job: 2681s 44.68m 0.74h 0.03d # the one failed job was completed manually on hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed time featureBits mm10 mostConserved.bed # 172842314 bases of 2652783500 (6.516%) in intersection # real 1m23.298s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate time nice -n +19 hgLoadBed mm10 phastConsElements60wayVertebrate \ mostConserved.bed # Read 6747163 elements of size 5 from mostConserved.bed # real 1m15.122s # verify coverage featureBits mm10 phastConsElements60wayVertebrate # 172842314 bases of 2652783500 (6.516%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayVertebrate # refGene:cds 1.282%, phastConsElements60wayVertebrate 6.516%, # both 0.914%, cover 71.26%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayVertebrate # ensGene:cds 1.357%, phastConsElements60wayVertebrate 6.516%, # both 0.942%, cover 69.39%, enrich 10.65x # real 0m51.139s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayVertebrate # knownGene:cds 1.325%, phastConsElements60wayVertebrate 6.516%, # both 0.930%, cover 70.18%, enrich 10.77x # real 0m51.545s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayVertebrate.wig \ phastCons60wayVertebrate.wib > wigEncode.log 2>&1 ) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.554s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayVertebrate.bw > bigWig.log 2>&1) & # real 25m8.630s bigWigInfo phastCons60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,348,984 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149646 min: 0.000000 max: 1.000000 std: 0.282502 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayVertebrate; \ create table phastCons60wayVertebrate \ (fileName varchar(255) not null); \ insert into phastCons60wayVertebrate values ("/gbdb/mm10/bbi/phastCons60wayVertebrate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate ln -s `pwd`/phastCons60wayVertebrate.wib \ /gbdb/mm10/multiz60way/phastCons60wayVertebrate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayVertebrate phastCons60wayVertebrate.wig # real 0m45.432s time 
wigTableStats.sh mm10 phastCons60wayVertebrate # db.table min max mean count sumData # mm10.phastCons60wayVertebrate 0 1 0.149646 1929686275 2.8877e+08 # stdDev viewLimits # 0.282502 viewLimits=0:1 # real 0m22.224s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayVertebrate > histogram.data 2>&1 # real 2m52.041s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayVertebrate track" set xlabel " phastCons60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # phyloP conservation for 60-way (DONE - 2012-06-15 - 2012-08-21 - Hiram) # # Vertebrate, Glire, Primate, Placental # # split SS files into 1M chunks, this business needs smaller files # to complete # many of these jobs run too much memory to finish on a kluster node # can run all of this on hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP mkdir ss run.split cd run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) 
then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list cat << '_EOF_' > template #LOOP ./doSplit.csh $(root1) $(root1).done #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList # copy the jobList to runEm.sh, edit to make all the commands run in # the background, with wait statements every few commands to run # a small number of these at once, no more than four at once with # the large chroms, the small randoms can run a bunch at once, they # finish quickly. time ./runEm.sh # about 11h30m # run phyloP with score=LRT ssh swarm cd /cluster/data/mm10/bed/multiz60way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.525 > all.mod grep BACKGROUND ../../cons/glire/glire.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.531 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/glire/glire.mod 0.531 > glire.mod grep BACKGROUND ../../cons/primate/primate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.509 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/primate/primate.mod 0.509 > primate.mod grep BACKGROUND ../../cons/euarchontoglires/euarchontoglires.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.518 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/euarchontoglires/euarchontoglires.mod 0.518 \ > euarchontoglires.mod grep BACKGROUND ../../cons/placental/placental.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/placental/placental.mod 0.525 > placental.mod grep BACKGROUND ../../cons/vertebrate/vertebrate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/vertebrate/vertebrate.mod 0.525 > vertebrate.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fex set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set ssFile = $1:t echo "ssFile: $ssFile" set out = $2 set cName = $f:h echo "cName: $cName" set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "$cons/ss/$cName/$ssFile" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null echo source: $ssSrc.ss $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $ssFile.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$ssFile.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2708 of 2708 jobs # CPU time in finished jobs: 1832980s 30549.67m 509.16h 21.22d 0.058 y # IO & Wait Time: 217434s 3623.90m 60.40h 2.52d 0.007 y # Average job time: 757s 12.62m 0.21h 0.01d # Longest finished job: 1458s 24.30m 0.41h 0.02d # Submission to last job: 3647s 60.78m 1.01h 0.04d # missed chrM in the original run: ../run.phyloP/doPhyloP.csh chrM/chrM.1-16296 wigFix/chrM/chrM.1-16296.wigFix ssh hgwdev cd /cluster/data/mm10/bed/multiz60way/consPhyloP/run.phyloP/all mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP60way.wigFix.gz done # real 38m15.538s zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP60way.wig phyloP60way.wib > wigEncode.log 2>&1 & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 27m53.384s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60way.bw) # real 30m10.440s bigWigInfo phyloP60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,533,501,426 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169761 min: -20.000000 max: 7.532000 std: 0.942744 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayAll; \ create table phyloP60wayAll \ (fileName varchar(255) not null); \ insert into phyloP60wayAll values ("/gbdb/mm10/bbi/phyloP60way.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60way.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayAll phyloP60way.wig # real 1m16.934s wigTableStats.sh mm10 phyloP60wayAll # db.table min max mean count sumData # mm10.phyloP60wayAll -20 7.532 0.169761 
1929686275 3.27586e+08 # stdDev viewLimits # 0.942744 viewLimits=-4.54396:4.88348 # that range is: 4.54396+4.88348 = 9.42744 for -hBinSize=0.0942744 below # to get 1,000 bins # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.0942744 -hBinCount=1000 -hMinVal=-4.54396 -verbose=2 \ -db=mm10 phyloP60wayAll > histogram.data 2>&1 # real 5m58.309s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60way track, all 60 vertebrates" set xlabel " phyloP60way score, all 60 vertebrates" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.2] set xrange [-2:2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the glire ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 206723s 3445.39m 57.42h 2.39d 0.007 y # IO & Wait Time: 256366s 4272.76m 71.21h 2.97d 0.008 y # Average job time: 171s 2.85m 0.05h 0.00d # Longest finished job: 487s 8.12m 0.14h 0.01d # Submission to last job: 1926s 32.10m 0.54h 0.02d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayGlire.wig phyloP60wayGlire.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.17, lower limit -4.35 # real 20m31.753s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayGlire.bw) & # real 37m9.063s bigWigInfo phyloP60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,158,091,915 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.073187 min: -4.346000 max: 1.165000 std: 0.602992 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayGlire; \ create table phyloP60wayGlire \ (fileName varchar(255) not null); \ insert into phyloP60wayGlire values ("/gbdb/mm10/bbi/phyloP60wayGlire.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayGlire.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayGlire phyloP60wayGlire.wig # real 0m58.536s wigTableStats.sh mm10 phyloP60wayGlire # db.table min max mean count # mm10.phyloP60wayGlire -4.346 1.165 0.0731873 
1929686275 1.41229e+08 # stdDev viewLimits # 0.602992 viewLimits=-2.94177:1.165 # that range is: 4.346+1.165 = 5.511 -> hBinSize=0.005511 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.005511 -hBinCount=1000 -hMinVal=-4.346 -verbose=2 \ -db=mm10 phyloP60wayGlire > histogram.data 2>&1 # real 8m23.088s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayGlire track" set xlabel " phyloP60wayGlire score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ################### Running the euarchontoglires ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 542547s 9042.45m 150.71h 6.28d 0.017 y # IO & Wait Time: 75914s 1265.23m 21.09h 0.88d 0.002 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest finished job: 430s 7.17m 0.12h 0.00d # Submission to last job: 4149s 69.15m 1.15h 0.05d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayEuarchontoGlires.wig phyloP60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.75, lower limit -12.70 # real 10m52.064s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayEuarchontoGlires.bw) & # real 26m47.912s bigWigInfo phyloP60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,970,501,521 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.078739 min: -12.704000 max: 1.753000 std: 0.689759 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayEuarchontoGlires; \ create table phyloP60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phyloP60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phyloP60wayEuarchontoGlires.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayEuarchontoGlires.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayEuarchontoGlires 
phyloP60wayEuarchontoGlires.wig # real 0m51.777s time wigTableStats.sh mm10 phyloP60wayEuarchontoGlires # db.table min max mean count # mm10.phyloP60wayEuarchontoGlires -12.704 1.753 0.0787387 1929686275 # sumData stdDev viewLimits # 1.51941e+08 0.689759 viewLimits=-3.37006:1.753 # real 0m26.197s # that range is: 12.704+1.753 = 14.457 -> hBinSize=0.014457 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.014457 -hBinCount=1000 -hMinVal=-12.704 -verbose=2 \ -db=mm10 phyloP60wayEuarchontoGlires > histogram.data 2>&1 # real 3m22.205s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayEuarchontoGlires track" set xlabel " phyloP60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the primate ####################### ### ***### This was constructed ### and examined, but not used in the release mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 307901s 5131.68m 85.53h 3.56d 0.010 y # IO & Wait Time: 42937s 715.62m 11.93h 0.50d 0.001 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest finished job: 234s 3.90m 0.07h 0.00d # Submission to last job: 5975s 99.58m 1.66h 0.07d cd /cluster/data/mm10/bed/multiz60way/consPhyloP/run.phyloP/primate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPrimate.wig phyloP60wayPrimate.wib \ > wigEncode.log 2>&1) & # real 9m37.055s # Converted stdin, upper limit 0.93, lower limit -10.63 export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPrimate.bw) & # real 24m18.842s bigWigInfo phyloP60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,715,332,211 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.060017 min: -10.633000 max: 0.930000 std: 0.518027 # loading bigWig table: ln -s `pwd`/phyloP60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPrimate; \ create table phyloP60wayPrimate \ (fileName varchar(255) not null); \ insert into phyloP60wayPrimate values ("/gbdb/mm10/bbi/phyloP60wayPrimate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPrimate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPrimate phyloP60wayPrimate.wig # real 0m45.837s wigTableStats.sh mm10 phyloP60wayPrimate # db.table min max mean count sumData stdDev viewLimits # mm10.phyloP60wayPrimate 
-10.633 0.93 0.0600168 1929686275 1.15814e+08 # stdDev viewLimits # 0.518027 viewLimits=-2.53012:0.93 # that range is: 10.633+0.93 = 11.563 for the hBinSize=0.11563 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.11563 -hBinCount=1000 -hMinVal=-10.633 -verbose=2 \ -db=mm10 phyloP60wayPrimate > histogram.data 2>&1 # real 4m36.379s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPrimate track" set xlabel " phyloP60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.472] set xrange [-2.5:1.0] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the placental ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1188036s 19800.60m 330.01h 13.75d 0.038 y # IO & Wait Time: 209859s 3497.65m 58.29h 2.43d 0.007 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 6336s 105.60m 1.76h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPlacental.wig phyloP60wayPlacental.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 3.30, lower limit -20.00 # real 11m54.289s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPlacental.bw \ > bigWig.log 2>&1) & # real 28m4.576s bigWigInfo phyloP60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,423,832,009 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.109489 min: -20.000000 max: 3.296000 std: 0.810657 # loading bigWig table if that is what you wanted to do: ln -s `pwd`/phyloP60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPlacental; \ create table phyloP60wayPlacental \ (fileName varchar(255) not null); \ insert into phyloP60wayPlacental values ("/gbdb/mm10/bbi/phyloP60wayPlacental.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPlacental.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPlacental phyloP60wayPlacental.wig # real 0m50.284s wigTableStats.sh mm10 
phyloP60wayPlacental # db.table min max mean count sumData # mm10.phyloP60wayPlacental -20 3.296 0.109489 1929686275 2.11279e+08 # stdDev viewLimits # 0.810657 viewLimits=-3.9438:3.296 # that range is: 20+3.296 = 23.296 for hBinSize=0.023296 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.023296 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayPlacental > histogram.data 2>&1 # real 3m24.650s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPlacental track" set xlabel " phyloP60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.084] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the vertebrate ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1825414s 30423.56m 507.06h 21.13d 0.058 y # IO & Wait Time: 211040s 3517.34m 58.62h 2.44d 0.007 y # Average job time: 752s 12.53m 0.21h 0.01d # Longest finished job: 1530s 25.50m 0.42h 0.02d # Submission to last job: 6045s 100.75m 1.68h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayVertebrate.wig phyloP60wayVertebrate.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 12m2.774s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayVertebrate.bw \ > bigWig.log 2>&1) & # real 27m6.791s bigWigInfo phyloP60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,529,467,614 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169653 min: -20.000000 max: 7.532000 std: 0.942808 # loading bigWig table: ln -s `pwd`/phyloP60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayVertebrate; \ create table phyloP60wayVertebrate \ (fileName varchar(255) not null); \ insert into phyloP60wayVertebrate values ("/gbdb/mm10/bbi/phyloP60wayVertebrate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayVertebrate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayVertebrate phyloP60wayVertebrate.wig # real 0m56.535s time wigTableStats.sh mm10 phyloP60wayVertebrate # db.table 
min max mean count sumData stdDev viewLimits # mm10.phyloP60wayVertebrate -20 7.532 0.169653 1929686275 3.27377e+08 # stdDev viewLimits # 0.942808 viewLimits=-4.54439:4.88369 # real 0m25.320s # that range is: 20+7.532 = 27.532 for hBinSize=0.027532 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.027532 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayVertebrate > histogram.data 2>&1 # real 3m26.565s # to see yrange: egrep -v "^#|udcfileOpen" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayVertebrate track" set xlabel " phyloP60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.1123] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # construct download files for 60-way (DONE - 2012-06-27 - 2012-08-21 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/alignments mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental mkdir 
/usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way mkdir /hive/data/genomes/mm10/bed/multiz60way/downloads cd /hive/data/genomes/mm10/bed/multiz60way/downloads mkdir multiz60way phastCons60way phyloP60way cd multiz60way mkdir maf alignments cd maf time cp -p ../../../anno/result/chr*.maf . # real 735m35.723s time gzip *.maf # real 700m23.340s md5sum *.maf.gz > md5sum.txt ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf cd .. du -hsc maf # 24G maf du -hsc ../../anno/result/ # 244G ../../anno/result/ ln -s ../../mm10.60way.nh . ln -s ../../mm10.60way.commonNames.nh . ln -s `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phastCons60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phastCons cd glire ln -s ../../../cons/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire # real 5m50.001s cd ../euarchontoglire ln -s ../../../cons/euarchontoglires/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & # real 1m14.103s ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire cd ../primate ln -s ../../../cons/primate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate # real 5m39.288s cd ../placental ln -s ../../../cons/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental # real 5m9.762s cd ../vertebrate ln -s ../../../cons/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate # real 0m45.408s cd ../mm10.60way.phastCons ln -s ../../../cons/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons # real 6m11.158s cd .. 
ln -s ../../cons/all/all.mod mm10.60way.phastCons.mod ln -s ../../cons/glire/glire.mod mm10.60way.phastCons.glire.mod ln -s ../../cons/primate/primate.mod mm10.60way.phastCons.primate.mod ln -s ../../cons/euarchontoglires/euarchontoglires.mod mm10.60way.phastCons.euarchontoglire.mod ln -s ../../cons/placental/placental.mod mm10.60way.phastCons.placental.mod ln -s ../../cons/vertebrate/vertebrate.mod mm10.60way.phastCons.vertebrate.mod ln -s ../../cons/all/phastCons60way.bw mm10.60way.phastCons.bw ln -s ../../cons/glire/phastCons60wayGlire.bw \ mm10.60way.phastCons60wayGlire.bw ln -s ../../cons/placental/phastCons60wayPlacental.bw \ mm10.60way.phastCons60wayPlacental.bw ln -s ../../cons/euarchontoglires/phastCons60wayEuarchontoGlires.bw \ mm10.60way.phastCons60wayEuarchontoGlire.bw ln -s ../../cons/primate/phastCons60wayPrimate.bw \ mm10.60way.phastCons60wayPrimate.bw ln -s ../../cons/vertebrate/phastCons60wayVertebrate.bw \ mm10.60way.phastCons60wayVertebrate.bw time md5sum *.mod *.bw > md5sum.txt # real 20m11.260s # obtain the README.txt from hg19/phastCons46way and update for this # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phyloP60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phyloP60way cd glire ln -s ../../../consPhyloP/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire # real 6m5.733s cd ../euarchontoglire ln -s ../../../consPhyloP/euarchontoglires/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire # real 5m40.272s cd ../primate ln -s ../../../consPhyloP/primate/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate # real 7m22.623s cd ../placental ln -s ../../../consPhyloP/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental # real 7m39.269s cd ../vertebrate ln -s ../../../consPhyloP/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate cd ../mm10.60way.phyloP60way ln -s ../../../consPhyloP/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way # real 8m5.777s cd .. ln -s ../../consPhyloP/run.phyloP/all.mod mm10.60way.phyloP60way.mod ln -s ../../consPhyloP/run.phyloP/glire.mod ./mm10.phyloP.glire.mod ln -s ../../consPhyloP/run.phyloP/placental.mod ./mm10.phyloP.placental.mod ln -s ../../consPhyloP/run.phyloP/euarchontoglires.mod ./mm10.phyloP.euarchontoglire.mod ln -s ../../consPhyloP/run.phyloP/primate.mod ./mm10.phyloP.primate.mod ln -s ../../consPhyloP/run.phyloP/vertebrate.mod ./mm10.60way.vertebrate.mod ln -s ../../consPhyloP/all/phyloP60way.bw mm10.60way.phyloP60way.bw ln -s ../../consPhyloP/glire/phyloP60wayGlire.bw \ mm10.60way.phyloP60wayGlire.bw ln -s ../../consPhyloP/vertebrate/phyloP60wayVertebrate.bw \ mm10.60way.phyloP60wayVertebrate.bw ln -s ../../consPhyloP/placental/phyloP60wayPlacental.bw \ mm10.60way.phyloP60wayPlacental.bw ln -s ../../consPhyloP/euarchontoglires/phyloP60wayEuarchontoGlires.bw \ mm10.60way.phyloP60wayEuarchontoglire.bw ln -s ../../consPhyloP/primate/phyloP60wayPrimate.bw \ mm10.60way.phyloP60wayPrimate.bw time md5sum *.mod *.bw > md5sum.txt & # real 20m17.082s # obtain the README.txt from hg19/phyloP46way and update for this # situation ln -s 
`pwd`/*.mod `pwd`/*.bw `pwd`/md5sum.txt `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/mm10/bed/multiz60way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm10 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm10 multiz60way \ stdin stdout \ -orgs=/hive/data/genomes/mm10/bed/multiz60way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # real 199m45.558s md5sum *.nh *.maf.gz > md5sum.txt # real 27m59.778s # obtain the README.txt from hg19/multiz46way and update for this # situation ln -s `pwd`/*.nh `pwd`/*.maf.gz `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ############################################################################# # hgPal downloads (DONE - 2012-07-05 - 2012-07-09 - Hiram) # FASTA from 60-way for refGene ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal cd /hive/data/genomes/mm10/bed/multiz60way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz60way export gp=refGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & # real 93m34.376s mz=multiz60way gp=refGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m16.821s zcat exonNuc/*.gz | gzip -c > 
$gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=refGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ ######################################################################### # lastz nile tilapia oreNil2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil2 mkdir /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cd /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil2 SEQ2_DIR=/hive/data/genomes/oreNil2/oreNil2.2bit SEQ2_LEN=/hive/data/genomes/oreNil2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 108m51.232s cat fb.mm10.chainOreNil2Link.txt # 51909908 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil2.2012-04-11 lastz.oreNil2 # and for the swap mkdir /hive/data/genomes/oreNil2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oreNil2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 9m8.213s cat fb.oreNil2.chainMm10Link.txt # 49704887 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ pig susScr3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr3 mkdir /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr3 SEQ2_DIR=/hive/data/genomes/susScr3/susScr3.2bit SEQ2_LEN=/hive/data/genomes/susScr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1086m29.992s cat fb.mm10.chainSusScr3Link.txt # 681359766 bases of 2652783500 (25.685%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr3.2012-04-13 lastz.susScr3 mkdir /hive/data/genomes/susScr3/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 104m56.258s cat fb.susScr3.chainMm10Link.txt # 743574150 bases of 2525294057 (29.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov3 mkdir /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov3 SEQ2_DIR=/hive/data/genomes/dasNov3/dasNov3.2bit SEQ2_LEN=/hive/data/genomes/dasNov3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1125m34.124s cat fb.mm10.chainDasNov3Link.txt # 668529920 bases of 2652783500 (25.201%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov3.2012-04-13 lastz.dasNov3 # better to have reciprocal best for 
this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 time doRecipBest.pl mm10 dasNov3 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 116m51.114s mkdir /hive/data/genomes/dasNov3/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 150m51.653s cat fb.dasNov3.chainMm10Link.txt # 695161920 bases of 3299882059 (21.066%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat5 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat5 mkdir /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cat FelCat5 SEQ2_DIR=/hive/data/genomes/felCat5/felCat5.2bit SEQ2_LEN=/hive/data/genomes/felCat5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1029m54.494s cat fb.mm10.chainFelCat5Link.txt 
# 788544084 bases of 2652783500 (29.725%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat5.2012-04-13 lastz.felCat5 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 time doRecipBest.pl mm10 felCat5 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 106m30.011s mkdir /hive/data/genomes/felCat5/bed/blastz.mm10.swap cd /hive/data/genomes/felCat5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m25.850s cat fb.felCat5.chainMm10Link.txt # 762344436 bases of 2364296207 (32.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat5/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ naked mole rat hetGla2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job screen -S mm10HetGla2 mkdir /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # naked mole rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: naked mole rat HetGla2 SEQ2_DIR=/hive/data/genomes/hetGla2/hetGla2.2bit SEQ2_LEN=/hive/data/genomes/hetGla2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 
doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 690m7.626s cat fb.mm10.chainHetGla2Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla2.2012-04-14 lastz.hetGla2 mkdir /hive/data/genomes/hetGla2/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.hetGla2.chainMm10Link.txt # 879356778 bases of 2314771103 (37.989%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru2 mkdir /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru2 SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time 
nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 624m36.508s cat fb.mm10.chainTurTru2Link.txt # 802921354 bases of 2652783500 (30.267%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru2.2012-04-14 lastz.turTru2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 time doRecipBest.pl mm10 turTru2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 44m47.753s mkdir /hive/data/genomes/turTru2/bed/blastz.mm10.swap cd /hive/data/genomes/turTru2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m17.088s cat fb.turTru2.chainMm10Link.txt # 781169007 bases of 2332402443 (33.492%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Gibbon nomLeu2 (DONE - 2012-04-14 - Hiram) screen -S mm10NomLeu2 mkdir /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu2 SEQ2_DIR=/hive/data/genomes/nomLeu2/nomLeu2.2bit SEQ2_LEN=/hive/data/genomes/nomLeu2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 
TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 621m38.251s cat fb.mm10.chainNomLeu2Link.txt # 902774780 bases of 2652783500 (34.031%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu2.2012-04-14 lastz.nomLeu2 mkdir /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.nomLeu2.chainMm10Link.txt # 889660339 bases of 2756609047 (32.274%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu2/bed ln -s blastz.mm10.swap lastz.mm10 ##################################################################### # tRNAs track (DONE 2012-04-02 Chin) # # Please refer to the generic tRNAs track build documentation # ~/kent/src/hg/makeDb/doc/tRNAsTrack.txt # for details about how the track was built.
############################################################################## # orfeome 2012-03-16 (markd) enabled ORFeome tracks in etc/genbank.conf and reload genbank ############################################################################ # construct liftOver to mm9 (DONE - 2012-04-30 - Hiram) screen -S 10 # manage this longish running job in a screen mkdir /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 cd /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # real 95m21.635s # verify this file exists: og -L /gbdb/mm10/liftOver/mm10ToMm9.over.chain.gz # -rw-rw-r-- 1 535855 Feb 9 12:07 /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz # and try out the conversion on genome-test from mm9 to mm10 ############################################################################ # EXONIPHY MM10, lifted from hg19 (DONE - braney 2012-05-29) # needed for ucscGenes building # create a syntenic liftOver chain file cd /cluster/data/hg19/bed/lastz.mm10/axtChain time nice -n +19 netFilter -syn hg19.mm10.net.gz \ | netChainSubset -verbose=0 stdin hg19.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19.mm10.syn.chain.gz #real 2m38.915s #user 3m29.458s #sys 0m16.033s # slightly smaller than the ordinary liftOver chain file: -rw-rw-r-- 1 78419424 Mar 7 18:40 hg19.mm10.over.chain.gz -rw-rw-r-- 1 74588027 May 29 12:29 hg19.mm10.syn.chain.gz # exoniphyMm10.gp is prepared as follows mkdir /cluster/data/mm10/bed/exoniphy cd /cluster/data/mm10/bed/exoniphy hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp time nice -n +19 liftOver
-genePred exoniphyHg19.gp \ /cluster/data/hg19/bed/lastz.mm10/axtChain/hg19.mm10.syn.chain.gz \ exoniphyMm10.gp unmapped # real 16m0.334s # user 15m46.462s # sys 0m7.115s wc -l * # 186601 exoniphyHg19.gp # 178821 exoniphyMm10.gp # 15560 unmapped cd /cluster/data/mm10/bed/exoniphy nice -n +19 hgLoadGenePred -genePredExt mm10 exoniphy exoniphyMm10.gp nice -n +19 featureBits mm10 exoniphy # 26795543 bases of 2652783500 (1.010%) in intersection nice -n +19 featureBits mm9 exoniphy # 25931742 bases of 2620346127 (0.990%) in intersection ############################################################################## # LASTZ cow bosTau6 (DONE - 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S mm10BosTau6 mkdir /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 cd /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 212m21.604s cat fb.mm10.chainBosTau6Link.txt # 700039696 bases of 2652783500 (26.389%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau6.2012-06-19 lastz.bosTau6 # swap mkdir /hive/data/genomes/bosTau6/bed/blastz.mm10.swap cd 
/hive/data/genomes/bosTau6/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m13.925s cat fb.bosTau6.chainMm10Link.txt # 688651806 bases of 2649682029 (25.990%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau6/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # lastz Medium Ground Finch geoFor1 (DONE - 2012-07-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10 mkdir /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cd /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cat << '_EOF_' > DEF # Mouse vs. medium ground finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 251m4.194s cat fb.mm10.chainGeoFor1Link.txt # 93984241 bases of 2652783500 (3.543%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGeoFor1.2012-07-29 lastz.geoFor1 # and for the swap mkdir /hive/data/genomes/geoFor1/bed/blastz.mm10.swap cd /hive/data/genomes/geoFor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 10m0.875s cat fb.geoFor1.chainMm10Link.txt # 80273915 bases of 1041286029 (7.709%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # construct assembly fragments table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/bed/assemblyFrags cd /hive/data/genomes/mm10/bed/assemblyFrags zgrep -h -v "^#" "${F}" zgrep -h -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/*.comp.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e 's/CM000994.2/chr1/; s/CM000995.2/chr2/; s/CM000996.2/chr3/; s/CM000997.2/chr4/; s/CM000998.2/chr5/; s/CM000999.2/chr6/; s/CM001000.2/chr7/; s/CM001001.2/chr8/; s/CM001002.2/chr9/; s/CM001003.2/chr10/; s/CM001004.2/chr11/; s/CM001005.2/chr12/; s/CM001006.2/chr13/; s/CM001007.2/chr14/; s/CM001008.2/chr15/; s/CM001009.2/chr16/; s/CM001010.2/chr17/; s/CM001011.2/chr18/; s/CM001012.2/chr19/; s/CM001013.2/chrX/; s/CM001014.2/chrY/;' > chr.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e "s#GL456233.1#chrX_GL456233_random#; s#GL456216.1#chr4_GL456216_random#; s#JH584299.1#chr5_JH584299_random#; s#JH584301.1#chrY_JH584301_random#; s#JH584300.1#chrY_JH584300_random#; s#JH584303.1#chrY_JH584303_random#; 
s#JH584302.1#chrY_JH584302_random#; s#JH584298.1#chr5_JH584298_random#; s#JH584297.1#chr5_JH584297_random#; s#JH584296.1#chr5_JH584296_random#; s#JH584295.1#chr4_JH584295_random#; s#JH584294.1#chr4_JH584294_random#; s#JH584293.1#chr4_JH584293_random#; s#JH584292.1#chr4_JH584292_random#; s#GL456354.1#chr5_GL456354_random#; s#GL456350.1#chr4_GL456350_random#; s#GL456221.1#chr1_GL456221_random#; s#GL456219.1#chr7_GL456219_random#; s#GL456213.1#chr1_GL456213_random#; s#GL456212.1#chr1_GL456212_random#; s#GL456211.1#chr1_GL456211_random#; s#GL456210.1#chr1_GL456210_random#;" > chrUL.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' | sed -e 's/\.1\t/\t/' \ | awk '{printf "chrUn_%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ > chrUn.asmFrag.bed cat chr.asmFrag.bed chrUL.asmFrag.bed chrUn.asmFrag.bed > mm10.asmFrag.bed # add the chrM identity echo -e "chrM\t0\t1629\tAY172335.1\t0\t+" >> mm10.asmFrag.bed hgLoadBed mm10 assemblyFrags mm10.asmFrag.bed featureBits mm10 assemblyFrags # 2652769048 bases of 2652783500 (99.999%) in intersection # should be silent when all chr names are correct: checkTableCoords mm10 assemblyFrags ######################################################################### # construct ucscToEnsembl table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/ensembl cd /hive/data/genomes/mm10/ensembl wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.toplevel.fa.gz' wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.nonchromosomal.fa.gz' faCount *.fa.gz > faCount.txt egrep -v "total|seq" faCount.txt | awk '{print $1,$2}' \ | sort -u | sort -k2nr | sed -e "s/ /\t/" > ensembl.chrom.sizes mkdir /hive/data/genomes/mm10/bed/ucscToEnsembl cd /hive/data/genomes/mm10/bed/ucscToEnsembl awk '{printf "%d\t%s\n", $2,$1}' ../../chrom.sizes | sort > sizes.chrom.ucsc awk '{printf 
"%d\t%s\n", $2,$1}' ../../ensembl/ensembl.chrom.sizes \ | sort > sizes.chrom.ensembl join sizes.chrom.ucsc sizes.chrom.ensembl \ | awk '{printf "%s\t%s\n", $2,$3}' > ucscToEnsembl.tab cut -f1 ucscToEnsembl.tab | awk '{print length($1)}' | sort -rn | head -1 # 20 cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(20)) ); '_EOF_' hgLoadSqlTab mm10 ucscToEnsembl ucscToEnsembl.sql ucscToEnsembl.tab ######################################################################### # GRC Incident database (DONE - 2012-09-21 - Hiram) # updated the automatic scripts to include the build of this track # on Mm10 # this procedure is run as a cron job in Hiram's account: # 43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # using the two scrips there: runUpdate.sh and update.sh # which are checked into the source tree as files: # src/hg/utils/automation/grcIncidentUpdate.sh # src/hg/utils/automation/grcRunIncidentUpdate.sh # they fetch the XML files from NCBI, convert them to SQL text # files, construct a bigBed file, and pushes it to genomewiki if # it is an update from previous # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb # construct the table after running the script once manually: hgBbiDbLink mm10 grcIncidentDb \ "http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb" ######################################################################### # GRCm38.p1 patch 1 (DONE - 2012-09-21 - Hiram) mkdir /hive/data/genomes/mm10/bed/patch1 cd /hive/data/genomes/mm10/bed/patch1 rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38.p1/ ./genbank/ # slight modifications to this script from hg19 patch9 work: ./gatherNames.pl 
genbank > ucscNames.patch1.txt # examine the names for sanity: awk '{print $NF}' ucscNames.patch1.txt | sort # and they should not be longer than 31 characters: awk '{print $NF}' ucscNames.patch1.txt | sort | awk '{print length($0)}' \ | sort -n | tail # script from hg19 patch9, update the variable patchName ./mkTables.pl patches.chrom.sizes ucscNames.patch1.txt genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz # output to stdout is the contents of alt.scaf.agp.gz # constructs ctgPos.txt chromInfo.txt gap.txt gold.txt # script from hg19 patch9, update the variable patchName ./mkCtgPos2.pl ucscNames.patch1.txt patches.chrom.sizes > ctgPos2.txt cp -p ../patch5/mkHapLocate.pl . ./mkHapLocate.pl ctgPos.txt \ PATCHES/alt_scaffolds/alt_scaffold_placement.txt \ > haplotypeLocations.bed cp -p haplotypeLocations.bed altSequence.bed ./mkFasta.pl ucscNames.patch1.txt > mm10.patch1.fa # the build of mm10Patch1 can be seen in mm10Patch1.txt egrep -v "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqPatchesP1.tab # no haplotypes yet, this is nothing: egrep "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqHaplotypesP1.tab # verify none lost wc -l altSequence.bed altSeqPatchesP1.tab altSeqHaplotypesP1.tab # 9 altSequence.bed # 9 altSeqPatchesP1.tab # 0 altSeqHaplotypesP1.tab # not necessary, there are none yet: hgLoadBed mm10 altSeqHaplotypesP1 altSeqHaplotypesP1.tab # Loaded 75 elements of size 6 hgLoadBed mm10 altSeqPatchesP1 altSeqPatchesP1.tab # Read 9 elements of size 6 from altSeqPatchesP1.tab # these tables are part of mouse/mm10/altSeqComposite1.ra ############################################################################## # Haplotype track (WORKING - 2012-10-01 - Hiram) # Warning: these are all actually alternate scaffolds from OTHER mouse strains # These haplotypes are NOT from mm10. Probably the table should have been called NonMm10Haplotypes! 
# The directory after genbank/ identifies the strain, e.g. 129S2_SvPas
#../../../mm10/genbank/129S2_SvPas/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/129P2_OlaHsd/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/NOD_ShiLtJ/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/A_J/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/CAST_Ei/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/129X1_SvJ/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/AKR_J/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/RIII/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/129S6_SvEvTac/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/129S7_SvEvBrd-Hprt-b-m2/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/BALB_c/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/129S1_SvImJ/alt_scaffolds/alt_scaffold_placement.txt
#../../../mm10/genbank/NOD_MrkTac/alt_scaffolds/alt_scaffold_placement.txt

    cat << '_EOF_' > mkBedFile.pl
#!/usr/bin/env perl

# mkBedFile.pl - convert NCBI alt_scaffold_placement.txt files into the
#   tab-separated rows loaded into the mm10Haplotypes table.
#
# The single argument is a directory.  Every file below it whose path
# contains "placement.txt" and "alt_scaffolds", but not "UNKNOWN", is read.
# The name of each file read is echoed to stderr as "# <path>".
# For each data row one line goes to stdout:
#   chrom  chromStart  chromEnd  name  0  strand  id  description
# Rows with "na" in column 12 are skipped.
# NOTE(review): start/end are copied unchanged from columns 12 and 13;
#   whether they need a 1-based to 0-based adjustment is not checked here.

use strict;
use warnings;

sub usage() {
  print STDERR "usage: ./mkBedFile.pl ../../mm10/genbank > mm10Haplotypes.bed\n";
  print STDERR "expecting the Mus_musculus/GRCm38.p1/ hierarchy in ./genbank from NCBI\n";
  exit 255;
}

usage() if (scalar(@ARGV) != 1);

my $patchDir = shift;

if ( ! -d $patchDir ) {
  print STDERR "ERROR: given directory $patchDir is not a directory or does not exist\n";
  usage();
}

# list-form pipe open: find is run directly, so odd characters in
# the directory name are never interpreted by a shell
open (my $findFh, '-|', 'find', $patchDir, '-type', 'f')
    or die "can not run find on $patchDir";
my @placeList;
while (my $path = <$findFh>) {
  chomp $path;
  next unless ($path =~ m/placement\.txt/);
  next unless ($path =~ m/alt_scaffolds/);
  next if ($path =~ m/UNKNOWN/);
  push @placeList, $path;
}
close ($findFh) or die "find failed on $patchDir";

foreach my $placeFile (@placeList) {
  printf STDERR "# %s\n", $placeFile;
  open (my $fh, '<', $placeFile) or die "can not read $placeFile";
  while (my $line = <$fh>) {
    next if ($line =~ m/^#/);	# header and comment lines
    chomp $line;
    my @field = split(/\s+/, $line);
    # columns 0 through 12 are used below; skip blank or short rows
    next if (scalar(@field) < 13);
    next if ($field[11] eq "na");
    # an orientation of "b" is reported as the + strand
    $field[8] = "+" if ($field[8] eq "b");
    my $descr = sprintf("<B>Region name: </B>%s", $field[7]);
    printf "chr%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\n", $field[5], $field[11],
        $field[12], $field[0], $field[8], $field[3], $descr;
  }
  close ($fh);
}
'_EOF_'
    # << happy emacs
    chmod +x mkBedFile.pl
    # the script exits with its usage message unless it is given the genbank
    # directory as its one argument
    # NOTE(review): the listing at the top of this section suggests the
    # original run used the relative path ../../../mm10/genbank - confirm
    ./mkBedFile.pl /hive/data/genomes/mm10/genbank > mm10Haplotypes.bedDetail

    cat << '_EOF_' > mm10Haplotypes.sql
CREATE TABLE mm10Haplotypes (
    chrom varchar(255) not null,	# Reference sequence chromosome or scaffold
    chromStart int unsigned not null,	# Start position in chromosome
    chromEnd int unsigned not null,	# End position in chromosome
    name varchar(255) not null,	# Short Name of item
    score int unsigned,	# Score from 0-1000
    strand char(1),	# + or -
    id varchar(255) not null,	# ID to bed used in URL to link back
    description longblob not null,	# Long description of item for the details page
    #Indices
    INDEX(chrom, chromStart)
);
'_EOF_'

    hgLoadSqlTab mm10 mm10Haplotypes mm10Haplotypes.sql mm10Haplotypes.bedDetail

    # trackDb entry:
track mm10Haplotypes
shortLabel Alt.
strains longLabel Alternate mouse strains, mapped to reference as haplotypes group varRep priority 111 visibility hide type bedDetail 8 url http://www.ncbi.nlm.nih.gov/nuccore/$$ urlLabel NCBI Nucleotide: ########################################################################## ## CYTOBAND - ideogram track (DONE - 2012-10-19 - Hiram) ssh hgwdev mkdir -p /hive/data/outside/ncbi/ideogram/2012-10 cd /hive/data/outside/ncbi/ideogram/2012-10 # fetch all the ideogram files: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ mkdir /hive/data/genomes/mm10/bed/cytoband cd /hive/data/genomes/mm10/bed/cytoband # Create bed file $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ /hive/data/outside/ncbi/ideogram/2012-10/ideogram_10090_GCF_000000055.19_NA_V2 ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 21 chroms # Load the bed file hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ mm10 cytoBand cytoBand.bed # Read 403 elements of size 5 from cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
hgsql -e "drop table cytoBandIdeo;" mm10 hgsql mm10 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" ########################################################################## # CYTOBANDIDEO update - (DONE - 2013-02-27 - kuhn) # adding rows for chroms with no cytology # this is just for navigation/orientation on those chroms set db=mm10 set sql=~/kent/src/hg/lib/cytoBandIdeo.sql # make backup of existing table hgsql -e "CREATE TABLE cytoBandIdeoCopy SELECT * FROM cytoBandIdeo" $db # dump existing table hgsql -N -e "SELECT * FROM cytoBandIdeo" $db > $db.cytoBandIdeo # find chroms already covered hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' $db \ | sort -u > $db.coveredNames # make cytoBand records for chroms not already covered hgsql -N -e 'SELECT chrom, size FROM chromInfo' $db \ | grep -wvf $db.coveredNames \ | awk '{print $1"\t0\t"$2"\t\tgneg"}' > $db.cytoBandNew # check wc -l $db.* # combine and sort cat $db.cytoBandNew $db.cytoBandIdeo > $db.cytoBandIdeoFull bedSort $db.cytoBandIdeoFull $db.cytoBandIdeoFull # replace existing table hgsql -e "DROP TABLE cytoBandIdeo" $db hgLoadSqlTab $db cytoBandIdeo $sql $db.cytoBandIdeoFull # check and then drop copy ########################################################################## # lastz Lamprey petMar2 (DONE - 2012-10-17 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S petMar2 mkdir /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19 cd /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19 cat << '_EOF_' > DEF # Mouse vs.
Lamprey
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q
# TARGET: Mouse Mm10
SEQ1_DIR=/scratch/data/mm10/mm10.2bit
SEQ1_LEN=/scratch/data/mm10/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
# QUERY: Lamprey PetMar2
SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit
SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=60
BASE=/hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable
    # number of jobs, 50,000 to something under 100,000
    # when not present, SEQ2_LIMIT is a default 100

    # mouse is the target here, lamprey the query
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -qRepeats=windowmaskerSdust \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    # real 218m29.078s

    cat fb.mm10.chainPetMar2Link.txt
    # 28262565 bases of 2652783500 (1.065%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/mm10/bed
    ln -s lastzPetMar2.2012-10-19 lastz.petMar2

    # and for the swap
    mkdir /hive/data/genomes/petMar2/bed/blastz.mm10.swap
    cd /hive/data/genomes/petMar2/bed/blastz.mm10.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19/DEF \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    # real 7m2.754s

    # the swap is against mm10, so the coverage summary is the chainMm10Link
    # file, named like the other mm10 swaps in this document
    cat fb.petMar2.chainMm10Link.txt
    # 20923095 bases of 647368134 (3.232%) in intersection

    # set sym link to indicate this is the lastz for this genome:
    cd /hive/data/genomes/petMar2/bed
    ln -s blastz.mm10.swap lastz.mm10

#########################################################################
# lastz White Rhino cerSim1 (DONE - 2012-10-23 - Hiram)
    # establish a screen to control this job with a name to indicate what it is
    screen
-S mm10CerSim1 mkdir /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 cd /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 cat << '_EOF_' > DEF # Mouse vs. White Rhino BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 992m45.890s cat fb.mm10.chainCerSim1Link.txt # 942281365 bases of 2652783500 (35.520%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCerSim1.2012-10-23 lastz.cerSim1 # and for the swap mkdir /hive/data/genomes/cerSim1/bed/blastz.mm10.swap cd /hive/data/genomes/cerSim1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m44s cat fb.cerSim1.chainMm10Link.txt # 926131511 bases of 2366858012 (39.129%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cerSim1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # QPCR PRIMERS (DONE - 2012-12-10 - Chin) # The track name is changed to 
"qPCR Primers" # Reload table with new track_mouse.BED (2013-01-28) # Download mkdir /hive/data/outside/Weizmann/qPcrPrimers cd /hive/data/outside/Weizmann/qPcrPrimers wget http://www.weizmann.ac.il/complex/compphys/software/Amit/primers/mouse/track_mouse.BED mkdir -p /hive/data/genomes/mm10/bed/qPcrPrimers cat track_mouse.BED | grep -v track \ > /hive/data/genomes/mm10/bed/qPcrPrimers/qPcrPrimers_mm10.bed cd /hive/data/genomes/mm10/bed/qPcrPrimers hgLoadBed -bedDetail -tab -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/bedDetail.sql \ mm10 qPcrPrimers qPcrPrimers_mm10.bed # Reading qPcrPrimers_mm10.bed # Read 518230 elements of size 14 from qPcrPrimers_mm10.bed # Sorted # Creating table definition for qPcrPrimers # Saving bed.tab # Loading mm10 # NULL descrition column hgsql mm10 -ne "UPDATE qPcrPrimers SET description = NULL;" ######################################################################### # DBSNP B137 / SNP137 (DONE 12/20/12 angie) # Redmine #7043 mkdir -p /hive/data/outside/dbSNP/137/mouse cd /hive/data/outside/dbSNP/137/mouse # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_*_([0-9]+_[0-9]) # -- use the first num for build and the second num_num for buildAssembly. # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp. # # Some trial and error was required to get the config.ra just right -- # the b* filenames don't include buildAssembly! 
# patch contigs needed to be filtered out: cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/genomes/mm10/jkStuff/liftContigs.lft EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log # Script ended with feedback about needing refAssemblyLabel because dbSNP # mapped to more than one assembly; add the label that clearly corresponds to # mm10, GRCm38, to config.ra and try again: cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/outside/dbSNP/137/mouse/suggested.lft refAssemblyLabel GRCm38 EOF ~/kent/src/hg/utils/automation/doDbSnp.pl -continue=loadDbSnp \ config.ra >>& do.log & tail -f do.log # Script ended with feedback about unrecognized NT_* contigs from dbSNP. # Inspect the script-generated suggested.lft for liftUp; it's usually right. # For contigs that are labeled as part of GRCm38 but not liftable to mm10, # listed in script-generated cantLiftUpSeqNames.txt, do some entrez # nucleotide searches for contig IDs and convince yourself that they're all # for alt assembly sequences that we don't include in mm10 (e.g. patches, # other strains). Then tell the script to filter out those contigs: cut -f 2 cantLiftUpSeqNames.txt > ignoreAltAssemblyContigs.txt cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/outside/dbSNP/137/mouse/suggested.lft refAssemblyLabel GRCm38 ignoreDbSnpContigsFile /hive/data/outside/dbSNP/137/mouse/ignoreAltAssemblyContigs.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl -continue=loadDbSnp \ config.ra >>& do.log & tail -f do.log # ... #MultipleAlignments 1667342 This variant aligns in more than one location. #ObservedMismatch 4561144 UCSC reference allele does not match any observed allele from dbSNP. # # *** All done! # That is an unusually high count of ObservedMismatch... follow up with dbSNP. 
############################################################################# # FILTER SNP137 (DONE 12/21/12 angie) # Redmine #7043 # Make several tracks that are filtered subsets of snp137: # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp137Mult # Second, siphon off the common variants -> snp137Common # Third, take the (uniquely mapped, not known to be common) variants # w/dbSNP's "clinically-assoc" flag -> snp137Flagged cd /hive/data/outside/dbSNP/137/mouse zcat snp137.bed.gz \ | perl -we \ '$minTotal2N = 10; \ ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \ open($mult, "| gzip -c > snp137Mult.bed.gz") || die; \ open($common, "| gzip -c > snp137Common.bed.gz") || die; \ open($flagged, "| gzip -c > snp137Flagged.bed.gz") || die; \ open($misc, "| gzip -c > snp137Misc.bed.gz") || die; \ while (<>) { \ @w = split("\t"); \ if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \ print $mult $_; \ $multCount++; \ } else { \ my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \ my @alNs = split(",", $nStr); die unless scalar(@alNs) == $alleleFreqCount; \ my @freqs = split(",", $freqStr); die unless scalar(@freqs) == $alleleFreqCount; \ my ($total2N, $maxAlleleFreq) = (0, 0); \ for (my $i = 0; $i < $alleleFreqCount; $i++) { \ $total2N += $alNs[$i]; \ $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \ } \ if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \ print $common $_; \ $comCount++; \ } elsif($w[24] =~ /clinically-assoc/) { \ print $flagged $_; \ $flagCount++; \ } else { \ print $misc $_; \ $miscCount++; \ } \ } \ } \ close($mult); close($common); close($flagged); close($misc); \ print "snp137Mult: $multCount\nsnp137Common: $comCount\nsnp137Flagged: $flagCount\n" . \ "leftover: $miscCount\n";' #snp137Mult: 1671771 #snp137Common: 2709532 #snp137Flagged: 0 #leftover: 66537658 # It's expected for snp137Flagged to be empty because that's for human SNPs. 
# Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp137$subset -sqlTable=snp137.sql snp137$subset.bed.gz end ############################################################################ # DBSNP CODING ANNOTATIONS (137) (DONE 12/21/12 angie) # Redmine #7043 cd /hive/data/outside/dbSNP/137/mouse # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #1884989 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 371388 3 (coding-synon) #1301099 8 (cds-reference -- ignored) # 3465 41 (nonsense) # 199148 42 (missense) # 319 43 (stop-loss) # 7422 44 (frameshift) # 2148 45 (cds-indel) # In b137, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... 
# probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \
        $lineOut =~ s@NULL@n/a@g; \
        print $lineOut;' \
        ncbiCodingAnnotations.txt \
    | liftUp snp137CodingDbSnp.bed /hive/data/outside/dbSNP/137/mouse/suggested.lft warn stdin
    hgLoadBed mm10 snp137CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \
        -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \
        snp137CodingDbSnp.bed
    #Read 552120 elements of size 11 from snp137CodingDbSnp.bed

#########################################################################
# RETROPOSED GENES ucscRetro track VERSION 2
# (2013-04-03 - 2013-04-17, baertsch,hartera DONE)

    # create the run directory; this must be the same path as the cd that
    # follows (it was previously written with a doubled /hive/hive prefix)
    mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403
    cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403
    # RETRODIR in the DEF file below points at this directory
    mkdir -p /hive/data/genomes/mm10/bed/retro/
    cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403
    cat << '_EOF_' > DEF
RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
RUNDATE="2013-04-03"
DB=mm10
SCORETHRESH=510
GENOMENAME='Mus musculus'
GBDB=mm
DATE=20130403
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz
TMPMRNA=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/mrnaBlastz/$DB
TMPEST=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/est/$DB
BINDIR=/hive/users/hartera/GencodeWG/retroFinder/trunk/bin
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=1
SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/trunk/src/pipeline
GENOME=/hive/data/genomes
TWOBIT=$GENOME/$DB/$DB.2bit
RETRODIR=$GENOME/$DB/bed/retro
BASE=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/retro
VERSION=2
OUTDIR=${BASE}/${DB}.${VERSION}
RESULT=$OUTDIR/result
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
TABLE=ucscRetroInfo$VERSION
ORTHOTABLE=ucscRetroOrtho$VERSION
ALIGN=ucscRetroAli$VERSION
LOCAL=/scratch/data/$DB
NIB=$LOCAL/nib
RMSK=rmsk
NET1=netHg19
NET2=netCanFam3
NET3=netRn5
GENE1=knownGene
GENE2=refGene
GENE3=ensGene
CLUSTER=swarm
SPECIES="hg19 mm10"
ROOTDIR="/cluster/home/$USER/public_html/retro/mm10Apr13"
WEBROOT=$ROOTDIR/retro.$RUNDATE WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE=affyGnf1m ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio ARRAYABS=hgFixed.gnfMouseAtlas2All ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps # ARRAYLOOKUP=knownToGnfAtlas2 #ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl" ALTSPLICE=sibTxGraph SPLITBYAGE=splitRetrosByAgeMouse PDB=proteins121210 BREAKS=0,8,16,24,32 XLIM=34 YLIM=0.1 YLIM1=4000 YLIM2=160 MAXDIVERGENCE=32 '_EOF_' # << happy emacs chmod +x DEF mkdir mrnaBlastz cd mrnaBlastz cp ../DEF . # Create S1.len: cp /hive/data/genomes/mm10/chrom.sizes S1.len # Edit S1.len and remove chrM and random chroms then copy over to mm10 # genomes directory mkdir -p /hive/data/genomes/mm10/bed/mrnaBlastz cp S1.len /hive/data/genomes/mm10/bed/mrnaBlastz screen # Run steps 1 to 6 of RetroFinder pipeline from scripts in CCDS SVN source tree: retroFinder/trunk/src/pipeline/ucscStep1.sh DEF # check cluster job on swarm retroFinder/trunk/src/pipeline/ucscStep2.sh DEF retroFinder/trunk/src/pipeline/ucscStep3.sh DEF #check cluster job retroFinder/trunk/src/pipeline/ucscStep4.sh DEF #check cluster job # Load the track retroFinder/trunk/src/pipeline/ucscStep5.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403/retro/mm10.2 retroFinder/trunk/src/pipeline/filterMrna.sh DEF retroFinder/trunk/src/pipeline/filterEst.sh DEF retroFinder/trunk/src/pipeline/analyseExpress.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403/mrnaBlastz retroFinder/trunk/src/pipeline/ucscStep6.sh DEF #added ucscRetroAli to trackDb.ra # copied # /hive/groups/gencode/pseudogenes/retroFinder/mm10/20130403/retro/mm10.2/trackDb.retro # entry to kent/src/hg/makeDb/trackDb/mouse/mm10/trackDb.ra # and edited it to add version number and date. 
# Scripts copied ucscRetroAli2.psl, ucscRetroInfo2.bed and ucscRetroCds2.tab # to /hive/data/genomes/mm10/bed/retro/ ############################################################################## # LASTZ shrew sorAra2 (DONE - 2013-06-12 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SorAra2 mkdir /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 cd /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: shrew SorAra2 SEQ2_DIR=/hive/data/genomes/sorAra2/sorAra2.2bit SEQ2_LEN=/hive/data/genomes/sorAra2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 785m32.163s cat fb.mm10.chainSorAra2Link.txt # 354499462 bases of 2652783500 (13.363%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSorAra2.2013-06-12 lastz.sorAra2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 time doRecipBest.pl mm10 sorAra2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 24m38.069s mkdir /hive/data/genomes/sorAra2/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12/DEF \ -swap -syntenicNet \ 
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 39m53.463s cat fb.sorAra2.chainMm10Link.txt # 343760052 bases of 2192103426 (15.682%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tenrec echTel2 (DONE - 2013-06-12 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EchTel2 mkdir /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 cd /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tenrec vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tenrec EchTel2 SEQ2_DIR=/hive/data/genomes/echTel2/echTel2.2bit SEQ2_LEN=/hive/data/genomes/echTel2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1006m3.874s cat fb.mm10.chainEchTel2Link.txt # 384570981 bases of 2652783500 (14.497%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEchTel2.2013-06-12 lastz.echTel2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 time doRecipBest.pl mm10 echTel2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 
27m58.816s # and, for the swap mkdir /hive/data/genomes/echTel2/bed/blastz.mm10.swap cd /hive/data/genomes/echTel2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 43m0.194s cat fb.echTel2.chainMm10Link.txt # 380872172 bases of 2605196361 (14.620%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac2 (DONE - 2013-06-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10VicPac2 mkdir /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 cd /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # mouse vs alpaca BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac2 SEQ2_DIR=/hive/data/genomes/vicPac2/vicPac2.2bit SEQ2_LEN=/hive/data/genomes/vicPac2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2156m48.687s cat fb.mm10.chainVicPac2Link.txt # 797843091 bases of 2652783500 (30.076%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s 
lastzVicPac2.2013-06-19 lastz.vicPac2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 time doRecipBest.pl mm10 vicPac2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 33m49.271s mkdir /hive/data/genomes/vicPac2/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 85m53.924s cat fb.vicPac2.chainMm10Link.txt # 783682127 bases of 2078582856 (37.703%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # create ucscToINSDC name mapping (DONE - 2013-08-15 - Hiram) # this allows the "ensembl" blue bar button to appear mkdir /hive/data/genomes/mm10/bed/ucscToINSDC cd /hive/data/genomes/mm10/bed/ucscToINSDC cat << '_EOF_' > translateNames.sh #!/bin/sh grep -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/chr2acc \ | sed -e 's/^/chr/' zcat ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | grep -v "^#" | cut -f1 | sort -u \ | sed -e 's/^\([A-Za-z0-9]*\).\([0-9]*\)/chrUn_\1\t\1.\2/;' grep -v "^#" \ ../../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf \ | sed -e 's/^\([A-Za-z0-9]*\)\t\([A-Za-z0-9]*\).\([0-9]*\)/chr\1_\2_random\t\2.\3/;' echo -e "chrM\tNC_005089.1" '_EOF_' # << happy emacs chmod +x translateNames.sh ./translateNames.sh | sort > ucscToINSDC.txt join <(sort ../../chrom.sizes) ucscToINSDC.txt \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' > ucscToINSDC.tab # maximum size of UCSC chrom name for SQL index cut -f1 ucscToINSDC.tab | awk '{print length($0)}' | sort -n | tail 
-1 # 20 sed -e 's/21/20/' $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab mm10 ucscToINSDC stdin ucscToINSDC.tab # verify the track link to INSDC functions ############################################################################## # MGI LIFTOVER FROM mm9 ( 2013-11-14 Pauline) ssh kolossus mkdir /cluster/data/mm10/bed/jaxLiftOver cd /cluster/data/mm10/bed/jaxLiftOver liftOver -minBlocks=0.5 /cluster/data/mm9/bed/jax/2011_06/jaxQtl.bed \ /cluster/data/mm9/bed/liftOver/mm9ToMm10.over.chain.gz \ -bedPlus=6 -tab jaxQtlLift.{bed,unmapped} wc -l jaxQtlLift.{bed,unmapped} #Old 1539 jaxQtlLift.bed #Old 12 jaxQtlLift.unmapped # 1883 jaxQtlLift.bed # 14 jaxQtlLift.unmapped # Numbers are of same order of magnitude (yay?) proceeding... # Load lifted track tables and original auxiliary tables: ssh hgwdev cd /cluster/data/mm10/bed/jaxLiftOver # jaxQTLLift #didn't run this sed command (prob already been done to this file?) sed -e 's/jaxQTL/jaxQTLLift/g'\ ~/kent/src/hg/lib/jaxQTL.sql > jaxQTLLift.sql #ran this (used this instead of hgLoadBed at Hiram's suggestion): hgLoadSqlTab mm10 JaxQtl $HOME/kent/src/hg/lib/jaxQtl.sql \ /cluster/data/mm10/bed/jaxLiftOver/jaxQtlLift.bed checkTableCoords mm10 JaxQTLLift #got no output (yay!) #found out hgLoadSqlTab doesn't load a positionally sorted table, sorting bed #file and reloading: sort -k1,1 -k2,2n jaxQtlLift.bed > jaxQtlLiftSorted.bed hgLoadSqlTab mm10 jaxQtl $HOME/kent/src/hg/lib/jaxQtl.sql \ /cluster/data/mm10/bed/jaxLiftOver/jaxQtlLiftSorted.bed ############################################################################## # DBSNP B138 / SNP138 (DONE 1/17/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/138/mouse cd /hive/data/outside/dbSNP/138/mouse # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). 
# Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 138 buildAssembly EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log # Some trial and error was required to get the config.ra just right. # First stop: need a refAssemblyLabel: # *** This release contains more than one assembly label. # *** Please examine this list in case we need to exclude any of these: # #GRCm38.p1 #Mm_Celera # *** Add refAssemblyLabel to config.ra. If keeping all labels, it will # *** look like this: # #refAssemblyLabel GRCm38.p1,Mm_Celera # # *** Edit out any of those that are not included in mm10 (e.g. Celera). # *** Then restart this script with -continue=loadDbSnp . cat >> config.ra <<EOF refAssemblyLabel GRCm38.p1 EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # Second stop: need to grab the NCBI Assembly Reports file for GRCm38; the # script will do its best to deduce the needed liftUp entries and contigs # to ignore (because they are for alternate mouse strains, or patch contigs etc). #*** b138_ContigInfo has coords for 119 sequences; these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . #*** 152 lines of b138_ContigInfo.bcp.gz either had no lift-coords #*** or had unrecognized chrom names; see #*** /hive/data/outside/dbSNP/138/mouse/cantLiftUpSeqNames.txt . # #*** You must account for those in config.ra, in the liftUp file #*** and/or ignoreDbSnpContigsFile or the ignoreDbSnpContigs regex. #*** Then run again with -continue=loadDbSnp . # #*** NOTE: If you add the ncbiAssemblyReportFile setting to config.ra and #*** run again with -continue=loadDbSnp, this script may be able to #*** construct those files for you. 
# Look at the doDbSnp.pl -help message for instructions about how to find the # Assembly Reports file for GRCm38 on the NCBI web site. wget ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001635.22.assembly.txt cat >> config.ra <<EOF ncbiAssemblyReportFile GCF_000001635.22.assembly.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # Third stop: review the list of dbSNP contigs that we can't map, and if they're # all contigs not in our assembly, tell config.ra to ignore them. #*** b138_ContigInfo has coords for 119 sequences; these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . # #*** GCF_000001635.22.assembly.txt has mappings for 44 sequences; #*** these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . # #*** 108 lines of b138_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/138/mouse/cantLiftUpSeqNames.txt . # #*** You must account for all 271 contig_acc values in config.ra, #*** in the liftUp file and/or ignoreDbSnpContigsFile. #*** Then run again with -continue=loadDbSnp . cut -f 2 cantLiftUpSeqNames.txt > contigsNotInUCSC.txt cat >> config.ra <<EOF liftUp suggested.lft ignoreDbSnpContigsFile contigsNotInUCSC.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # The script died with an error implying that a perl command in a pipe got # empty input from sort which was getting input from an hgsql query to join # Batch submitter handles with rs# snp_id's. Looks like the mysql connection # was lost or something. 
Anyway, re-running that part of addToDbSnp.csh # in 2 parts and continuing manually through the end of addToDbSnp.csh: pushd `cat workingDir ` hgsql mm10snp138 -NBe 'select SNPSubSNPLink.snp_id, handle from SubSNP, SNPSubSNPLink, Batch \ where SubSNP.subsnp_id = SNPSubSNPLink.subsnp_id and \ SubSNP.batch_id = Batch.batch_id' \ | sort -k1n,1n -k2,2 -u \ > tmp.txt perl -we 'while (<>) { \ chomp; my ($id, $handle) = split("\t"); \ if (defined $prevId && $prevId != $id) { \ print "$prevId\t$handleCount\t$handleBlob\n"; \ $handleCount = 0; $handleBlob = ""; \ } \ $handleCount++; \ $handleBlob .= "$handle,"; \ $prevId = $id; \ } \ print "$prevId\t$handleCount\t$handleBlob\n";' \ tmp.txt > ucscHandles.txt cat > ucscHandles.sql <<EOF CREATE TABLE ucscHandles ( snp_id int NOT NULL, handleCount int unsigned NOT NULL, handles longblob NOT NULL, INDEX snp_id (snp_id) ); EOF hgLoadSqlTab mm10snp138 ucscHandles{,.sql,.txt} # I added 'if (0) then' around the parts of addToDbSnp.csh that completed successfully; # complete the step by running the modified script: # Pop back out of workingDir popd addToDbSnp.csh >>& do.log & tail -f do.log # Now continue with the next step: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin >>& do.log & tail -f do.log # *** All done! 
############################################################################## # FILTER SNP138 (DONE 1/17/14 angie) cd /hive/data/outside/dbSNP/138/mouse zcat snp138.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 3066546 #Common: 8082414 #Flagged: 0 #leftover: 60824824 foreach f ({Mult,Common}.bed.gz) mv $f snp138$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (138) (DONE 1/17/14 angie) cd /hive/data/outside/dbSNP/138/mouse # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #1584257 ncbiCodingAnnotations.txt # How many & what kinds of function types? 
cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 372821 3 (coding-synon) # 552828 8 (cds-reference -- ignored) # 376 41 (nonsense) # 181984 42 (missense) # 49 43 (stop-loss) # 3382 44 (frameshift) # 472817 45 (cds-indel) # In b138, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... # probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut;' \ ncbiCodingAnnotations.txt \ | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin hgLoadBed mm10 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp138CodingDbSnp.bed #Read 1025678 elements of size 11 from snp138CodingDbSnp.bed ############################################################################## 2013-12-13: import of UCSC GENCODE group processing of GENCODE VM2 (markd) mkdir -p /hive/data/genomes/mm10/bed/gencodeVM2 cd /hive/data/genomes/mm10/bed/gencodeVM2 # create Makefile from previous one. cp /hive/data/genomes/hg19/bed/gencodeV19/Makefile . # download, build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. # NOT DONE THIS TIME, SINCE THIS is the first mouse. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM2 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp human/mm10/wgEncodeGencodeV18.ra human/mm10/wgEncodeGencodeVM2.ra cp human/mm10/wgEncodeGencodeV18.html human/mm10/wgEncodeGencodeVM2.html # edit these plus human/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM2.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous -0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. 
### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM2", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM2 make joinerCheck # see output in check/joiner.out ############################################################################## # SEGMENTAL DUPLICATIONS (WORKING 4/14/14 Pauline) # File emailed from John Huddleston (jlhudd@uw.edu) in the Eichler Lab. mkdir /hive/data/genomes/mm10/bed/genomicSuperDups cd /hive/data/genomes/mm10/bed/genomicSuperDups wget --timestamping 'http://mouseparalogy.gs.washington.edu/GRCm38/genomicSuperDup.tab' mv genomicSuperDup.tab mm10_WGAC.tab awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm10_WGAC.tab \ | hgLoadBed mm10 genomicSuperDups stdin \ -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql # mm8 version of track had issue where strand values were "+" and "_" -- # checked and found same issue - so ran same fix: hgsql mm10 -e 'update genomicSuperDups set strand = "-" where strand = "_";' #new mm10 version has a lot more stuff than version on mm8: #featureBits mm8 genomicSuperDups #157417547 bases of 2567283971 (6.132%) in intersection #featureBits mm10 genomicSuperDups #214917441 bases of 2652783500 (8.102%) in intersection #select count(*) from genomicSuperDups; #659775 (vs. 
277816 in mm8) # ######################################################################### # hgPal downloads (DONE braney 2009-11-03) # FASTA from 60way for refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/mm10/bed/multiz60way/pal mkdir /cluster/data/mm10/bed/multiz60way/pal cd /cluster/data/mm10/bed/multiz60way/pal for i in `cat ../species.list`; do echo $i; done > order.lst mz=multiz60way gp=refGene db=mm10 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs nice time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # 1817.21user 233.92system 4:54:04elapsed 11%CPU (0avgtext+0avgdata # 920192maxresident)k # 6024inputs+0outputs (7major+1648126minor)pagefaults 0swaps mz=multiz60way gp=refGene db=mm10 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz60way gp=refGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz60way gp=knownGene db=mm10 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > 
ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # oops... missed the timing mz=multiz60way gp=knownGene db=mm10 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/mm10/bed/multiz60way/pal mz=multiz60way gp=knownCanonical db=mm10 for j in `awk '{print $1}' /cluster/data/mm10/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 
302m20.489s # user 27m31.179s # sys 5m30.071s rm *.known.bed mz=multiz60way gp=knownCanonical db=mm10 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz60way gp=knownCanonical db=mm10 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ############################################################################## # LASTZ Rhesus rheMac2 (DONE - 2014-05-23 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 cd /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 cat << '_EOF_' > DEF # rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac2 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10RheMac2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 257m21.255s cat fb.mm10.chainRheMac2Link.txt # 895296744 bases of 2652783500 (33.749%) in intersection mkdir /hive/data/genomes/rheMac2/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m36.098s cat fb.rheMac2.chainMm10Link.txt # 875700775 bases of 2646704109 (33.086%) in intersection ############################################################################ # FaceBase Microarray track (DONE - 2014-05-21 - Pauline) # establish a screen to control this job with a name to indicate what it is mkdir /hive/data/genomes/mm10/bed/FaceBase24SampleTypesAvg cd /hive/data/genomes/mm10/bed/FaceBase24SampleTypesAvg wget --timestamping http://genomebrowser.facebase.org/myHub/mm10/FaceBase_24Samp_Types_Averaged.bed hgLoadBed mm10 FaceBase24SampleTypesAvg FaceBase_24Samp_Types_Averaged.bed #For microarray tracks also need to add a section to #/cluster/home/pauline/kent/src/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra ############################################################################## # RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/rmskJoined cd /hive/data/genomes/mm10/bed/rmskJoined ln -s ../repeatMasker/mm10.sorted.fa.out . ln -s ../repeatMasker/mm10.fa.align.gz . 
# working on fixing this script for the next release of RM
# since mm10 was an older version of RM, this conversion needs the
# bedtools, thus the extra PATH business
export PATH=/cluster/bin/bedtools:$PATH
/scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \
    -out mm10.sorted.fa.out -align mm10.fa.align.gz

hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
    -renameSqlTable -verbose=4 -tab \
    -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as mm10 \
    rmskJoinedBaseline mm10.sorted.fa.join.bed \
    > loadJoined.log 2>&1

hgLoadSqlTab mm10 rmskAlignBaseline \
    /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \
    mm10.fa.align.tsv > loadAlign.log 2>&1

hgLoadOutJoined -verbose=2 mm10 mm10.sorted.fa.out > loadOut.log 2>&1

featureBits -countGaps mm10 rmskJoinedBaseline
# 2243474717 bases of 2730871774 (82.152%) in intersection

##############################################################################
# cloneEnds (DONE - 2014-08-11 - Steve)

mkdir /hive/data/genomes/mm10/bed/cloneEnds
cd /hive/data/genomes/mm10/bed/cloneEnds

# fetch the NCBI INSDC name correspondence file:
rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001635.23.assembly.txt ./

# fetch the clone reports
mkdir reports
rsync -a -P \
    rsync://ftp.ncbi.nih.gov/repository/clone/reports/Mus_musculus/*.GCF_000001635.22.103.*.gff \
    ./reports/

# script to establish refSeq to UCSC chrom names:

cat << '_EOF_' > refSeqNames.pl
#!/usr/bin/env perl

# Reads the NCBI assembly report in the current directory and prints one
# tab-separated line per sequence:  <refSeq accession> <UCSC sequence name>
# Rows missing any of the columns used below are skipped.

use strict;
use warnings;

my $report = "GCF_000001635.23.assembly.txt";

open (my $fh, "<", $report) or die "can not read $report: $!";
while (my $line = <$fh>) {
  chomp $line;
  next if ($line =~ m/^#/);
  my @a = split('\t', $line);
  my $chrN = $a[2];
  my $refSeq = $a[6];
  my $contig = $a[4];
  my $type = $a[1];
  next if (!defined $type);
  next if (!defined $refSeq);
  next if (!defined $contig);
  my $suffix = "";
  if ($type eq "alt-scaffold") {
     $suffix = "_alt";
  } elsif ($type eq "unlocalized-scaffold") {
     $suffix = "_random";
  } elsif ($type eq "unplaced-scaffold") {
     $chrN = "Un";
  }
  $chrN = "M" if ($chrN eq "MT");
  # rows whose first column contains an underscore get the contig accession
  # folded into the UCSC name; all others are plain chrN
  if ($a[0] =~ m/_/) {
    # strip the complete accession version (".1", ".12", ...); removing only
    # a single digit would leave a stray digit behind for versions >= 10
    $contig =~ s/\.[0-9]+$//;
    printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix;
  } else {
    printf "%s\tchr%s\n", $refSeq, $chrN;
  }
}
close ($fh);
'_EOF_'
# << happy emacs
chmod +x refSeqNames.pl

./refSeqNames.pl > refSeq.ucscName.tab

# establish full library list:
ls reports/*.GCF_000001635.22.103.*.gff | sed -e 's#reports/##' \
    | cut -d"." -f1 | sort -u > library.list.txt

# a script to scan the GFF files, with the refSeq.ucscName.tab
# name correspondence to construct bed files

cat << '_EOF_' > mm10.pl
#!/usr/bin/env perl

# Converts clone-report GFF files into bed records.
#   stdout: one 12-column bed line (two blocks) for every clone_insert that
#           has both a clone_insert_start and a clone_insert_end on the
#           same chrom as the insert itself
#   stderr: one 6-column bed line for every GFF feature processed, mixed with
#           '#'-prefixed diagnostics.  The driver loop that runs this script
#           redirects stderr into tmp.bed6, so every diagnostic must be a
#           complete line of its own.

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc < 1) {
  print STDERR "usage: ./mm10.pl <report.gff> [moreReports.gff]\n";
  exit 255;
}

my %refSeqToUcsc;   # key is refSeq name, value is UCSC chrom name
open (my $mapFh, "<", "refSeq.ucscName.tab")
    or die "can not read refSeq.ucscName.tab: $!";
while (my $line = <$mapFh>) {
  chomp $line;
  my ($refSeq, $ucsc) = split('\t', $line);
  $refSeqToUcsc{$refSeq} = $ucsc;
}
close ($mapFh);

my %chromSizes;   # key is UCSC chrom name, value is chrom size
open (my $sizeFh, "<", "/hive/data/genomes/mm10/chrom.sizes")
    or die "can not read mm10/chrom.sizes: $!";
while (my $line = <$sizeFh>) {
  chomp $line;
  my ($chr, $size) = split('\t', $line);
  $chromSizes{$chr} = $size;
}
close ($sizeFh);

while (my $file = shift) {

my %starts;   # key is parent ID, value is "chromStart\tchromEnd" of the start read
my %ends;     # key is parent ID, value is "chromStart\tchromEnd" of the end read
my %parents;  # key is parent ID, value is the UCSC chrom the clone_insert is on
my %endNames; # key is parent ID, value is the Name of the parent clone_insert

# diagnostics use print, not printf: the text contains file names and GFF
# attribute values, which must not be interpreted as printf formats
print STDERR "# processing $file\n";

# three-argument open: $file comes from the command line and must never be
# parsed for an open mode
open (my $gffFh, "<", $file) or die "can not read $file: $!";
while (my $line = <$gffFh>) {
  chomp $line;
  next if ($line=~ m/^#/);
  my @a = split('\t', $line);
  next if (scalar(@a) < 1);
  my $contig = $a[0];
  $contig =~ s/ref.//;
  $contig =~ s/\|//;
  my $ucscChr = $refSeqToUcsc{$contig};
  if (!defined($ucscChr)) {
    print STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n";
    next;
  }
  next if (! exists($chromSizes{$ucscChr}));
  my $chromSize = $chromSizes{$ucscChr};
  my $chromStart = $a[3] - 1;   # GFF start is 1-based, bed start is 0-based
  my $chromEnd = $a[4];
  # clamp coordinates that run past the end of the chrom
  if ($chromStart > $chromSize) {
    print STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n";
    $chromStart = $chromSize-1;
  }
  if ($chromEnd > $chromSize) {
    my $overRun = $chromEnd - $chromSize;
    print STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n";
    $chromEnd = $chromSize;
  }
  my $id="notFound";
  my $name="notFound";
  my $parent="notFound";
  # column 9 is a ';' separated list of tag=value attributes
  my @b = split(';', $a[8]);
  for (my $i = 0; $i < scalar(@b); ++$i) {
    my ($tag, $value) = split('=', $b[$i]);
    if ($tag eq "ID") {
      $id = $value;
      # an ID without '-' is recorded as a parent (clone_insert) ID
      if ($id !~ m/-/) {
        if (exists($parents{$id})) {
          # the trailing newline is required: without it the bed6 line
          # printed below is appended to this comment line and is lost
          print STDERR "# WARN: duplicate parent: $id\n";
        } else {
          $parents{$id} = $ucscChr;
        }
      }
    } elsif ($tag eq "Parent") {
      $parent = $value;
    } elsif ($tag eq "Name") {
      $name = $value;
    }
  }
  my $type="notFound";
  my $insertType = $a[2];
  if ($insertType =~ m/clone_insert_start/) {
    $type = "start";
    if ($parent eq "notFound") {
      print STDERR "# ERR: can not find parent for start $name Ttype $id\n";
    } else {
      if (!exists($parents{$parent})) {
        print STDERR "# ERR: start found $name with no parent $parent declared\n";
      } elsif (exists($starts{$parent})) {
        print STDERR "# ERR: duplicate start for $parent\n";
      } elsif ($ucscChr eq $parents{$parent}) {
        $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
      } else {
        print STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n";
      }
    }
  } elsif ($insertType =~ m/clone_insert_end/) {
    $type = "end";
    if ($parent eq "notFound") {
      print STDERR "# ERR: can not find parent for end $name Ttype $id\n";
    } else {
      if (!exists($parents{$parent})) {
        print STDERR "# ERR: end found $name with no parent $parent declared\n";
      } elsif (exists($ends{$parent})) {
        print STDERR "# ERR: duplicate end for $parent\n";
      } elsif ($ucscChr eq $parents{$parent}) {
        $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
      } else {
        print STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n";
      }
    }
  } elsif ($insertType =~ m/clone_insert/) {
    $type = "insert";
    $endNames{$id} = $name;
  }
  $name =~ s/gi\|//g;
  $id =~ s/gi\|//g;
  # bed6 item line, deliberately on stderr (see header comment)
  printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6];
}	# while (my $line = <$gffFh>)

close ($gffFh);

foreach my $parent (keys %parents) {
  if (! exists($starts{$parent}) ) {
    print STDERR "# ERR: no start for $parent\n";
  } elsif (! exists($ends{$parent}) ) {
    print STDERR "# ERR: no end for $parent\n";
  } else {
    my $strand = "+";
    my $chrStart = 0;
    my $chrEnd = 0;
    my $blockStart = 0;
    my ($sStart, $sEnd) = split('\t', $starts{$parent});
    my ($eStart, $eEnd) = split('\t', $ends{$parent});
    my $startSize = $sEnd - $sStart;
    my $endSize = $eEnd - $eStart;
    if ($eStart < $sStart) {
      # end read lies before the start read: minus strand, and the end read
      # becomes the first block
      $chrStart = $eStart;
      $chrEnd = $sEnd;
      $blockStart = $sStart - $chrStart;
      $strand = "-";
      $startSize = $eEnd - $eStart;
      $endSize = $sEnd - $sStart;
    } else {
      $chrStart = $sStart;
      $chrEnd = $eEnd;
      $blockStart = $eStart - $chrStart;
    }
    if ($startSize > $blockStart) {
      # first block would run into the second block: report, emit nothing
      print STDERR "# startSize > blockStart $endNames{$parent}\n";
    } else {
      printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart;
    }
  }
}
}
'_EOF_'
# << happy emacs
chmod +x mm10.pl

# process GFF files into bed files into separateLibs/ directory
for L in `cat library.list.txt`
do
   export db="`pwd -P | awk -F'/' '{print $5}'`"
   export destDir="separateLibs/${L}"
   echo "working: ${L}"
   mkdir -p "${destDir}"
   ./${db}.pl reports/${L}.GCF_000001635.22.103.*.gff \
       2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/${db}.${L}.bed
   sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/${db}.${L}.items.bed6
done

# use only those libraries with more than 20,000 clone ends
wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 >
20000' \
       | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list

    # note those libraries with less than 20,000 clone ends
    wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' |
       sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list

    # filter out bad ends: items with length < three times the library
    # median length are kept in mm10.median3X.bed, items with
    # length >= three times the median go to mm10.badMap.bed
    # (reads libs.over20K.list, the list constructed above)
    cat libs.over20K.list | while read L
do
   # lengths.txt is only computed once per library
   if [ ! -s separateLibs/${L}/lengths.txt ]; then
      awk '{print $3-$2}' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/lengths.txt
   fi
   median3X=`ave separateLibs/${L}/lengths.txt | grep median | awk '{printf "%d", $2*3}'`
   awk '($3-$2) < '$median3X'' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/mm10.median3X.bed
   awk '($3-$2) >= '$median3X'' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/mm10.badMap.bed
   before=`cat separateLibs/${L}/mm10.${L}.bed | wc -l`
   after=`cat separateLibs/${L}/mm10.median3X.bed | wc -l`
   dropped=`echo $before $after | awk '{print $1-$2}'`
   perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'`
   echo "$L $before - $after = $dropped -> % $perCent dropped"
done
# B6Ng01 96548 - 95837 = 711 -> % 0.74 dropped
# C3H 42705 - 42378 = 327 -> % 0.77 dropped
# CH29 51200 - 50621 = 579 -> % 1.13 dropped
# DN 101826 - 100472 = 1354 -> % 1.33 dropped
# MHPN 59859 - 58582 = 1277 -> % 2.13 dropped
# MHPP 29074 - 28550 = 524 -> % 1.80 dropped
# MSMg01 81802 - 78772 = 3030 -> % 3.70 dropped
# RP23 83424 - 83062 = 362 -> % 0.43 dropped
# RP24 51112 - 50849 = 263 -> % 0.51 dropped
# WI1 326662 - 324259 = 2403 -> % 0.74 dropped
# bMQ 73519 - 72540 = 979 -> % 1.33 dropped

    # loading the median3X files
    mkdir -p filteredEnds
for L in `cat libs.over20K.list`
do
    echo $L 1>&2
    hgLoadBed -type=bed12 mm10 cloneEnd_${L} \
       separateLibs/${L}/mm10.median3X.bed \
          > filteredEnds/loadBed.${L}.log 2>&1
done

    # construct multiple mapped ends:
    # count how often each name (column 4) occurs over all libraries,
    # names seen more than once are the multiply mapped ends
    cat separateLibs/*/mm10.median3X.bed | cut -f4 | sort | uniq -c | sort -rn > allEnds.names.count.txt

    awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' | sort >
multiples.names.txt cat separateLibs/*/mm10.median3X.bed | sort -k4 > allEnds.nameSorted.bed join -t' ' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" -2 4 multiples.names.txt allEnds.nameSorted.bed | sort -k1,1 -k2,2n > allEnds.multiple.locations.bed hgLoadBed -type=bed12 mm10 cloneEnd_multipleMaps \ allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1 # construct bad mapped ends: mkdir -p filteredDroppedEnds for L in `cat libs.over20K.list` do echo $L 1>&2 cat separateLibs/${L}/mm10.badMap.bed done | sort -k1,1 -k2,2n > filteredDroppedEnds/badEnds.bed hgLoadBed -type=bed12 mm10 cloneEndbadEnds filteredDroppedEnds/badEnds.bed \ > filteredDroppedEnds/loadBed.badEnds.log 2>&1 # construct coverage bigWig files: cat separateLibs/*/mm10.median3X.bed | awk '$6 == "+"' | sort -k1,1 -k2,2n \ | bedItemOverlapCount mm10 stdin > allEnds.forward.bedGraph cat separateLibs/*/mm10.median3X.bed | awk '$6 == "-"' | sort -k1,1 -k2,2n \ | bedItemOverlapCount mm10 stdin > allEnds.reverse.bedGraph bedGraphToBigWig allEnds.forward.bedGraph /hive/data/genomes/mm10/chrom.sizes \ cloneEnd_coverageForward.bw bedGraphToBigWig allEnds.reverse.bedGraph /hive/data/genomes/mm10/chrom.sizes \ cloneEnd_coverageReverse.bw mkdir /gbdb/mm10/bbi/cloneEnd ln -s `pwd`/cloneEnd_coverageForward.bw /gbdb/mm10/bbi/cloneEnd ln -s `pwd`/cloneEnd_coverageReverse.bw /gbdb/mm10/bbi/cloneEnd hgBbiDbLink mm10 cloneEnd_coverageForward \ /gbdb/mm10/bbi/cloneEnd/cloneEnd_coverageForward.bw hgBbiDbLink mm10 cloneEnd_coverageReverse \ /gbdb/mm10/bbi/cloneEnd/cloneEnd_coverageReverse.bw ### Fixup the scores to indicate how many multiple mappings as mentioned ### in the hg19 bacEnds description page: one mapping: score = 1000 ### multiple mappings: score = 1500/count ### the sort | uniq -c | awk does this score calculation with the name ### in column 1 ### The join puts the existing table together with those scores ### DONE - 2016-03-02 - Hiram mkdir /hive/data/genomes/mm10/bed/cloneEnds/addCounts cd 
/hive/data/genomes/mm10/bed/cloneEnds/addCounts

    # score/     - name to score table for each cloneEnd table
    # noScore/   - dump of each existing table, sorted for the join
    # withScore/ - the rows rebuilt with the new score column
    mkdir score noScore withScore

for table in cloneEndB6Ng01 cloneEndC3H cloneEndCH29 cloneEndDN \
   cloneEndMHPN cloneEndMHPP cloneEndMSMg01 cloneEndRP23 cloneEndRP24 \
   cloneEndWI1 cloneEndbMQ cloneEndbadEnds cloneEndmultipleMaps
do
  # uniq -c gives "count name": a count of 1 scores 1000,
  # any other count scores 1500/count
  hgsql -N -e "select name from $table;" mm10 | sort | uniq -c |
      awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \
         | sort > score/mm10.$table.score.tab
  hgsql -N -e "select * from $table order by name;" mm10 \
      | sort -k5 > noScore/mm10.$table.tab
  # join on the name (column 5 of the table dump, column 1 of the score
  # file); the joined line has the name first and the new score last
  # (field 14), the awk puts the columns back in table order with the
  # new score in place of the old one
  join -t$'\t' -1 5 noScore/mm10.$table.tab score/mm10.$table.score.tab \
      | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \
         | sort -k2,2 -k3,3n > withScore/mm10.$table.withScore.tab
  hgsql -e "delete from $table;" mm10
  hgsql -e "load data local infile \"withScore/mm10.$table.withScore.tab\" into table $table;" mm10
done

    # row counts after the reload:
for table in cloneEndB6Ng01 cloneEndC3H cloneEndCH29 cloneEndDN \
   cloneEndMHPN cloneEndMHPP cloneEndMSMg01 cloneEndRP23 cloneEndRP24 \
   cloneEndWI1 cloneEndbMQ cloneEndbadEnds cloneEndmultipleMaps
do
   hgsql -N -e "select count(*) from $table;" mm10 | cat
done
# 95837
# 42378
# 50621
# 100472
# 58582
# 28550
# 78772
# 83062
# 50849
# 324259
# 72540
# 11809
# 4269

##############################################################################
# 2014-08-17: import of UCSC GENCODE group processing of GENCODE VM3 (markd)
    # download files
    mkdir -p /hive/data/genomes/mm10/bed/gencodeVM3/data
    cd /hive/data/genomes/mm10/bed/gencodeVM3

    # download gencode release
    cd data
    wget -nv -r -np ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M3
    mv ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M3 .
    rm -rf ftp.sanger.ac.uk
    cd ..

    # create Makefile from previous one.
    cp /hive/data/genomes/mm10/bed/gencodeVM2/Makefile .
# build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM3 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM2.ra mouse/mm10/wgEncodeGencodeVM3.ra cp mouse/mm10/wgEncodeGencodeVM2.html mouse/mm10/wgEncodeGencodeVM3.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM3.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. ### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM3", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode VM?' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM3 make joinerCheck # output in check/joiner.out ############################################################################## # LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve) mkdir /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 cd /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 cat << '_EOF_' > DEF # mouse vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/nib SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau8 SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 181m30.700s cat fb.mm10.chainBosTau8Link.txt # 698722925 bases of 2652783500 (26.339%) in intersection # Create link cd /hive/data/genomes/mm10/bed ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 # and the swap mkdir /hive/data/genomes/bosTau8/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau8/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 58m4.272s cat fb.bosTau8.chainMm10Link.txt # 687270584 bases of 2649307237 (25.942%) in intersection # Create link cd /hive/data/genomes/bosTau8/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################ # 2014-12-05: import of UCSC 
GENCODE group processing of GENCODE VM4 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM4/data cd /hive/data/genomes/mm10/bed/gencodeVM4 # create Makefile from previous one. # WARNING: next build start with hg/makeDb/outside/gencode/gencodeLoad.mk cp /hive/data/genomes/hg38/bed/gencodeV21/Makefile . # download, build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM4 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM2.ra mouse/mm10/wgEncodeGencodeVM4.ra cp mouse/mm10/wgEncodeGencodeVM2.html mouse/mm10/wgEncodeGencodeVM4.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM4.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. ### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM4", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode VM?' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM4 make joinerCheck # output in check/joiner.out ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## # LASTZ mouse/mm10 sheep/oviAri3 - (DONE - 2015-01-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 cd /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 cp -p \ /hive/users/hiram/multiz/100way/mm10.oviAri3/mm10.oviAri3.tuning.top400.txt \ ./mm10.oviAri3.tuning.Q.txt cat << '_EOF_' > DEF # mouse vs sheep # parameters obtained from a tuning run of lastz_D # /hive/users/hiram/multiz/100way/mm10.oviAri3/mm10.oviAri3.tuning.top400.txt BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz BLASTZ_T=2 BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=254 BLASTZ_X=890 BLASTZ_Y=3400 BLASTZ_Q=/hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08/mm10.oviAri3.tuning.Q.txt # A C G T # A 89 -172 -40 -184 # C -172 100 -121 -40 # G -40 -121 100 -172 # T -184 -40 -172 89 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: sheep oviAri3 SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 75m27.412s cat fb.mm10.chainOviAri3Link.txt # 432006690 bases of 2652783500 (16.285%) in intersection time 
(doRecipBest.pl -buildDir=`pwd` mm10 oviAri3) > rbest.log 2>&1 &
    # real    17m24.577s

    # and for the swap:
    mkdir /hive/data/genomes/oviAri3/bed/blastz.mm10.swap
    cd /hive/data/genomes/oviAri3/bed/blastz.mm10.swap

    time (doBlastzChainNet.pl -verbose=2 \
      /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08/DEF \
        -swap -chainMinScore=3000 -chainLinearGap=medium \
          -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
            -syntenicNet) > swap.log 2>&1
    #  real    31m27.481s

    cat fb.oviAri3.chainMm10Link.txt
    # 422549165 bases of 2534335866 (16.673%) in intersection

    time (doRecipBest.pl -buildDir=`pwd` oviAri3 mm10) > rbest.log 2>&1
    # real    16m45.956s

#########################################################################
# RETROFINDER RETROPOSED GENES ucscRetro track VERSION 6
# (2015-01-02 - 2015-01-07, hartera, DONE)
ssh hgwdev
# the run directory; the same path is used by the cd below and by
# RUNDIR ($DB.$DATE) in the DEF file
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102
cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102

cat << '_EOF_' > DEF
RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
RUNDATE="2015-01-02"
DB=mm10
SCORETHRESH=510
GENOMENAME='Mus musculus'
GBDB=mm
DATE=20150102
VERSION=6
RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE
BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin
KENTDIR=/cluster/home/hartera/kent
KENTBINDIR=/cluster/bin/x86_64
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION
TMPMRNA=$RUNDIR/mrnaBlastz/$DB
TMPEST=$RUNDIR/est/$DB
USEALTSEQS=0
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=1
LASTZPROG=/cluster/bin/penn/x86_64/lastz
SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline
GENOME=/hive/data/genomes
TWOBIT=$GENOME/$DB/$DB.2bit
RETRODIR=$GENOME/$DB/bed/retro
BASE=$RUNDIR/retro
BASE=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/retro
OUTDIR=${BASE}/version${VERSION}/${DB}
RESULT=$OUTDIR/result
RESULTSPLIT=$OUTDIR/resultSplit
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
TABLE=ucscRetroInfo$VERSION
ORTHOTABLE=ucscRetroOrtho$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB NIB=$LOCAL/nib RMSK=rmsk NET1=netHg38 NET2=netCanFam3 NET3=netRn5 GENE1=knownGene GENE2=refGene GENE3=wgEncodeGencodeCompVM4 CLUSTER=ku SPECIES="hg38 mm10" ROOTDIR="~/public_html/retro/mm10Jul14" WEBROOT=$ROOTDIR/retro.$RUNDATE WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu SHUFFLEDIR=shuffle SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR DUPDIR=dups DUPROOT=$WEBROOT/$DUPDIR AGEDIR=age AGEROOT=$WEBROOT/$AGEDIR EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE=affyGnf1m ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio ARRAYABS=hgFixed.gnfMouseAtlas2All ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps # ARRAYLOOKUP=knownToGnfAtlas2 #ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl" ALTSPLICE=sibTxGraph SPLITBYAGE=$SCRIPT/splitRetrosByAgeMouse PDB=proteins140122 BREAKS=0,8,16,24,32 XLIM=34 YLIM=0.1 YLIM1=4000 YLIM2=160 MAXDIVERGENCE=32 '_EOF_' # << happy emacs chmod +x DEF mkdir -p /hive/data/genomes/mm10/bed/retro mkdir -p /hive/data/genomes/mm10/bed/mrnaBlastz.6 cd /hive/data/genomes/mm10/bed/mrnaBlastz.6 # Create S1.len file foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y) echo $c hgsql -Ne "select chrom, size from chromInfo where chrom='chr${c}';" mm10 \ >> S1.len end # NOTE: in future, use /hive/data/genomes/mm10/chrom.sizes for S1.len # and just remove randoms and chrM. cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102 mkdir mrnaBlastz cd mrnaBlastz cp ../DEF . cp /hive/data/genomes/mm10/bed/mrnaBlastz.6/S1.len . 
screen # Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree: retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF # check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF #check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF #check cluster jobs on ku # Load the track retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102/retro/version6/mm10 retroFinder/branches/version2/src/pipeline/filterMrna.sh retroFinder/branches/version2/src/pipeline/filterEst.sh # Check cluster jobs on ku retroFinder/branches/version2/src/pipeline/analyseExpress.sh # Check cluster jobs on ku #added ucscRetroAli6 to kent/src/hg/makeDb/mouse/mm10/trackDb.ra # copied # /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102/retro/version6/mm10/trackDb.retro # entry to kent/src/hg/makeDb/trackDb/mouse/mm10/trackDb.ra # and edited it to remove the full data and add: # dataVersion Jan. 
2015 # Scripts copied ucscRetroAli6.psl, ucscRetroInfo6.bed and ucscRetroCds6.tab # to /hive/data/genomes/mm10/bed/retro/ ############################################################################## # LASTZ mouse/mm10 tarsier/tarSyr2 - (DONE - 2015-03-27 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 cd /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 cat << '_EOF_' > DEF # tarsier vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Tarsier TarSyr2 SEQ2_DIR=/hive/data/genomes/tarSyr2/tarSyr2.2bit SEQ2_LEN=/hive/data/genomes/tarSyr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 301m17.238s time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=syntenicNet -syntenicNet -workhorse=hgwdev \ -smallClusterHub=ku -bigClusterHub=ku) > synNet.log 2>&1 # real 16m5.061s cat fb.mm10.chainTarSyr2Link.txt # 856877439 bases of 2652783500 (32.301%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 tarSyr2) > rbest.log 2>&1 & # real 27m4.048s # and for the swap: mkdir /hive/data/genomes/tarSyr2/bed/blastz.mm10.swap cd /hive/data/genomes/tarSyr2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 181m7.042s cat fb.tarSyr2.chainMm10Link.txt # 900229088 bases of 3405755564 (26.433%) in intersection time (doRecipBest.pl -buildDir=`pwd` tarSyr2 mm10) > rbest.log 2>&1 #
real 77m29.742s ######################################################################### # UCSC to RefSeq name correspondence (DONE - 2015-04-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/ucscToRefSeq cd /hive/data/genomes/mm10/bed/ucscToRefSeq rsync -avPL \ rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCA_000001635.5_GRCm38.p3/GCA_000001635.5_GRCm38.p3_assembly_report.txt ./ # this assembly_report has "UCSC-style-name" in column 10 # but it does not name everything # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" GCA_000001635.5_GRCm38.p3_assembly_report.txt \ | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab # chrM/MT confusion fixed by sed hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' mm10 \ | sed -e 's/NC_005089.1/AY172335.1/;' | sort > insdc.ucsc.tab join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \ | cut -f2- > ucsc.refSeq.tab export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab mm10 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab checkTableCoords mm10 -table=ucscToRefSeq featureBits -countGaps mm10 ucscToRefSeq # 2730871774 bases of 2730871774 (100.000%) in intersection # fixup 2016-04-11 - Hiram # the chrM name is not correct, it was RefSeq instead of Genbank/INSDC: hgsql -e 'select * from ucscToINSDC where name="NC_005089.1";' mm10 +-------+------------+----------+-------------+ | chrom | chromStart | chromEnd | name | +-------+------------+----------+-------------+ | chrM | 0 | 16299 | NC_005089.1 | +-------+------------+----------+-------------+ hgsql -e 'update ucscToINSDC set name="AY172335.1" where name="NC_005089.1";' mm10 hgsql -e 'select * from ucscToINSDC where name="AY172335.1";' mm10 +-------+------------+----------+------------+ | chrom | chromStart | chromEnd | name | 
+-------+------------+----------+------------+ | chrM | 0 | 16299 | AY172335.1 | +-------+------------+----------+-------------+ ######################################################################### # download and load ncbiGene track ( DONE - 2015-06-09 - Brian) db=mm10 mkdir /cluster/data/genomes/$db/bed/ncbiGene cd /cluster/data/genomes/$db/bed/ncbiGene ftpFile=ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus/GFF/ref_GRCm38.p3_top_level.gff3.gz gff3File=`basename $ftpFile` echo "select * from ucscToRefSeq" | hgsql $db | tail -n +2 | awk '{print 0, $4, $3, $1, $3}' > refSeqToUcsc.lft rm -f $ftpFile wget $ftpFile /cluster/home/braney/bin/x86_64/gff3ToGenePred -useName -warnAndContinue -attrsOut=attrs -bad=bad.gp $gff3File stdout 2> convertErr.txt | liftUp -type=.gp -extGenePred lift.gp refSeqToUcsc.lft warn stdin 2> liftErr.txt wc -l lift.gp # 108567 lift.gp wc -l bad.gp # 189 tawk '{print $1}' attrs | sort | uniq > meta wc -l meta # 110847 meta for i in product Dbxref gene gbkey do echo $i tawk -v attr=$i '$2==attr {print $1,$3}' attrs | sort | uniq | join -t $'\t' /dev/stdin meta > out mv out meta done wc -l meta # 109420 meta egrep "^N(M|R|P)" lift.gp > curated.gp egrep "^X(M|R)" lift.gp > predicted.gp wc -l curated.gp predicted.gp #33545 curated.gp #70587 predicted.gp #104132 total cat curated.gp predicted.gp | awk '{print $1}' | sort -u > tmp1 cat meta | awk '{print $1}' | sort -u > tmp2 join -v 1 tmp1 tmp2 | wc -l # 0 grep dropping convertErr.txt | wc -l # 189 awk '/isn/ {print $1}' liftErr.txt | sort -u # NT_166322.1 # NT_187001.1 hgLoadGenePred -genePredExt $db ncbiRefCurated curated.gp hgLoadGenePred -genePredExt $db ncbiRefPredicted predicted.gp hgLoadSqlTab $db ncbiRefLink $kent/src/hg/lib/ncbiRefLink.sql meta hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("mm10", "ncbiRefSeq", "braney", "105", now(), \ "http://www.ncbi.nlm.nih.gov/genome/annotation_euk/Mus_musculus/105/", \ 
"ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus", \ "9 February 2015" );' hgFixed # ############################################################################# # hgPal downloads (DONE braney 2015-06-02) # CDS FASTA from 60-way for knownGene ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal cd /hive/data/genomes/mm10/bed/multiz60way/pal cat ../species.list | tr '[ ]' '[\n]' > order.lst export mz=multiz60way export gp=knownGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.lst stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.lst stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time nice sh -x $gp.jobs > $gp.jobs.log 2>&1 & # real 80m36.763s mz=multiz60way gp=knownGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m16.821s zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ # ############################################################################# # hgPal downloads (DONE jcasper 2016-06-22) # CDS FASTA from 60-way for knownGene - rebuilt for mm10 ucsc genes v16 ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc16 cd /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc16 cat ../species.list | tr '[ ]' '[\n]' > order.lst export mz=multiz60way export gp=knownGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 
../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.lst stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.lst stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time nice sh -x $gp.jobs > $gp.jobs.log 2>&1 # real 87m59.962s mz=multiz60way gp=knownGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m48.725s zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd rm -f $pd/$gp.exonAA.fa.gz $pd/$gp.exonNuc.fa.gz $pd/md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz (cd $pd && md5sum *.fa.gz) > md5sum.txt ln -s `pwd`/md5sum.txt $pd/ ########################################################################### # GENEID GENE PREDICTIONS (DONE - 2015-06-26 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/geneid cd /hive/data/genomes/mm10/bed/geneid wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/00README wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/mm10.geneid.prot wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/mm10.geneid.gtf ldHgGene -gtf -genePredExt mm10 geneid mm10.geneid.gtf # Read 36771 transcripts in 287332 lines in 1 files # 36771 groups 66 seqs 1 sources 3 feature types # 36771 gene predictions featureBits -enrichment mm10 refGene:CDS geneid # refGene:CDS 1.292%, geneid 1.584%, both 1.028%, cover 79.51%, enrich 50.19x featureBits -enrichment mm9 refGene:CDS geneid # refGene:CDS 1.305%, geneid 1.590%, both 1.040%, cover 79.65%, enrich 50.11x 
featureBits -countGaps mm10 geneid # 42028722 bases of 2730871774 (1.539%) in intersection featureBits -countGaps mm9 geneid # 41651898 bases of 2725765481 (1.528%) in intersection ########################################################################## # SGP GENES (DONE - 2015-07-30 - Hiram) mkdir /hive/data/genomes/mm10/bed/sgpGene cd /hive/data/genomes/mm10/bed/sgpGene wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/00README wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/mm10.sgp2.gtf wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/mm10.sgp2.gff3 ldHgGene -gtf -genePredExt mm10 sgpGene mm10.sgp2.gtf # Read 35235 transcripts in 287314 lines in 1 files # 35235 groups 60 seqs 1 sources 3 feature types # 35235 gene predictions featureBits -enrichment mm10 refGene:CDS sgpGene # refGene:CDS 1.292%, sgpGene 1.430%, both 1.101%, cover 85.21%, enrich 59.59x featureBits -enrichment mm9 refGene:CDS sgpGene # refGene:CDS 1.305%, sgpGene 1.439%, both 1.113%, cover 85.23%, enrich 59.23x ######################################################################### 2015-06-29-13: import of UCSC GENCODE group processing of GENCODE VM5 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM5 cd /hive/data/genomes/mm10/bed/gencodeVM5 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set # release and transcript support versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. 
## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM5 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp human/mm10/wgEncodeGencodeV18.ra human/mm10/wgEncodeGencodeVM5.ra cp human/mm10/wgEncodeGencodeV18.html human/mm10/wgEncodeGencodeVM5.html # edit these plus human/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM5.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM5 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # lastz zebrafish danRer10 (DONE - 2015-09-11 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DanRer10 mkdir /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 cd /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 cat << '_EOF_' > DEF # Mouse vs. 
zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebrafish danRer10 SEQ2_DIR=/hive/data/genomes/danRer10/danRer10.2bit SEQ2_LEN=/hive/data/genomes/danRer10/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 TMPDIR=/dev/shm '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 198m3.073s cat fb.mm10.chainDanRer10Link.txt # 73464192 bases of 2652783500 (2.769%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 danRer10) > rbest.log 2>&1 & # real 7m8.599s # and for the swap mkdir /hive/data/genomes/danRer10/bed/blastz.mm10.swap cd /hive/data/genomes/danRer10/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 16m8.387s cat fb.danRer10.chainMm10Link.txt # 71611488 bases of 1369683683 (5.228%) in intersection time (doRecipBest.pl -buildDir=`pwd` danRer10 mm10) > rbest.log 2>&1 # real 7m34.259s ######################################################################### 2015-10-02: import of UCSC GENCODE group processing of GENCODE VM7 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM7 cd /hive/data/genomes/mm10/bed/gencodeVM7 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set # release and transcript support versions # download, 
build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # ## Copy and update trackDb files from previous release.
## Change version and use lower priority so it sorts to top of
## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM4.ra mouse/mm10/wgEncodeGencodeVM7.ra cp mouse/mm10/wgEncodeGencodeVM4.html mouse/mm10/wgEncodeGencodeVM7.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM7.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM7 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # DBSNP 142 / SNP142 (DONE 2015-11-20 braney) # RedMine #15934 screen -S mm10dbSnp mkdir -p /hive/data/outside/dbSNP/142/mouse_mm10 cd /hive/data/outside/dbSNP/142/mouse_mm10 # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. 
cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 142 buildAssembly refAssemblyLabel GRCm38.p2 ncbiAssemblyReportFile GCF_000001635.22.assembly.txt ignoreDbSnpContigsFile dbSnpContigsNotInUcsc.txt liftUp suggested.lft EOF #actually ran the script a few times to get the above config.ra with values suggested ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log tail -f do.log # *** All done! ############################################################################## # FILTER SNP142 (DONE 2015-11-21 braney) cd /hive/data/outside/dbSNP/142/mouse_mm10 zcat snp142.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 3276456 #Common: 8213470 #Flagged: 0 #leftover: 70731318 foreach f ({Mult,Common}.bed.gz) mv $f snp142$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp142$subset -sqlTable=snp142.sql snp142$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (142) (DONE 2015-11-21 braney) cd /hive/data/outside/dbSNP/142/mouse_mm10 # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need
# to add 1 to the start coord. "\n"; To date we have only offered substs for download. It will just take longer. time sort -k4,4 -S10G --parallel=20 mm10.bed > mm10.s4.bed join -t $'\t' -1 4 -2 1 ../data/mm10.s4.bed ../data/seqAndPatentSummary.tab -o '1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 1.12 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.10 2.11 2.12' | patSeqFilterBulkAndAnnotate ../data/htPatents.txt patBulk.bed patNonBulk.bed -c ../data/seqCounts.tab bedSort patNonBulk.bed patNonBulk.bed bedSort patBulk.bed patBulk.bed bedToBigBed patNonBulk.bed /cluster/data/genomes/mm10/chrom.sizes patNonBulk.bb -tab -as=../patSummary.as -type=bed12+ bedToBigBed patBulk.bed /cluster/data/genomes/mm10/chrom.sizes patBulk.bb -tab -as=../patSummary.as -type=bed12+ hgBbiDbLink hg19 patBulk /gbdb/hg19/bbi/patBulk.bb hgBbiDbLink hg19 patNonBulk /gbdb/hg19/bbi/patNonBulk.bb ######################################################################### 2016-03-14: import of UCSC GENCODE group processing of GENCODE VM8 (markd) # not to be pushed to RR # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM8 cd /hive/data/genomes/mm10/bed/gencodeVM8 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. 
cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM7.ra mouse/mm10/wgEncodeGencodeVM8.ra cp mouse/mm10/wgEncodeGencodeVM7.html mouse/mm10/wgEncodeGencodeVM8.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM8.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM8 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### 2016-03-14: import of UCSC GENCODE group processing of GENCODE VM9 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM9 cd /hive/data/genomes/mm10/bed/gencodeVM9 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. 
cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM8.ra mouse/mm10/wgEncodeGencodeVM9.ra cp mouse/mm10/wgEncodeGencodeVM8.html mouse/mm10/wgEncodeGencodeVM9.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM9.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM9 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################## # LASTZ Rat rn6 (DONE - 2016-04-09 - Jonathan) mkdir /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 cd /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 printf '# rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn6 SEQ2_DIR=/hive/data/genomes/rn6/rn6.2bit SEQ2_LEN=/hive/data/genomes/rn6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10Rn6 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=medium) > do.log 2>&1 # real 501m43.495s cat fb.mm10.chainRn6Link.txt # 1880453869 bases of 2652783500 (70.886%) in intersection time (doRecipBest.pl 
-buildDir=`pwd` mm10 rn6) > rbest.log 2>&1 & # real 766m50.090s mkdir /hive/data/genomes/rn6/bed/blastz.mm10.swap cd /hive/data/genomes/rn6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=medium) > swap.log 2>&1 # real 234m59.393s cat fb.rn6.chainMm10Link.txt # 1938597957 bases of 2729860805 (71.015%) in intersection time (doRecipBest.pl -buildDir=`pwd` rn6 mm10) > rbest.log 2>&1 # real 882m38.624s ######################################################################### ## 4-Way Multiz for UCSC Genes construction (TBD - 2016-04-06 - Jonathan) # mm10, hg38, canFam3, rn6 mkdir /hive/data/genomes/mm10/bed/multiz4way cd /hive/data/genomes/mm10/bed/multiz4way # extract a tree for the 4 we need /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,mm10,canFam3,rn6 $HOME/kent/src/hg/utils/phyloTrees/191way.nh > 4way.nh # this looks like: ((hg38:0.145908,(mm10:0.084509,rn6:0.091589):0.271974):0.020593,canFam3:0.165928); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/mm10_4way.gif /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt # Use this output to create the table below grep -i mm10 4way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainMm10Link chain linearGap # distance on mm10 on other minScore # 1 0.176098 - rat rn6 (% 70.886) (% 71.015) 5000 medium # 2 0.502391 - human hg38 (% 35.372) (% 31.653) 3000 medium # 3 0.543004 - dog canFam3 (% 29.144) (% 31.624) 3000 medium # using the syntenic nets cd /cluster/data/mm10/bed/multiz4way mkdir mafLinks cd mafLinks mkdir rn6 canFam3 hg38 for D in hg38 canFam3 rn6 do cd $D ln -s ../../../lastz.${D}/mafSynNet/*.maf.gz ./ cd .. 
done # determine what is the newest version of multiz and use that cd /hive/data/genomes/mm10/bed/multiz4way mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # the autoMultiz cluster run ssh ku cd /hive/data/genomes/mm10/bed/multiz4way # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this # script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = mm10 set c = $1 set maf = $2 set binDir = /hive/data/genomes/mm10/bed/multiz4way/penn set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/mm10/bed/multiz4way/mafLinks rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /hive/data/genomes/mm10/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/mm10/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 66 jobs para try ... check ... push ... etc ... 
# Completed: 66 of 66 jobs # CPU time in finished jobs: 34495s 574.91m 9.58h 0.40d 0.001 y # IO & Wait Time: 826s 13.77m 0.23h 0.01d 0.000 y # Average job time: 535s 8.92m 0.15h 0.01d # Longest finished job: 2765s 46.08m 0.77h 0.03d # Submission to last job: 2776s 46.27m 0.77h 0.03d # combine results into a single file for loading and gbdb reference cd /hive/data/genomes/mm10/bed/multiz4way grep "^#" maf/chr1_GL456210_random.maf | grep -v "eof maf" > multiz4way.maf grep -h -v "^#" maf/*.maf >> multiz4way.maf grep "^#" maf/chr1_GL456210_random.maf | grep "eof maf" >> multiz4way.maf # makes a 6.5 Gb file: # -rw-rw-r-- 1 6928752890 Apr 12 10:18 multiz4way.maf # Load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz4way mkdir /gbdb/mm10/multiz4way ln -s /hive/data/genomes/mm10/bed/multiz4way/multiz4way.maf \ /gbdb/mm10/multiz4way # the hgLoadMaf generates huge tmp files, locate them in /dev/shm cd /dev/shm time nice -n +19 hgLoadMaf mm10 multiz4way # Loaded 5300158 mafs in 1 files from /gbdb/mm10/multiz4way # real 1m41.656s cd /hive/data/genomes/mm10/bed/multiz4way time (cat /gbdb/mm10/multiz4way/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=10000 \ -mergeGap=500 -maxSize=50000 mm10 multiz4waySummary stdin) # Created 1310955 summary blocks from 9774995 components and 5300158 mafs # real 2m27.913s mv /dev/shm/multiz4way.tab . # -rw-rw-r-- 1 277435502 Apr 12 12:11 multiz4way.tab # -rw-rw-r-- 1 59271980 Apr 12 12:16 multiz4waySummary.tab wc -l multiz4way*.tab # 5300158 multiz4way.tab # 1310955 multiz4waySummary.tab # 6611113 total ######################################################################### # LASTZ mouse/mm10 vs. chicken/galGal5 - (DONE - 2016-04-20 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 cd /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 printf "# Mouse vs. 
chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal5 SEQ2_DIR=/hive/data/genomes/galGal5/galGal5.2bit SEQ2_LEN=/hive/data/genomes/galGal5/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 112m25.946s cat fb.mm10.chainGalGal5Link.txt # 102343350 bases of 2652783500 (3.858%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 galGal5) > rbest.log 2>&1 & # real 170m24.948s # and for the swap: mkdir /hive/data/genomes/galGal5/bed/blastz.mm10.swap cd /hive/data/genomes/galGal5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 12m17.175s cat fb.galGal5.chainMm10Link.txt # 95753452 bases of 1218501075 (7.858%) in intersection time (doRecipBest.pl -buildDir=`pwd` galGal5 mm10) > rbest.log 2>&1 # real 138m37.610s ######################################################################### # LASTZ mouse/mm10 vs. 
Malayan flying lemur/galVar1 - (DONE - 2016-04-26 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 cd /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 printf "# mouse vs Malayan flying lemur BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Malayan flying lemur galVar1 SEQ2_DIR=/hive/data/genomes/galVar1/galVar1.2bit SEQ2_LEN=/hive/data/genomes/galVar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 TMPDIR=/dev/shm " > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 340m23.106s cat fb.mm10.chainGalVar1Link.txt # 944876157 bases of 2652783500 (35.618%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 galVar1) \ > rbest.log 2>&1 & # real 694m27.183s # and for the swap: mkdir /hive/data/genomes/galVar1/bed/blastz.mm10.swap cd /hive/data/genomes/galVar1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 173m45.678s cat fb.galVar1.chainMm10Link.txt # 1008272821 bases of 2802917674 (35.972%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galVar1 mm10) \ > rbest.log 2>&1 # real 856m16.458s ######################################################################### # lastz Chinese softshell turtle pelSin1 (DONE - 2016-05-10 - Hiram) # establish a screen to 
control this job with a name to indicate what it is screen -S mm10PelSin1 mkdir /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 cd /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 printf '# Mouse vs. Chinese softshell turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Chinese softshell turtle pelSin1 SEQ2_DIR=/hive/data/genomes/pelSin1/pelSin1.2bit SEQ2_LEN=/hive/data/genomes/pelSin1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 156m43.981s cat fb.mm10.chainPelSin1Link.txt # 113023930 bases of 2652783500 (4.261%) in intersection # forgot to include syntenicNet: time (doBlastzChainNet.pl -verbose=2 \ -continue=syntenicNet -syntenicNet `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > synNet.log 2>&1 & # real 2m9.196s time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 pelSin1) \ > rbest.log 2>&1 & # real 221m37.947s # and for the swap mkdir /hive/data/genomes/pelSin1/bed/blastz.mm10.swap cd /hive/data/genomes/pelSin1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \ > swap.log 2>&1 # real 16m3.703s cat fb.pelSin1.chainMm10Link.txt # 102485355 
bases of 2106639384 (4.865%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` pelSin1 mm10) \ > rbest.log 2>&1 # real 198m33.448s ######################################################################### # LASTZ mouse/mm10 Bonobo/panPan2 - (DONE - 2016-05-24 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 cd /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 printf '# mouse vs bonobo BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: bonobo panPan2 SEQ2_DIR=/hive/data/genomes/panPan2/panPan2.2bit SEQ2_LEN=/hive/data/genomes/panPan2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 360m9.534s cat fb.mm10.chainPanPan2Link.txt # 928638440 bases of 2652783500 (35.006%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 panPan2) \ > rbest.log 2>&1 & # real 765m26.648s # and for the swap: mkdir /hive/data/genomes/panPan2/bed/blastz.mm10.swap cd /hive/data/genomes/panPan2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 106m54.032s cat fb.panPan2.chainMm10Link.txt # 911279510 bases of 2725937399 (33.430%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` panPan2 mm10) \ > rbest.log 2>&1 # real 620m0.039s 
######################################################################### 2016-07-22: import of UCSC GENCODE group processing of GENCODE VM10 (markd) # will not be pushed to the RR. # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM10 pushd /hive/data/genomes/mm10/bed/gencodeVM10 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. Results are in gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM9.ra mouse/mm10/wgEncodeGencodeVM10.ra cp mouse/mm10/wgEncodeGencodeVM9.html mouse/mm10/wgEncodeGencodeVM10.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM10.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. [ONLY if it's going to be pushed] # update all.joiner and validate # look for the last section `begin Gencode V??' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM10 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # LASTZ mouse/mm10 Chimp/panTro5 - (DONE - 2016-08-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 cd /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 printf '# mouse vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: chimp panTro5 SEQ2_DIR=/hive/data/genomes/panTro5/panTro5.2bit SEQ2_LEN=/hive/data/genomes/panTro5/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 273m27.335s cat fb.mm10.chainPanTro5Link.txt # 935711523 bases of 2652783500 (35.273%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 panTro5) \ > rbest.log 2>&1 & # real 624m28.225s # and for the swap: mkdir /hive/data/genomes/panTro5/bed/blastz.mm10.swap cd /hive/data/genomes/panTro5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 98m32.623s cat fb.panTro5.chainMm10Link.txt # 965636631 bases of 3132620660 (30.825%) in intersection time (doRecipBest.pl 
-workhorse=hgwdev -buildDir=`pwd` panTro5 mm10) \ > rbest.log 2>&1 # real 560m21.432s ######################################################################### # Crispr track. See ../crisprTrack/README.txt (2016-09-15 max) # Command: doCrispr.sh mm10 ensGene ############################################################################## ######################################################################### 2016-10-27: import of UCSC GENCODE group processing of GENCODE VM11 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM11 cd /hive/data/genomes/mm10/bed/gencodeVM11 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM8.ra mouse/mm10/wgEncodeGencodeVM11.ra cp mouse/mm10/wgEncodeGencodeVM8.html mouse/mm10/wgEncodeGencodeVM11.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM11.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM11 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################## 2016-12-08: import of UCSC GENCODE group processing of GENCODE VM12 (markd) Not being pushed to RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM12 cd /hive/data/genomes/mm10/bed/gencodeVM12 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv- check to see if sizes make sense ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM11.ra mouse/mm10/wgEncodeGencodeVM12.ra cp mouse/mm10/wgEncodeGencodeVM11.html mouse/mm10/wgEncodeGencodeVM12.html # edit these plus mouse/mm10/trackDb.ra # - set priorities in wgEncodeGencodeVM12.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # DID NOT UPDATE all.joiner SINCE NOT BEING PUSHED PUBLIC # update all.joiner and validate # look for the last section `begin Gencode V??' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM12 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################################ # Mouse strains VCF (DONE - 2016-11-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/strainsVCF cd /hive/data/genomes/mm10/bed/strainsVCF # download files: wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.snps_all.dbSNP142.vcf.gz # change to UCSC chrom names: zcat mgp.v5.merged.snps_all.dbSNP142.vcf.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \ > ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf # need to fixup the chrom names in the header, extract the header: grep "^#" ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf > original.header.txt # copy that and edit it to fixup the names: cp original.header.txt ucscNames.header.txt # extract the lines not in the header grep -v "^#" ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf > ucscNames.notHeader.txt # put it back together: cat ucscNames.header.txt ucscNames.notHeader.txt > ucsc.mgpV5MergedSNPsAlldbSNP142.vcf # tabix gzip (about 2 hours) export name="ucsc.mgpV5MergedSNPsAlldbSNP142.vcf" /cluster/bin/tabix-0.2.6/bgzip $name /cluster/bin/tabix-0.2.6/tabix -p vcf $name.gz # symlink to gbdb mkdir /gbdb/mm10/mouseStrains ln -s `pwd`/ucsc.mgpV5MergedSNPsAlldbSNP142.vcf.gz \ /gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz ln -s `pwd`/ucsc.mgpV5MergedSNPsAlldbSNP142.vcf.gz.tbi \ 
/gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz.tbi hgBbiDbLink mm10 strainSNPs /gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz # trackDb entry in trackDb/mouse/mm10/trackDb.ra: track strainSNPs shortLabel Mouse SNPs longLabel Annotated SNPs from mouse strain comparison analysis group varRep type vcfTabix visibility hide hapClusterHeight 78 ############################################################################# # lastz turkey melGal5 (DONE - 2017-01-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelGal5 mkdir /hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 cd /hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 printf '# Mouse vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turkey melGal5 SEQ2_DIR=/hive/data/genomes/melGal5/melGal5.2bit SEQ2_LEN=/hive/data/genomes/melGal5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 160m46.030s cat fb.mm10.chainMelGal5Link.txt # 94675126 bases of 2652783500 (3.569%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 melGal5) \ > rbest.log 2>&1 & # real 379m35.317s # and for the swap mkdir /hive/data/genomes/melGal5/bed/blastz.mm10.swap cd /hive/data/genomes/melGal5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ 
/hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 31m37.466s cat fb.melGal5.chainMm10Link.txt # 81470789 bases of 1093044709 (7.454%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` melGal5 mm10) \ > rbest.log 2>&1 # real 356m16.099s ############################################################################# # LASTZ mouse/mm10 Pig-tailed macaque/macNem1 - (DONE - 2017-02-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 cd /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 printf '# mouse vs Pig-tailed macaque BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Pig-tailed macaque macNem1 SEQ2_DIR=/hive/data/genomes/macNem1/macNem1.2bit SEQ2_LEN=/hive/data/genomes/macNem1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 370m19.213s cat fb.mm10.chainMacNem1Link.txt # 918083212 bases of 2652783500 (34.608%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 macNem1) \ > rbest.log 2>&1 & # real 344m11.369s # and for the swap: mkdir /hive/data/genomes/macNem1/bed/blastz.mm10.swap cd /hive/data/genomes/macNem1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 65m14.074s cat fb.macNem1.chainMm10Link.txt # 905682728 bases of 2838503083 (31.907%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` macNem1 mm10) \ > rbest.log 2>&1 # real 321m2.285s ############################################################################# # LASTZ mouse/mm10 Angolan colobus/colAng1 - (DONE - 2017-02-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 cd /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 printf '# mouse vs Angolan colobus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Angolan colobus colAng1 SEQ2_DIR=/hive/data/genomes/colAng1/colAng1.2bit SEQ2_LEN=/hive/data/genomes/colAng1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 376m8.949s cat fb.mm10.chainColAng1Link.txt # 902325064 bases of 2652783500 (34.014%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 colAng1) \ > rbest.log 2>&1 & # real 343m38.692s # and for the swap: mkdir /hive/data/genomes/colAng1/bed/blastz.mm10.swap cd /hive/data/genomes/colAng1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 62m44.125s cat fb.colAng1.chainMm10Link.txt # 885418780 
bases of 2679973137 (33.038%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` colAng1 mm10) \ > rbest.log 2>&1 # real 296m19.689s ############################################################################# # LASTZ mouse/mm10 Gray mouse lemur/micMur3 - (DONE - 2017-03-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 cd /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 printf '# mouse vs Gray mouse lemur BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=4 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Gray mouse lemur micMur3 SEQ2_DIR=/hive/data/genomes/micMur3/micMur3.2bit SEQ2_LEN=/hive/data/genomes/micMur3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 2192m13.661s cat fb.mm10.chainMicMur3Link.txt # 907817373 bases of 2652783500 (34.221%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 micMur3) \ > rbest.log 2>&1 & # real 522m5.587s # and for the swap: mkdir /hive/data/genomes/micMur3/bed/blastz.mm10.swap cd /hive/data/genomes/micMur3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 71m4.702s cat fb.micMur3.chainMm10Link.txt # 905011854 bases of 2386321975 (37.925%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` micMur3 mm10) \ > rbest.log 
2>&1 # real 508m58.716s ############################################################################# # LASTZ mouse/mm10 Chinese tree shrew/tupChi1 - (DONE - 2017-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 cd /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 printf '# mouse vs Chinese tree shrew BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=4 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Chinese tree shrew tupChi1 SEQ2_DIR=/hive/data/genomes/tupChi1/tupChi1.2bit SEQ2_LEN=/hive/data/genomes/tupChi1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 224m24.608s cat fb.mm10.chainTupChi1Link.txt # 683463709 bases of 2652783500 (25.764%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 tupChi1) \ > rbest.log 2>&1 & # real 385m2.239s # and for the swap: mkdir /hive/data/genomes/tupChi1/bed/blastz.mm10.swap cd /hive/data/genomes/tupChi1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 71m4.702s cat fb.tupChi1.chainMm10Link.txt # 708757944 bases of 2706389135 (26.188%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` tupChi1 mm10) \ > rbest.log 2>&1 # real 508m10.564s ############################################################################# # LASTZ mouse/mm10 
Chinese pangolin/manPen1 - (DONE - 2017-03-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 cd /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 printf '# Mouse vs. Chinese pangolin BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Chinese pangolin manPen1 SEQ2_DIR=/hive/data/genomes/manPen1/manPen1.2bit SEQ2_LEN=/hive/data/genomes/manPen1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=180 BASE=/hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 404m9.925s cat fb.mm10.chainManPen1Link.txt # 724400544 bases of 2652783500 (27.307%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 manPen1) \ > rbest.log 2>&1 & # real 499m21.668s # and for the swap: mkdir /hive/data/genomes/manPen1/bed/blastz.mm10.swap cd /hive/data/genomes/manPen1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -swap \ /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > swap.log 2>&1 # real 71m4.702s cat fb.manPen1.chainMm10Link.txt # 710179682 bases of 1999066070 (35.526%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` manPen1 mm10) \ > rbest.log 2>&1 # real 495m7.361s ############################################################################# # LASTZ mouse/mm10 vs. Golden eagle/aquChr2 - (DONE - 2017-03-16 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 cd /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 printf "# Mouse vs. 
Golden eagle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Golden eagle aquChr2 SEQ2_DIR=/hive/data/genomes/aquChr2/aquChr2.2bit SEQ2_LEN=/hive/data/genomes/aquChr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 217m29.467s cat fb.mm10.chainAquChr2Link.txt # 105013175 bases of 2652783500 (3.959%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 aquChr2) \ > rbest.log 2>&1 & # real 196m24.435s # and for the swap: mkdir /hive/data/genomes/aquChr2/bed/blastz.mm10.swap cd /hive/data/genomes/aquChr2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 9m16.569s cat fb.aquChr2.chainMm10Link.txt # 89023131 bases of 1180019022 (7.544%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` aquChr2 mm10) \ > rbest.log 2>&1 # real 132m43.886s ######################################################################### # LASTZ bison bisBis1 (DONE - 2017-03-17 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 cd /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 printf '# Mouse vs. 
Bison BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: bison bisBis1 SEQ2_DIR=/hive/data/genomes/bisBis1/bisBis1.2bit SEQ2_LEN=/hive/data/genomes/bisBis1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=900 BASE=/hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 576m23.128s cat fb.mm10.chainBisBis1Link.txt # 688337604 bases of 2652783500 (25.948%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 bisBis1) \ > rbest.log 2>&1 & # real 430m48.078s # and the swap mkdir /hive/data/genomes/bisBis1/bed/blastz.mm10.swap cd /hive/data/genomes/bisBis1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 169m28.369s cat fb.bisBis1.chainMm10Link.txt # 682104798 bases of 2757854331 (24.733%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` bisBis1 mm10) \ > rbest.log 2>&1 # real 445m5.636s ############################################################################ # lastz frog xenTro9 (DONE - 2017-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenTro9 mkdir /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 cd /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 printf '# Mouse vs. 
frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: frog xenTro9 SEQ2_DIR=/hive/data/genomes/xenTro9/xenTro9.2bit SEQ2_LEN=/hive/data/genomes/xenTro9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) \ > do.log 2>&1 & # real 806m23.459s cat fb.mm10.chainXenTro9Link.txt # 87053836 bases of 2652783500 (3.282%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 xenTro9) \ > rbest.log 2>&1 & # real 617m41.376s # and for the swap mkdir /hive/data/genomes/xenTro9/bed/blastz.mm10.swap cd /hive/data/genomes/xenTro9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \ > swap.log 2>&1 & # real 25m54.516s cat fb.xenTro9.chainMm10Link.txt # 90150612 bases of 1369865365 (6.581%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenTro9 mm10) \ > rbest.log 2>&1 & # real 597m52.740s ######################################################################### # lastz frog xenLae2 (DONE - 2017-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenLae2 mkdir /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 cd /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 printf '# Mouse vs. 
frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: frog xenLae2 SEQ2_DIR=/hive/data/genomes/xenLae2/xenLae2.2bit SEQ2_LEN=/hive/data/genomes/xenLae2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) \ > do.log 2>&1 & # real 1044m10.115s cat fb.mm10.chainXenLae2Link.txt # 82272699 bases of 2652783500 (3.101%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 xenLae2) \ > rbest.log 2>&1 & # real 656m46.337s # and for the swap mkdir /hive/data/genomes/xenLae2/bed/blastz.mm10.swap cd /hive/data/genomes/xenLae2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 26m14.884s time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29/DEF \ -continue=syntenicNet -workhorse=hgwdev -smallClusterHub=ku \ -bigClusterHub=ku -syntenicNet -swap -chainMinScore=5000 \ -chainLinearGap=loose) > syntenicNet.log 2>&1 & # real 1m52.642s cat fb.xenLae2.chainMm10Link.txt # 116001603 bases of 2408724787 (4.816%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenLae2 mm10) \ > rbest.log 2>&1 & # real 746m4.542s ######################################################################### # lastz turtle chrPic2 (DONE - 2017-04-05 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S 
mm10ChrPic2 mkdir /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 cd /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 printf '# Mouse vs. turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turtle chrPic2 SEQ2_DIR=/hive/data/genomes/chrPic2/chrPic2.2bit SEQ2_LEN=/hive/data/genomes/chrPic2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 865m16.816s # ku difficulties due to /dev/shm/ being full, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=cat -chainMinScore=5000 -chainLinearGap=loose) > cat.log 2>&1 & # real 13m13.959s # one big chain causing trouble, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose) > chainMerge.log 2>&1 & # real 11m47.232s cat fb.mm10.chainChrPic2Link.txt # 112560591 bases of 2652783500 (4.243%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 chrPic2) \ > rbest.log 2>&1 & # real 114m27.445s # and for the swap mkdir /hive/data/genomes/chrPic2/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 12m2.676s cat 
fb.chrPic2.chainMm10Link.txt # 106063993 bases of 2173204089 (4.881%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` chrPic2 mm10) \ > rbest.log 2>&1 & # real 110m9.546s ######################################################################### 2017-04-16: import of UCSC GENCODE group processing of GENCODE VM13 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM13 pushd /hive/data/genomes/mm10/bed/gencodeVM13 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M13 88 'March 2017' # edit mouse/mm10/trackDb.ra to add new .ra file include make DBS=mm10 # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. 
[ONLY if it's going to be pushed] # edit all.joiner to add ~/tmp/gencodeVM13.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM13 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all ############################################################################## # LASTZ Chinese hamster ovary cell line CHO-K1 criGriChoV1 # (DONE - 2017-04-13 - Hiram) # establish a screen to control this job screen -S mm10criGriChoV1 mkdir /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 cd /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 printf '# Chinese hamster ovary cell line vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chinese hamster ovary cell line CHO-K1 criGriChoV1 SEQ2_DIR=/hive/data/genomes/criGriChoV1/criGriChoV1.2bit SEQ2_LEN=/hive/data/genomes/criGriChoV1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=250 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -noDbNameCheck -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 575m28.254s cat fb.mm10.chainCriGriChoV1Link.txt # 1553371182 bases of 2652783500 (58.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 criGriChoV1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 732m16.081s mkdir /hive/data/genomes/criGriChoV1/bed/blastz.mm10.swap cd /hive/data/genomes/criGriChoV1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 157m21.977s cat 
fb.criGriChoV1.chainMm10Link.txt # 1513594461 bases of 2318132242 (65.294%) in intersection time (doRecipBest.pl -workhorse=hgwdev criGriChoV1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 769m8.998s ############################################################################## ## 4-Way Multiz (DONE - 2017-04-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way # from the 213-way in the source tree, select out the 4 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,galVar1,mm10,tupChi1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/213way.nh \ > mm10.4way.nh cat mm10.4way.nh # ((hg38:0.143908,(tupChi1:0.120000,galVar1:0.080000):0.054937):0.002000, mm10:0.356483); use strict; use warnings;

open (FH, "<4way.distances.txt") or die "can not read 4way.distances.txt";
my $count = 0;
while (my $line = <FH>) {
    chomp $line;
    my ($D, $dist) = split('"'"'\\s+'"'"', $line);
    my $chain = "chain" . ucfirst($D);
    my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." . $chain . use warnings; open (FH, "<4way.distances.txt") or die "can not read 4way.distances.txt"; my $count = 0; while (my $line = <FH>) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." . $chain . "Link.txt"; my $chainLinkMeasure = `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\\%%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm10/fb.${D}.chainMm10Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\\%%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='"'"'$D'"'"';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %%02d %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); ' > sizeStats.pl chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on hg38 on other other species # 01 0.4934 (% 35.618) (% 35.972) - Malayan flying lemur galVar1 # 02 0.5024 (% 35.372) (% 31.653) - Human hg38 # 03 0.5334 (% 25.764) (% 26.188) - Chinese tree shrew tupChi1 # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. is it the only one ? Need to split of the maf file into individual
# maf files # << happy emacs this looks like:
# ((hg38:0.145908,(mm10:0.084509,rn6:0.091589):0.271974):0.020593,canFam3:0.165928); mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 26777, ncbiRefSeq: 107894, refGene: 36869, xenoRefGene: 179145, Mrnas: 5367574
# hg38: ensGene: 208239, knownGene: 197782, mgcGenes: 35305, ncbiRefSeq: 159322, refGene: 69527, xenoRefGene: 184852, Mrnas: 11481766
# tupChi1: refGene: 206, xenoRefGene: 343637, Mrnas: 50709
# galVar1: ncbiRefSeq: 41547, xenoRefGene: 499145, Mrnas: 0 tupChi1: checked: 30481 failed: 0 galVar1: checked: 23389 failed: 0 93916 knownGene.gp the annotated maf is: the sed of the gp file inserts the reference species in the chr name do not have the usual problem with fast jobs here, only 52 of them total 21 147 1081 printing out the 'new', the 'old' the 'difference' and percent difference 52 maf.list This is setup for multiple runs based on subsets, but only running
# the 'all' subset here.
# It triggers off of the current working directory
# $cwd:t which is the "grp" in this script. Running:
# all and vertebrates Create list of chunks real 0m12.570s time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m7.235s # -rw-rw-r-- 1 28670932 Apr 21 00:01 tmpMostConserved.bed # -rw-rw-r-- 1 29438194 Apr 21 00:02 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/all time hgLoadBed mm10 tupChi1PhastConsElements4way mostConserved.bed # Read 841312 elements of size 5 from mostConserved.bed # real 0m7.635s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits mm10 -enrichment knownGene:cds tupChi1PhastConsElements4way # knownGene:cds 1.333%, tupChi1PhastConsElements4way 4.368%, both 0.924%, # cover 69.30%, enrich 15.86x # real 0m8.883s # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons4way.wigFix.gz) # real 13m32.808s # -rw-rw-r-- 1 1452731444 Apr 21 00:18 phastCons4way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons4way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/mm10/chrom.sizes \ phastCons4way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=19728: VmPeak: 12564976 kB # real 17m36.198s bigWigInfo phastCons4way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,285,833,964 # primaryIndexSize: 63,248,068 # zoomLevels: 10 # chromCount: 37 # basesCovered: 1,155,614,560 # mean: 0.166872 # min: 0.000000 # max: 1.000000 # std: 
0.286694 # encode those files into wiggle data time (zcat downloads/phastCons4way.wigFix.gz \ | wigEncode stdin phastCons4way.wig phastCons4way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 phyloP for 4-way (DONE - 2017-04-20 - Hiram) Create template file
# file1 == $chr/$chunk/file name without .ss suffix Average job time: 1819s 30.31m 0.51h 0.02d Converted stdin, upper limit 0.72, lower limit -2.31 Q1 0.000068 appears to have an odd hole in the data just past X=0 ? real 0m0.772s echo "date" >> $gp.jobs echo "wait" >> $gp.jobs chmod +x knownGene.jobs time (./$gp.jobs) > $gp.jobs.log 2>&1 & # real 11m18.851s export mz=multiz4way export gp=knownGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m8.492s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 0m39.199s # -rw-rw-r-- 1 33908467 Apr 21 18:49 knownGene.multiz4way.exonAA.fa.gz # -rw-rw-r-- 1 55392688 Apr 21 18:49 knownGene.multiz4way.exonNuc.fa.gz export mz=multiz4way export gp=knownGene export db=mm10 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # construct download files for 4-way (DONE - 2017-04-21 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz4way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons4way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP4way mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads mkdir multiz4way phastCons4way phyloP4way cd multiz4way time cp -p ../../anno/mm10.4way.maf .
# real 0m15.285s # -rw-rw-r-- 1 7580362629 Apr 20 22:27 mm10.4way.maf du -hsc * # 7.1G mm10.4way.maf time gzip *.maf # real 27m2.122s # -rw-rw-r-- 1 2040574809 Apr 20 22:27 mm10.4way.maf.gz du -hsc *.maf.gz # 2.0G mm10.4way.maf.gz ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/tupChi1Multiz4way # bash script #!/bin/sh export geneTbl="knownGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm10 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm10 tupChi1Multiz4way \ stdin stdout \ -orgs=/hive/data/genomes/mm10/bed/tupChi1Multiz4way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 12m55.050s md5sum *.maf.gz *.nh upstream*.gz README.txt >> md5sum.txt # some other symlinks were already made above # obtain the README.txt from tupChi1/multiz4way and update for this # situation ln -s `pwd`/upstream*.gz `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/tupChi1Multiz4way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh mm10.4way.nh \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh mm10.4way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m35.144s ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz4way du -hsc *.maf.gz ../../anno/mm10.4way.maf # 3.0G mm10.4way.maf.gz # 13G ../../anno/mm10.4way.maf # obtain the README.txt from tupChi1/multiz4way and update for this # situation 
##################################################################### cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/phastCons4way ln -s ../../cons/all/downloads/phastCons4way.wigFix.gz \ ./mm10.phastCons4way.wigFix.gz ln -s ../../cons/all/phastCons4way.bw ./mm10.phastCons4way.bw ln -s ../../cons/all/all.mod ./mm10.phastCons4way.mod time md5sum *.gz *.mod *.bw > md5sum.txt # real 0m20.354s # obtain the README.txt from tupChi1/phastCons4way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons4way ##################################################################### cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/phyloP4way ln -s ../../consPhyloP/all/downloads/phyloP4way.wigFix.gz \ ./mm10.phyloP4way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod mm10.phyloP4way.mod ln -s ../../consPhyloP/all/phyloP4way.bw mm10.phyloP4way.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m12.264s # obtain the README.txt from tupChi1/phyloP4way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP4way ############################################################################# # wiki page for 4-way (DONE - 2017-04-21 - Hiram) mkdir /hive/users/hiram/bigWays/mm10.4way cd /hive/users/hiram/bigWays echo "mm10" > mm10.4way/ordered.list awk '{print $1}' /hive/data/genomes/mm10/bed/tupChi1Multiz4way/4way.distances.txt \ >> mm10.4way/ordered.list # sizeStats.sh catches up the cached measurements required for data
# in the tables. They are usually already mostly done, only new
# assemblies will have updates. when you view the first one you enter, it will have links to the
# missing two. The fix is to eliminate
# the rest of these unprocessedRoots from PSL loaded file. hits 107479 of 107479 (100.000%) ok [ONLY if it's going to be pushed] # edit all.joiner to add ~/tmp/gencodeVM14.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM14 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all ############################################################################## # LASTZ zebrafish danRer11 (DONE - 2017-06-12 - Chris) # establish a screen to control this job screen -S mm10danRer11 mkdir /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 cd /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 printf '# mouse vs. zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: zebrafish danRer11 SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 289m42.628s cat fb.mm10.chainDanRer11Link.txt # 36448414 bases of 2652783500 (1.374%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 danRer11 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & mkdir /hive/data/genomes/danRer11/bed/blastz.mm10.swap cd /hive/data/genomes/danRer11/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 172m50.552s cat fb.danRer11.chainMm10Link.txt # 45558857 bases of 1674677181 (2.720%) in 
intersection 1589449878 bases of 2301325917 (69.067%) in intersection time (doRecipBest.pl -workhorse=hgwdev danRer11 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 846m34.982s ############################################################################## # LASTZ Killer whale orcOrc1 (DONE - 2017-06-15 - Hiram) # establish a screen to control this job screen -S mm10orcOrc1 mkdir /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 cd /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 printf '# killer whale vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Killer whale orcOrc1 SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 192m26.791s cat fb.mm10.chainOrcOrc1Link.txt # 832909116 bases of 2652783500 (31.398%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 orcOrc1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 276m44.875s mkdir /hive/data/genomes/orcOrc1/bed/blastz.mm10.swap cd /hive/data/genomes/orcOrc1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 72m53.064s cat fb.orcOrc1.chainMm10Link.txt # 809350350 bases of 2249582125 (35.978%) in intersection time (doRecipBest.pl -workhorse=hgwdev orcOrc1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 # real 214m50.810s 
############################################################################## # LASTZ Baboon papAnu3 (DONE - 2017-06-21 - Hiram) # establish a screen to control this job screen -S mm10papAnu3 mkdir /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 cd /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 printf '# mouse vs. baboon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: baboon papAnu3 SEQ2_DIR=/hive/data/genomes/papAnu3/papAnu3.2bit SEQ2_LEN=/hive/data/genomes/papAnu3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=180 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 474m39.013s cat fb.mm10.chainPapAnu3Link.txt # 910628118 bases of 2652783500 (34.327%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 papAnu3 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 644m20.659s mkdir /hive/data/genomes/papAnu3/bed/blastz.mm10.swap cd /hive/data/genomes/papAnu3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 66m35.501s cat fb.papAnu3.chainMm10Link.txt # 897929517 bases of 2893270787 (31.035%) in intersection time (doRecipBest.pl -workhorse=hgwdev papAnu3 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 578m46.893s ############################################################################## # LASTZ pig susScr11 (DONE - 2017-07-31 - Hiram) # establish a screen to control this job screen -S mm10susScr11 mkdir 
/hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 cd /hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 printf '# mouse vs. pig BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: baboon susScr11 SEQ2_DIR=/hive/data/genomes/susScr11/susScr11.2bit SEQ2_LEN=/hive/data/genomes/susScr11/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=1 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 567m0.166s cat fb.mm10.chainSusScr11Link.txt # 731012356 bases of 2652783500 (27.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 susScr11 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 455m39.565s mkdir /hive/data/genomes/susScr11/bed/blastz.mm10.swap cd /hive/data/genomes/susScr11/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 61m6.153s cat fb.susScr11.chainMm10Link.txt # 715277290 bases of 2472073034 (28.934%) in intersection time (doRecipBest.pl -workhorse=hgwdev susScr11 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 358m15.340s ############################################################################## # lastz nile tilapia oreNil3 (DONE - 2017-07-31 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil3 mkdir /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 cd /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 printf '# Mouse vs. 
nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: nile tilapia oreNil3 SEQ2_DIR=/hive/data/genomes/oreNil3/oreNil3.2bit SEQ2_LEN=/hive/data/genomes/oreNil3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 TMPDIR=/scratch/tmp ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 307m32.926s cat fb.mm10.chainOreNil3Link.txt # 54152663 bases of 2652783500 (2.041%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 oreNil3 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 243m27.139s # and for the swap mkdir /hive/data/genomes/oreNil3/bed/blastz.mm10.swap cd /hive/data/genomes/oreNil3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 8m5.590s cat fb.oreNil3.chainMm10Link.txt # 55291586 bases of 1009856516 (5.475%) in intersection time (doRecipBest.pl -workhorse=hgwdev oreNil3 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 230m56.580s ######################################################################### # crispr 10K track (DONE - Hiram - 2017-07-28) # this script was developed during this procedure, thus, the step-wise # procedures: mkdir /hive/data/genomes/mm10/bed/crispr.10K cd 
/hive/data/genomes/mm10/bed/crispr.10K time (~/kent/src/hg/utils/automation/doCrispr.pl \ -stop=guides -buildDir=`pwd` mm10 ensGene) > guides.log 2>&1 # real 78m39.898s # Completed: 99 of 99 jobs # CPU time in finished jobs: 12182s 203.04m 3.38h 0.14d 0.000 y # IO & Wait Time: 1076s 17.93m 0.30h 0.01d 0.000 y # Average job time: 134s 2.23m 0.04h 0.00d # Longest finished job: 181s 3.02m 0.05h 0.00d # Submission to last job: 4567s 76.12m 1.27h 0.05d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScores \ -stop=specScores -buildDir=`pwd` mm10 ensGene # Completed: 945820 of 1558824 jobs # CPU time in finished jobs: 352722192s 5878703.20m 97978.39h 4082.43d 11.185 y # IO & Wait Time: 1367298315s 22788305.25m 379805.09h 15825.21d 43.357 y # Average job time: 1819s 30.31m 0.51h 0.02d # Longest finished job: 8656s 144.27m 2.40h 0.10d # Submission to last job: 2172942s 36215.70m 603.60h 25.15d # after ku reboot, finishing: # Completed: 613973 of 613973 jobs # CPU time in finished jobs: 155165030s 2586083.83m 43101.40h 1795.89d 4.920 y # IO & Wait Time: 584008656s 9733477.60m 162224.63h 6759.36d 18.519 y # Average job time: 1204s 20.07m 0.33h 0.01d # Longest finished job: 8978s 149.63m 2.49h 0.10d # Submission to last job: 1137188s 18953.13m 315.89h 13.16d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=effScores \ -stop=effScores -buildDir=`pwd` mm10 ensGene # Completed: 13518 of 13518 jobs # CPU time in finished jobs: 6244711s 104078.52m 1734.64h 72.28d 0.198 y # IO & Wait Time: 32457s 540.95m 9.02h 0.38d 0.001 y # Average job time: 464s 7.74m 0.13h 0.01d # Longest finished job: 2373s 39.55m 0.66h 0.03d # Submission to last job: 15145s 252.42m 4.21h 0.18d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=offTargets \ -stop=offTargets -buildDir=`pwd` mm10 ensGene # Completed: 77942 of 77942 jobs # CPU time in finished jobs: 1397706s 23295.10m 388.25h 16.18d 0.044 y # IO & Wait Time: 313616s 5226.94m 87.12h 3.63d 0.010 y # Average job time: 22s 0.37m 0.01h 
0.00d # Longest finished job: 35s 0.58m 0.01h 0.00d # Submission to last job: 9239s 153.98m 2.57h 0.11d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=load \ -stop=load -buildDir=`pwd` mm10 ensGene # real 235m41.378s ########################################################################## # FIXUP broken files (working - Max and Hiram - 2018-04,05) # Max generated a new specScores.tab, add in the chrM specScores # and make a unique set in a new specScores.tab file cd /hive/data/genomes/mm10/bed/crispr.10K/uniqSpecScores printf "targetSeq\tmitSpecScore\tofftargetCount\ttargetGenomeGeneLocus\n" \ > max.withChrM.specScores.tab grep -h -v targetSeq ../specScores.max.tab ../addChrM/specScores.tab \ | $HOME/bin/x86_64/gnusort -S100G --parallel=32 -u \ >> max.withChrM.specScores.tab # real 1m39.468s # this new file is much larger than before: # -rw-rw-r-- 1 3616703851 Jul 31 2017 withChrM.specScores.tab # -rw-rw-r-- 1 5580638498 May 15 14:55 max.withChrM.specScores.tab # Now generate a new crispr.bed and crispr.bb file mkdir /hive/data/genomes/mm10/bed/crispr.10K/maxBed cd /hive/data/genomes/mm10/bed/crispr.10K/maxBed # setup new inputs: ln -s ../addChrM/withChrM.allGuides.bed withChrM.allGuides.bed ln -s ../uniqSpecScores/max.withChrM.specScores.tab max.withChrM.specScores.tab ln -s ../addChrM/withChrM.effScores.tab withChrM.effScores.tab ln -s ../addChrM/withChrM.offtargets.offsets.tab withChrM.offtargets.offsets.tab ln -s ../addChrM/offTargets ./offTargets time (/cluster/software/bin/python \ /hive/data/outside/crisprTrack/scripts/createBigBed.py mm10 \ withChrM.allGuides.bed max.withChrM.specScores.tab \ withChrM.effScores.tab withChrM.offtargets.offsets.tab) > newBed.log 2>&1 # real 232m5.379s # -rw-rw-r-- 1 27947769791 May 15 17:55 crispr.bed # -rw-rw-r-- 1 6911180170 May 15 18:42 crispr.bb ############################################################################## # LASTZ Gorilla gorGor5 (DONE - 2017-08-04 - Hiram) # establish a screen to control this job 
screen -S mm10gorGor5 mkdir /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 cd /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 printf '# mouse vs. gorilla BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: gorilla gorGor5 SEQ2_DIR=/hive/data/genomes/gorGor5/gorGor5.2bit SEQ2_LEN=/hive/data/genomes/gorGor5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=130 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 170m18.102s cat fb.mm10.chainGorGor5Link.txt # 934147601 bases of 2652783500 (35.214%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 gorGor5 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 327m34.879s mkdir /hive/data/genomes/gorGor5/bed/blastz.mm10.swap cd /hive/data/genomes/gorGor5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 72m34.088s cat fb.gorGor5.chainMm10Link.txt # 990002546 bases of 3080431298 (32.138%) in intersection time (doRecipBest.pl -workhorse=hgwdev gorGor5 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 297m3.002s ############################################################################## # refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie # previously done 2017-08-08 by Chris E mkdir /hive/data/genomes/mm10/bed/refSeqFuncElems.2017-11-29 cd /hive/data/genomes/mm10/bed/refSeqFuncElems.2017-11-29 # NOTE FOR NEXT TIME: instead of using interim GFF, in the future these 
annotations might be # folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by # doNcbiRefSeq.pl. wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Mus_musculus/GFF_interim/interim_GRCm38.p6_top_level_2017-09-26.gff3.gz # Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to mm10 chrom names hgsql mm10 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \ > refSeqToChrom.tab cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab # Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class) # to identify Functional Elements and swap in mm10 chrom names. # Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an # mm10 chrom. Use grep -f chrom.tab to filter out patch contig annotations. [ONLY if it's going to be pushed] 105516336 bases of 2652783500 (3.978%) in intersection 2017-09-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 cd /hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 printf '# Ma_s night monkey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Ma_s night monkey AotNan1 SEQ2_DIR=/hive/data/genomes/aotNan1/aotNan1.2bit SEQ2_LEN=/hive/data/genomes/aotNan1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10AotNan1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 400m13.309s cat fb.mm10.chainAotNan1Link.txt # 889500682 bases of 2652783500 (33.531%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 aotNan1) \ > rbest.log 2>&1 & # real 352m12.077s 
mkdir /hive/data/genomes/aotNan1/bed/blastz.mm10.swap cd /hive/data/genomes/aotNan1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 68m48.755s cat fb.aotNan1.chainMm10Link.txt # 893851318 bases of 2714439490 (32.929%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` aotNan1 mm10) \ > rbest.log 2>&1 # real 383m10.761s ############################################################################## # LASTZ Hawaiian monk seal neoSch1 (DONE - 2017-09-25 - Hiram) # establish a screen to control this job screen -S mm10neoSch1 mkdir /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 cd /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 printf '# mouse vs. Hawaiian monk seal BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Hawaiian monk seal neoSch1 SEQ2_DIR=/hive/data/genomes/neoSch1/neoSch1.2bit SEQ2_LEN=/hive/data/genomes/neoSch1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 324m0.457s cat fb.mm10.chainNeoSch1Link.txt # 827926012 bases of 2652783500 (31.210%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 neoSch1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 307m18.396s cat fb.mm10.chainRBestNeoSch1Link.txt # 788489846 bases of 2652783500 (29.723%) in intersection mkdir /hive/data/genomes/neoSch1/bed/blastz.mm10.swap cd /hive/data/genomes/neoSch1/bed/blastz.mm10.swap 
time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 59m27.809s cat fb.neoSch1.chainMm10Link.txt # 804021579 bases of 2400839308 (33.489%) in intersection cat fb.neoSch1.chainSynMm10Link.txt # 776155245 bases of 2400839308 (32.328%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev neoSch1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 242m31.157s cat fb.neoSch1.chainRBestMm10Link.txt # 787537751 bases of 2400839308 (32.803%) in intersection ############################################################################## # LASTZ Sooty mangabey cerAty1 (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 cd /hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 printf '# Sooty mangabey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Sooty mangabey CerAty1 SEQ2_DIR=/hive/data/genomes/cerAty1/cerAty1.2bit SEQ2_LEN=/hive/data/genomes/cerAty1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CerAty1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 371m15.075s cat fb.mm10.chainCerAty1Link.txt # 917680202 bases of 2652783500 (34.593%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cerAty1) \ > rbest.log 2>&1 & # real 345m49.786s mkdir /hive/data/genomes/cerAty1/bed/blastz.mm10.swap cd /hive/data/genomes/cerAty1/bed/blastz.mm10.swap time (doBlastzChainNet.pl 
-verbose=2 \ /hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 68m6.225s cat fb.cerAty1.chainMm10Link.txt # 903892923 bases of 2787289397 (32.429%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cerAty1 mm10) \ > rbest.log 2>&1 # real 305m14.804s ############################################################################## # LASTZ Coquerel's sifaka to mouse/Mm10 (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 cd /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 printf '# Coquerel_s sifaka vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: proCoq1 - Coquerel_s sifaka - Propithecus coquereli SEQ2_DIR=/hive/data/genomes/proCoq1/proCoq1.2bit SEQ2_LEN=/hive/data/genomes/proCoq1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10ProCoq1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 294m43.931s cat fb.mm10.chainProCoq1Link.txt # 882327683 bases of 2652783500 (33.260%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 proCoq1) \ > rbest.log 2>&1 & # real 411m5.774s mkdir /hive/data/genomes/proCoq1/bed/blastz.mm10.swap cd /hive/data/genomes/proCoq1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 
2>&1 # real 62m48.333s cat fb.proCoq1.chainMm10Link.txt # 863635783 bases of 2083764538 (41.446%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` proCoq1 mm10) \ > rbest.log 2>&1 # real 357m54.198s ############################################################################## # LASTZ White-faced sapajou to mouse/Mm10 (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 cd /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 printf '# White-faced sapajou vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cebCap1 - White-faced sapajou - Cebus capucinus imitator SEQ2_DIR=/hive/data/genomes/cebCap1/cebCap1.2bit SEQ2_LEN=/hive/data/genomes/cebCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CebCap1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 293m40.906s cat fb.mm10.chainCebCap1Link.txt # 882776669 bases of 2652783500 (33.277%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cebCap1) \ > rbest.log 2>&1 & # real 334m0.458s mkdir /hive/data/genomes/cebCap1/bed/blastz.mm10.swap cd /hive/data/genomes/cebCap1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 63m12.596s cat fb.cebCap1.chainMm10Link.txt # 871126707 bases of 2610518382 (33.370%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cebCap1 mm10) \ > rbest.log 
2>&1 # real 299m3.923s ############################################################################## # LASTZ White-faced sapajou/cebCap1 vs. mouse/Mm10 (DONE - 2017-10-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 cd /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 printf '# White-faced sapajou vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cebCap1 - White-faced sapajou - Cebus capucinus imitator SEQ2_DIR=/hive/data/genomes/cebCap1/cebCap1.2bit SEQ2_LEN=/hive/data/genomes/cebCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=18 BASE=/hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CebCap1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 206m12.413s cat fb.mm10.chainCebCap1Link.txt # 882776669 bases of 2652783500 (33.277%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cebCap1) \ > rbest.log 2>&1 & # real 331m49.541s mkdir /hive/data/genomes/cebCap1/bed/blastz.mm10.swap cd /hive/data/genomes/cebCap1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 63m12.596s cat fb.cebCap1.chainMm10Link.txt # 871126707 bases of 2610518382 (33.370%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cebCap1 mm10) \ > rbest.log 2>&1 # real 299m3.923s ############################################################################## # LASTZ Sclater's lemur mouse/Mm10 (DONE - 2017-10-04 - Hiram) mkdir
/hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 cd /hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 printf '# Sclater_s lemur vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: eulFla1 - Sclater_s lemur - Eulemur flavifrons SEQ2_DIR=/hive/data/genomes/eulFla1/eulFla1.2bit SEQ2_LEN=/hive/data/genomes/eulFla1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=18 BASE=/hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10EulFla1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 144m17.701s cat fb.mm10.chainEulFla1Link.txt # 916687191 bases of 2652783500 (34.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 eulFla1) \ > rbest.log 2>&1 & # real 330m53.327s mkdir /hive/data/genomes/eulFla1/bed/blastz.mm10.swap cd /hive/data/genomes/eulFla1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 65m26.113s cat fb.eulFla1.chainMm10Link.txt # 887070088 bases of 2094103399 (42.360%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` eulFla1 mm10) \ > rbest.log 2>&1 # real 270m35.579s ############################################################################## # LASTZ Black lemur mouse/Mm10 (DONE - 2017-10-05 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 cd /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 printf '# Black lemur vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: eulMac1 - Black lemur - Eulemur macaco SEQ2_DIR=/hive/data/genomes/eulMac1/eulMac1.2bit SEQ2_LEN=/hive/data/genomes/eulMac1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10EulMac1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 167m31.736s cat fb.mm10.chainEulMac1Link.txt # 925968814 bases of 2652783500 (34.906%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 eulMac1) \ > rbest.log 2>&1 & # real 334m49.287s mkdir /hive/data/genomes/eulMac1/bed/blastz.mm10.swap cd /hive/data/genomes/eulMac1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 64m52.738s cat fb.eulMac1.chainMm10Link.txt # 895308387 bases of 2101039320 (42.613%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` eulMac1 mm10) \ > rbest.log 2>&1 # real 267m17.552s ############################################################################## 2017-12-17: import of UCSC GENCODE group processing of GENCODE VM16 (markd) # being pushed to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM16 pushd /hive/data/genomes/mm10/bed/gencodeVM16 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check
to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M16 91 'Dec 2017' # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. [ONLY if it's going to be pushed] # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM16.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM16 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. ############################################################################## # LASTZ Damara mole rat vs. mouse/Mm10 (DONE - 2018-01-01 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 cd /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 printf '# Damara mole rat vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Damara mole rat SEQ2_DIR=/hive/data/genomes/fukDam1/fukDam1.2bit SEQ2_LEN=/hive/data/genomes/fukDam1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 403m29.477s cat fb.mm10.chainFukDam1Link.txt # 803448015 bases of 2652783500 (30.287%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 fukDam1) \ > rbest.log 2>&1 & # real 391m52.435s cat fb.mm10.chainRBestFukDam1Link.txt # 760138280 bases of 2652783500 (28.654%) in intersection mkdir /hive/data/genomes/fukDam1/bed/blastz.mm10.swap cd /hive/data/genomes/fukDam1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 79m46.564s cat fb.fukDam1.chainMm10Link.txt # 803988546 bases of 2285984782 (35.170%) in intersection cat fb.fukDam1.chainSynMm10Link.txt # 741604346 bases of 2285984782 (32.441%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` fukDam1 mm10) \ > rbest.log 2>&1 # real 417m52.847s cat fb.fukDam1.chainRBestMm10Link.txt # 760190877 bases of 2285984782 (33.254%) in intersection ############################################################################## # LASTZ Kangaroo rat vs. 
mouse/Mm10 (DONE - 2018-01-01 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 cd /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 printf '# Kangaroo rat vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Kangaroo rat SEQ2_DIR=/hive/data/genomes/dipOrd2/dipOrd2.2bit SEQ2_LEN=/hive/data/genomes/dipOrd2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 351m30.983s cat fb.mm10.chainDipOrd2Link.txt # 645178768 bases of 2652783500 (24.321%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 dipOrd2) \ > rbest.log 2>&1 & # real 439m56.601s cat fb.mm10.chainRBestDipOrd2Link.txt # 605074450 bases of 2652783500 (22.809%) in intersection mkdir /hive/data/genomes/dipOrd2/bed/blastz.mm10.swap cd /hive/data/genomes/dipOrd2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 79m46.564s cat fb.dipOrd2.chainMm10Link.txt # 631879699 bases of 2065314047 (30.595%) in intersection cat fb.dipOrd2.chainSynMm10Link.txt # 581661824 bases of 2065314047 (28.163%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` dipOrd2 mm10) \ > rbest.log 2>&1 # real 412m53.879s cat fb.dipOrd2.chainRBestMm10Link.txt # 605056621 bases of 2065314047 (29.296%) in intersection ############################################################################## 
# LASTZ Chinese hamster ovary cell line CHO-K1 criGriChoV2 # (DONE - 2018-01-05 - Hiram) # establish a screen to control this job screen -S mm10criGriChoV2 mkdir /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 cd /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 printf '# Chinese hamster ovary cell line vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chinese hamster ovary cell line CHO-K1 criGriChoV2 SEQ2_DIR=/hive/data/genomes/criGriChoV2/criGriChoV2.2bit SEQ2_LEN=/hive/data/genomes/criGriChoV2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -noDbNameCheck -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 575m28.254s cat fb.mm10.chainCriGriChoV2Link.txt # 1583859515 bases of 2652783500 (59.706%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 criGriChoV2 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 1098m32.629s cat fb.mm10.chainRBestCriGriChoV2Link.txt # 1451345011 bases of 2652783500 (54.710%) in intersection mkdir /hive/data/genomes/criGriChoV2/bed/blastz.mm10.swap cd /hive/data/genomes/criGriChoV2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 196m59.409s cat fb.criGriChoV2.chainMm10Link.txt # 1605002950 bases of 2323924942 (69.064%) in intersection cat fb.criGriChoV2.chainSynMm10Link.txt # 1443603212 bases of 2323924942 (62.119%) in intersection time (doRecipBest.pl -load 
-workhorse=hgwdev criGriChoV2 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 1187m10.728s cat fb.criGriChoV2.chainRBestMm10Link.txt # 1452526554 bases of 2323924942 (62.503%) in intersection ############################################################################## # LASTZ Baboon papAnu4 (DONE - 2018-01-08 - Hiram) # establish a screen to control this job screen -S mm10papAnu4 mkdir /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 cd /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 printf '# mouse vs. baboon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: baboon papAnu4 SEQ2_DIR=/hive/data/genomes/papAnu4/papAnu4.2bit SEQ2_LEN=/hive/data/genomes/papAnu4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=180 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 783m49.438s cat fb.mm10.chainPapAnu4Link.txt # 919405716 bases of 2652783500 (34.658%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 papAnu4 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 582m15.183s cat fb.mm10.chainRBestPapAnu4Link.txt # 875366631 bases of 2652783500 (32.998%) in intersection mkdir /hive/data/genomes/papAnu4/bed/blastz.mm10.swap cd /hive/data/genomes/papAnu4/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 80m51.648s cat fb.papAnu4.chainMm10Link.txt # 907806517 bases of 2937004939 (30.909%) in intersection cat 
fb.papAnu4.chainSynMm10Link.txt # 866781916 bases of 2937004939 (29.512%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev papAnu4 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 521m7.590s cat fb.papAnu4.chainRBestMm10Link.txt # 874097827 bases of 2937004939 (29.762%) in intersection ############################################################################## # LASTZ guinea pig cavApe1 (DONE - 2018-01-08 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10CavApe1 mkdir /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 cd /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 printf '# guinea pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: guinea pig CavApe1 SEQ2_DIR=/hive/data/genomes/cavApe1/cavApe1.2bit SEQ2_LEN=/hive/data/genomes/cavApe1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 514m28.099s cat fb.mm10.chainCavApe1Link.txt # 424603451 bases of 2652783500 (16.006%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 cavApe1 \ -buildDir=`pwd`) > rbest.log 2>&1 & # real 481m13.804s cat fb.mm10.chainRBestCavApe1Link.txt # 394844156 bases of 2652783500 (14.884%) in intersection # and for the swap mkdir /hive/data/genomes/cavApe1/bed/blastz.mm10.swap cd /hive/data/genomes/cavApe1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # 
real 38m53.866s cat fb.cavApe1.chainMm10Link.txt # 420563721 bases of 1749140834 (24.044%) in intersection cat fb.cavApe1.chainSynMm10Link.txt # 364825271 bases of 1749140834 (20.857%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev cavApe1 mm10 \ -buildDir=`pwd`) > rbest.log 2>&1 & # real 438m45.544s cat fb.cavApe1.chainRBestMm10Link.txt # 395976886 bases of 1749140834 (22.638%) in intersection ############################################################################## # lastz Collared flycatcher ficAlb1 (DONE - 2018-01-09 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10 mkdir /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 cd /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 printf '# Mouse vs. Collared flycatcher BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Collard flycatcher/FicAlb1 SEQ2_DIR=/hive/data/genomes/ficAlb1/ficAlb1.2bit SEQ2_LEN=/hive/data/genomes/ficAlb1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 167m34.472s cat fb.mm10.chainFicAlb1Link.txt # 98177848 bases of 2652783500 (3.701%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 ficAlb1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 246m1.019s cat fb.mm10.chainRBestFicAlb1Link.txt # 76370866 bases of 2652783500 (2.879%) in intersection # and for the swap mkdir /hive/data/genomes/ficAlb1/bed/blastz.mm10.swap cd /hive/data/genomes/ficAlb1/bed/blastz.mm10.swap time (doBlastzChainNet.pl
-verbose=2 \ /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 8m5.637s cat fb.ficAlb1.chainMm10Link.txt # 85384367 bases of 1102325870 (7.746%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev ficAlb1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 209m22.159s cat fb.ficAlb1.chainRBestMm10Link.txt # 76183235 bases of 1102325870 (6.911%) in intersection ########################################################################## # lastz Lamprey petMar3 (DONE - 2018-01-25 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S petMar3 mkdir /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 cd /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 printf '# Mouse vs. Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Lamprey PetMar3 SEQ2_DIR=/hive/data/genomes/petMar3/petMar3.2bit SEQ2_LEN=/hive/data/genomes/petMar3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=60 BASE=/hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 119m5.528s cat fb.mm10.chainPetMar3Link.txt # 36835173 bases of 2652783500 (1.389%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 petMar3) \ > rbest.log 2>&1 & # real 201m40.789s cat 
fb.mm10.chainRBestPetMar3Link.txt # 21623456 bases of 2652783500 (0.815%) in intersection # and for the swap mkdir /hive/data/genomes/petMar3/bed/blastz.mm10.swap cd /hive/data/genomes/petMar3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 7m57.582s # real 7m2.754s cat fb.petMar3.chainMm10Link.txt # 39217857 bases of 1043181598 (3.759%) in intersection cat fb.petMar3.chainSynMm10Link.txt # 1381239 bases of 1043181598 (0.132%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` petMar3 mm10) \ > rbest.log 2>&1 & # real 206m59.727s cat fb.petMar3.chainRBestMm10Link.txt # 21335101 bases of 1043181598 (2.045%) in intersection ######################################################################### 2018-03-08: update UCSC GENCODE VM16 to include protein id (for VAI) and fix PAR tag cd /hive/data/genomes/mm10/bed/gencodeVM16 # save existing data mkdir -p prev/pre-proteinId mv tables/wgEncodeGencodeAttrsVM16.tab tables/wgEncodeGencodeTagVM16.tab prev/pre-proteinId/ mv loaded/wgEncodeGencodeAttrsVM16.tab.loaded loaded/wgEncodeGencodeTagVM16.tab.loaded prev/pre-proteinId/ mv data/gencode.tsv prev/pre-proteinId/ cp -p data/gencode.vM16.transcriptionSupportLevel.tab prev/pre-proteinId/ # edit gencodeLoad.mk to set mm10 as target # get gencode.tsv without rebuild TSL file or loading tables that don't change ~markd/compbio/ccds/ccds2/output/bin/x86_64/opt/gencodeGxfToAttrs --keepGoing data/release_M16/gencode.vM16.chr_patch_hapl_scaff.annotation.gtf.gz data/gencode.tsv make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk loaded/wgEncodeGencodeAttrsVM16.tab.loaded loaded/wgEncodeGencodeTagVM16.tab.loaded # 2018-03-19: update search to include protein id cd kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs 
mm10 M16 91 'Dec 2017' ######################################################################### # lastz garter snake/thaSir1 (DONE - 2018-03-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ThaSir1 mkdir /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 cd /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 # note: first time with this new 1.04.00 version of lastz printf '# Mouse vs. garter snake BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: garter snake thaSir1 SEQ2_DIR=/hive/data/genomes/thaSir1/thaSir1.2bit SEQ2_LEN=/hive/data/genomes/thaSir1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 112m40.572s cat fb.mm10.chainThaSir1Link.txt # 78464036 bases of 2652783500 (2.958%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 thaSir1) \ > rbest.log 2>&1 & # real 266m17.520s cat fb.mm10.chainRBestThaSir1Link.txt # 54099233 bases of 2652783500 (2.039%) in intersection # and for the swap mkdir /hive/data/genomes/thaSir1/bed/blastz.mm10.swap cd /hive/data/genomes/thaSir1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13/DEF \ -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 11m28.892s cat fb.thaSir1.chainMm10Link.txt # 63814138 bases of 1122701795 (5.684%) in intersection cat fb.thaSir1.chainSynMm10Link.txt # 20728394 bases of 1122701795 (1.846%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` thaSir1 mm10) \ > rbest.log 2>&1 & # real 234m31.934s cat fb.thaSir1.chainRBestMm10Link.txt # 54778217 bases of 1122701795 (4.879%) in intersection ############################################################################## # LASTZ cat felCat9 (DONE - 2018-03-14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat9 mkdir /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 cd /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 printf '# cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cat FelCat9 SEQ2_DIR=/hive/data/genomes/felCat9/felCat9.2bit SEQ2_LEN=/hive/data/genomes/felCat9/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 395m23.091s cat fb.mm10.chainFelCat9Link.txt # 801023018 bases of 2652783500 (30.196%) in intersection time (doRecipBest.pl -load mm10 felCat9 -buildDir=`pwd` \ -workhorse=hgwdev) > rbest.log 2>&1 & # real 486m55.606s cat fb.mm10.chainRBestFelCat9Link.txt # 761411281 bases of 2652783500 (28.702%) in intersection mkdir /hive/data/genomes/felCat9/bed/blastz.mm10.swap cd /hive/data/genomes/felCat9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 70m51.860s cat fb.felCat9.chainMm10Link.txt # 779862191 bases of 2476453204 (31.491%) in intersection cat fb.felCat9.chainSynMm10Link.txt # 754481540 bases of 2476453204 (30.466%) in intersection time (doRecipBest.pl -load felCat9 mm10 -buildDir=`pwd` \ -workhorse=hgwdev) > rbest.log 2>&1 & # real 375m4.937s cat fb.felCat9.chainRBestMm10Link.txt # 760753851 bases of 2476453204 (30.719%) in intersection ############################################################################## # LASTZ Beaver casCan1 vs. mouse/Mm10 (DONE - 2018-03-19 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 cd /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 # note: uses the 1.04.00 version of lastz (first used for thaSir1, 2018-03-13) printf '# Beaver vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: Beaver SEQ2_DIR=/hive/data/genomes/casCan1/casCan1.2bit SEQ2_LEN=/hive/data/genomes/casCan1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 455m47.982s cat fb.mm10.chainCasCan1Link.txt # 969752969 bases of 2652783500 (36.556%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 casCan1) \ > rbest.log 2>&1 & # real 981m12.451s cat fb.mm10.chainRBestCasCan1Link.txt # 912108399 bases of 2652783500 (34.383%) in intersection mkdir /hive/data/genomes/casCan1/bed/blastz.mm10.swap
cd /hive/data/genomes/casCan1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 100m12.450s cat fb.casCan1.chainMm10Link.txt # 1027587643 bases of 2517974654 (40.810%) in intersection cat fb.casCan1.chainSynMm10Link.txt # 876969229 bases of 2517974654 (34.828%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` casCan1 mm10) \ > rbest.log 2>&1 # real 1280m7.127s cat fb.casCan1.chainRBestMm10Link.txt # 911437520 bases of 2517974654 (36.197%) in intersection ############################################################################## # LASTZ mouse/mm10 Chimp/panTro6 - (DONE - 2018-03-24 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 cd /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 printf '# mouse vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: chimp panTro6 SEQ2_DIR=/hive/data/genomes/panTro6/panTro6.2bit SEQ2_LEN=/hive/data/genomes/panTro6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 347m21.874s cat fb.mm10.chainPanTro6Link.txt # 935720585 bases of 2652783500 (35.273%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ mm10 panTro6) > rbest.log 2>&1 & # real 565m15.871s cat 
fb.mm10.chainRBestPanTro6Link.txt # 891553355 bases of 2652783500 (33.608%) in intersection # and for the swap: mkdir /hive/data/genomes/panTro6/bed/blastz.mm10.swap cd /hive/data/genomes/panTro6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 78m57.631s cat fb.panTro6.chainMm10Link.txt # 934668641 bases of 3018592990 (30.964%) in intersection cat fb.panTro6.chainSynMm10Link.txt # 889944141 bases of 3018592990 (29.482%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ panTro6 mm10) > rbest.log 2>&1 & # real 504m47.811s cat fb.panTro6.chainRBestMm10Link.txt # 890065520 bases of 3018592990 (29.486%) in intersection ############################################################################## # LASTZ mouse/mm10 Orangutan/ponAbe3 - (DONE - 2018-03-26 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 cd /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 printf '# mouse vs orangutan BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: orangutan ponAbe3 SEQ2_DIR=/hive/data/genomes/ponAbe3/ponAbe3.2bit SEQ2_LEN=/hive/data/genomes/ponAbe3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 461m46.426s cat fb.mm10.chainPonAbe3Link.txt # 936755064 bases of 
2652783500 (35.312%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ mm10 ponAbe3) > rbest.log 2>&1 & # real 554m41.676s cat fb.mm10.chainRBestPonAbe3Link.txt # 892145302 bases of 2652783500 (33.631%) in intersection # and for the swap: mkdir /hive/data/genomes/ponAbe3/bed/blastz.mm10.swap cd /hive/data/genomes/ponAbe3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 78m29.160s cat fb.ponAbe3.chainMm10Link.txt # 929970181 bases of 3043444524 (30.557%) in intersection cat fb.ponAbe3.chainSynMm10Link.txt # 890801507 bases of 3043444524 (29.270%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ ponAbe3 mm10) > rbest.log 2>&1 & # real 496m49.168s cat fb.ponAbe3.chainRBestMm10Link.txt # 890774155 bases of 3043444524 (29.269%) in intersection ######################################################################### # LASTZ mouse/mm10 sheep/oviAri4 - (DONE - 2018-04-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 cd /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 printf '# mouse vs sheep BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: sheep oviAri4 SEQ2_DIR=/hive/data/genomes/oviAri4/oviAri4.2bit SEQ2_LEN=/hive/data/genomes/oviAri4/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 
-chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # Command failed: # ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev \ # nice /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25/axtChain/netSynteny.csh # # real 237m24.916s # used the wrong version of doBlastzChainNet.pl which failed at the # syntenic net step. Clean up and re-try with the fixed up script: # Otherwise skip to rmskJoined below ######################################################################### # LASTZ mouse/mm10 horse/equCab3 - (DONE - 2018-05-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 cd /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 printf '# mouse vs horse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: horse equCab3 SEQ2_DIR=/hive/data/genomes/equCab3/equCab3.2bit SEQ2_LEN=/hive/data/genomes/equCab3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 605m50.368s cat fb.mm10.chainEquCab3Link.txt # 921489718 bases of 2652783500 (34.737%) in intersection cat fb.mm10.chainSynEquCab3Link.txt # 876836391 bases of 2652783500 (33.053%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 equCab3) > rbest.log 2>&1 & # real 398m20.685s cat fb.mm10.chainRBest.EquCab3.txt # 876785778 bases of 2652783500 (33.052%) in intersection # and for the swap: mkdir /hive/data/genomes/equCab3/bed/blastz.mm10.swap cd /hive/data/genomes/equCab3/bed/blastz.mm10.swap time
(doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 83m14.250s cat fb.equCab3.chainMm10Link.txt # 930516778 bases of 2497530654 (37.257%) in intersection cat fb.equCab3.chainSynMm10Link.txt # 897238830 bases of 2497530654 (35.925%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` equCab3 mm10) > rbest.log 2>&1 & # real 318m40.520s cat fb.equCab3.chainRBest.Mm10.txt # 875954606 bases of 2497530654 (35.073%) in intersection ######################################################################### # LASTZ mouse/mm10 Minke whale/balAcu1 - (DONE - 2018-06-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 cd /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 printf '# mouse vs Minke whale BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: Minke whale balAcu1 SEQ2_DIR=/hive/data/genomes/balAcu1/balAcu1.2bit SEQ2_LEN=/hive/data/genomes/balAcu1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 190m45.265s cat fb.mm10.chainBalAcu1Link.txt # 851790136 bases of 2652783500 (32.109%) in intersection cat fb.mm10.chainSynBalAcu1Link.txt # 806407823 bases of 2652783500 (30.399%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev 
-buildDir=`pwd` mm10 balAcu1) > rbest.log 2>&1 & # real 287m58.329s cat fb.mm10.chainRBest.BalAcu1.txt # 811435554 bases of 2652783500 (30.588%) in intersection # and for the swap: mkdir /hive/data/genomes/balAcu1/bed/blastz.mm10.swap cd /hive/data/genomes/balAcu1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 67m0.560s cat fb.balAcu1.chainMm10Link.txt # 832845143 bases of 2286657046 (36.422%) in intersection cat fb.balAcu1.chainSynMm10Link.txt # 802734600 bases of 2286657046 (35.105%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` balAcu1 mm10) > rbest.log 2>&1 & # real 241m51.110s cat fb.balAcu1.chainRBest.Mm10.txt # 810427625 bases of 2286657046 (35.442%) in intersection ######################################################################### 2018-07-01: import of UCSC GENCODE group processing of GENCODE VM17 (markd) # not being pushed to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM17 pushd /hive/data/genomes/mm10/bed/gencodeVM17 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M17 92 'Mar 2018' ## only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release.
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM17.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM17 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. ############################################################################## # LASTZ mouse/mm10 Axolotl/ambMex1 - (DONE - 2018-07-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 cd /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 printf '# mouse vs Axolotl BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: Axolotl ambMex1 SEQ2_DIR=/hive/data/genomes/ambMex1/ambMex1.2bit SEQ2_LEN=/hive/data/genomes/ambMex1/chrom.sizes SEQ2_CHUNK=80000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 881m7.910s cat fb.mm10.chainAmbMex1Link.txt # 52143617 bases of 2652783500 (1.966%) in intersection cat fb.mm10.chainSynAmbMex1Link.txt # 2686570 bases of 2652783500 (0.101%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 ambMex1) > rbest.log 2>&1 & # real 478m39.331s # something odd went haywire at the download step time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` mm10 ambMex1) > download.log 2>&1 & # real 1m42.883s cat 
fb.mm10.chainRBest.AmbMex1.txt # 36938030 bases of 2652783500 (1.392%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex1/bed/blastz.mm10.swap cd /hive/data/genomes/ambMex1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 39m28.757s cat fb.ambMex1.chainMm10Link.txt # 87124587 bases of 28366694468 (0.307%) in intersection cat fb.ambMex1.chainSynMm10Link.txt # 2893381 bases of 28366694468 (0.010%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > rbest.log 2>&1 & # real 568m10.621s # something odd went haywire at the download step time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > download.log 2>&1 & # real 3m16.404s cat fb.ambMex1.chainRBest.Mm10.txt # 38584422 bases of 28366694468 (0.136%) in intersection ############################################################################## 2018-08-03: import of UCSC GENCODE group processing of GENCODE VM18 (markd) # being pushed to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM18 pushd /hive/data/genomes/mm10/bed/gencodeVM18 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M18 93 'July 2018' ## only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release.
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM18.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM18 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ############################################################################## # LASTZ mouse/mm10 vs. chicken/galGal6 - (DONE - 2018-10-12 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 cd /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 printf "# Mouse vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal6 SEQ2_DIR=/hive/data/genomes/galGal6/galGal6.2bit SEQ2_LEN=/hive/data/genomes/galGal6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 84m14.188s cat fb.mm10.chainGalGal6Link.txt # 101151132 bases of 2652783500 (3.813%) in intersection cat fb.mm10.chainSynGalGal6Link.txt # 70707720 bases of 2652783500 (2.665%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 galGal6) > rbest.log 2>&1 & # real 116m19.316s cat fb.mm10.chainRBest.GalGal6.txt # 79649474 bases of 2652783500 (3.002%) in intersection # and for the swap: mkdir 
/hive/data/genomes/galGal6/bed/blastz.mm10.swap cd /hive/data/genomes/galGal6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 6m41.043s cat fb.galGal6.chainMm10Link.txt # 88539346 bases of 1055588482 (8.388%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` galGal6 mm10) > rbest.log 2>&1 & # real 94m11.007s cat fb.galGal6.chainRBest.Mm10.txt # 79474812 bases of 1055588482 (7.529%) in intersection ######################################################################### # LASTZ mouse/mm10 cow/bosTau9 - (DONE - 2018-11-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 cd /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 printf '# mouse vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: cow bosTau9 SEQ2_DIR=/hive/data/genomes/bosTau9/bosTau9.2bit SEQ2_LEN=/hive/data/genomes/bosTau9/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 211m46.258s cat fb.mm10.chainBosTau9Link.txt # 703580224 bases of 2652783500 (26.522%) in intersection cat fb.mm10.chainSynBosTau9Link.txt # 659095603 bases of 2652783500 (24.845%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 bosTau9)
> rbest.log 2>&1 & # real 214m24.819s cat fb.mm10.chainRBest.BosTau9.txt # 667950653 bases of 2652783500 (25.179%) in intersection # and for the swap: mkdir /hive/data/genomes/bosTau9/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 41m22.962s cat fb.bosTau9.chainMm10Link.txt # 695248613 bases of 2715853792 (25.600%) in intersection cat fb.bosTau9.chainSynMm10Link.txt # 660591041 bases of 2715853792 (24.324%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` bosTau9 mm10) > rbest.log 2>&1 & # real 204m36.465s cat fb.bosTau9.chainRBest.Mm10.txt # 667305554 bases of 2715853792 (24.571%) in intersection ######################################################################### 2018-11-10: import of UCSC GENCODE group processing of GENCODE VM19 (markd) # not being push to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM19 pushd /hive/data/genomes/mm10/bed/gencodeVM19 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M19 94 'Oct 2018' ## only if being pushed to RR: (skipped) # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. 
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: (SKIPPED) # edit all.joiner to add ~/tmp/gencodeVM19.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM19 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ############################################################################## # LASTZ mouse/mm10 vs. Japanese quail/cotJap2 - (DONE - 2018-11-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 cd /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 printf "# Mouse vs. Japanese quail BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Japanese quail cotJap2 SEQ2_DIR=/hive/data/genomes/cotJap2/cotJap2.2bit SEQ2_LEN=/hive/data/genomes/cotJap2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 82m16.032s cat fb.mm10.chainCotJap2Link.txt # 97251364 bases of 2652783500 (3.666%) in intersection cat fb.mm10.chainSynCotJap2Link.txt # 67653818 bases of 2652783500 (2.550%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 cotJap2) > rbest.log 2>&1 & # real 104m58.905s cat fb.mm10.chainRBest.CotJap2.txt # 76298136 bases of 2652783500 (2.876%) in intersection # and for the swap: mkdir 
/hive/data/genomes/cotJap2/bed/blastz.mm10.swap cd /hive/data/genomes/cotJap2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 6m37.873s cat fb.cotJap2.chainMm10Link.txt # 82592561 bases of 917263224 (9.004%) in intersection cat fb.cotJap2.chainSynMm10Link.txt # 66583746 bases of 917263224 (7.259%) in intersection # mistakenly started this on ku, it failed at the download step since # it could not see the /gbdb/mm10/ hierarchy: time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` cotJap2 mm10) > rbest.log 2>&1 & # real 79m48.767s # continue on hgwdev time (doRecipBest.pl -load -workhorse=hgwdev -continue=download -buildDir=`pwd` cotJap2 mm10) > rbest.download.log 2>&1 & # real 1m40.970s cat fb.cotJap2.chainRBest.Mm10.txt # 76078816 bases of 917263224 (8.294%) in intersection ############################################################################## 2018-11-30: import of UCSC GENCODE group processing of GENCODE VM20 prerelease (markd) # This is a prerelease for testing and is *not* to be pushed until the final release. # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM20 pushd /hive/data/genomes/mm10/bed/gencodeVM20 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M20 95 'Dec 2018' ## only if being pushed to RR: (skipped) # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release.
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: (SKIPPED) # edit all.joiner to add ~/tmp/gencodeVM20.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM20 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ######################################################################### 2019-01-17: tabula muris track (max) # download 7Tb of data from Amazon, using token, CZI pays (got token by email, via Angela Pisco, James Webber) export AWS_ACCESS_KEY_ID=xxxxx export AWS_SESSION_TOKEN=xxxxx aws s3 sync s3://czbiohub-tabula-muris/tabula_muris_bam_files/ . --delete cd ~/projects/czi/cbData/ucsc/tabulaMuris csvToTab TM_facs_metadata.csv > TM_facs_metadata.tsv cat TM_facs_metadata.csv | tr '.' '-' | csvToTab > TM_facs_metadata.fix.tsv see if the liftOver menus function in the browser from mm10 to GRCm38B real 1m35.590s real 126m0.688s 267m58.813s generate trackDb and joiner blurb download, build and load tables commit all real 196m22.733s real 106m31.449s real 211m21.922s real 52m48.045s Results are in gencode-cmp.tsv