6c02988b8ef7077f418bd3185126666f612759b1 markd Thu Sep 26 10:28:41 2019 -0700 import of gencode V32 final diff --git src/hg/makeDb/doc/mm10.txt src/hg/makeDb/doc/mm10.txt index 7a9f9ac..7aba9a1 100644 --- src/hg/makeDb/doc/mm10.txt +++ src/hg/makeDb/doc/mm10.txt @@ -1,18122 +1,18124 @@ # for emacs: -*- mode: sh; -*- # This file describes browser build for the mm10 # Mus musculus (mouse) # DATE: 07-Dec-2011 # ORGANISM: Mus musculus # TAXID: 10090 # ASSEMBLY LONG NAME: Genome Reference Consortium Mouse Build 38 # ASSEMBLY SHORT NAME: GRCm38 # ASSEMBLY SUBMITTER: Genome Reference Consortium # ASSEMBLY TYPE: Haploid + alternate loci # NUMBER OF ASSEMBLY-UNITS: 16 # ASSEMBLY ACCESSION: GCA_000001635.2 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/ # Genome ID: # http://www.ncbi.nlm.nih.gov/genome/52 # Taxonomy: # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=10090 # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=39442 # GRC information # http://www.ncbi.nlm.nih.gov/projects/genome/assembly/grc/mouse/ # Mitochondrial sequence: # http://www.ncbi.nlm.nih.gov/bioproject/13767 # C57BL/6J sequence: # http://www.ncbi.nlm.nih.gov/bioproject/51977 # Finishing project: # http://www.ncbi.nlm.nih.gov/bioproject/20689 # Assembly ID: 327618 # http://www.ncbi.nlm.nih.gov/genome/assembly/327618/ # Celera Assembly # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAHY00 ############################################################################# # fetch sequence from genbank (DONE - 2012-01-30 - Hiram) mkdir -p /hive/data/genomes/mm10/genbank cd /hive/data/genomes/mm10/genbank rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38/ ./ # measure sequence to be used here faSize Primary_Assembly/assembled_chromosomes/FASTA/*.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA/*.fa.gz \ 
non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0 lower) # in 66 sequences in 29 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (gi|371559559|gb|JH584295.1|) max 195471971 # (gi|371561115|gb|CM000994.2|) median 184189 ############################################################################# # fixup names for UCSC standards (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/ucsc cd /hive/data/genomes/mm10/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > toUcsc.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.agp.gz|") or die "can not read chr${chrN}.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x toUcsc.pl time ./toUcsc.pl # real 0m53.256s faSize chr*.fa # 2725521370 bases (77999939 N's 2647521431 real 2647521431 upper 0 # lower) in 21 sequences in 21 files # Total size: mean 129786731.9 sd 33408399.1 min 61431566 (chr19) # max 195471971 (chr1) median 
124902244
######################## Unplaced scaffolds
    # unplaced.pl rewrites the unplaced-scaffold AGP and FASTA files into
    # unplaced.agp and unplaced.fa, renaming each sequence to
    # chrUn_<accession> with the ".1" version suffix removed.
cat << '_EOF_' > unplaced.pl
#!/bin/env perl

use strict;
use warnings;

my $agpFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz";
my $fastaFile =  "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz";

# AGP: comment lines pass through, data lines get the chrUn_ prefix
open (FH, "zcat $agpFile|") or die "can not read $agpFile";
open (UC, ">unplaced.agp") or die "can not write to unplaced.agp";
while (my $line = <FH>) {
    if ($line =~ m/^#/) {
        print UC $line;
    } else {
        # drop the ".1" version from the first column only; an unanchored
        # s/\.1// could instead hit a ".1" in a later column when the
        # first column does not carry one
        $line =~ s/^(\S+)\.1(\s)/$1$2/;
        printf UC "chrUn_%s", $line;
    }
}
close (FH);
close (UC);

# FASTA: reduce each header to its accession and rename it the same way
open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
open (UC, ">unplaced.fa") or die "can not write to unplaced.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>/) {
        chomp $line;
        $line =~ s/.*gb\|//;        # remove everything through "gb|"
        $line =~ s/\.1\|.*//;       # remove ".1|" and the rest of the header
        printf UC ">chrUn_$line\n";
    } else {
        print UC $line;
    }
}
close (FH);
close (UC);
'_EOF_'
    # << happy emacs
    chmod +x unplaced.pl
    time ./unplaced.pl
    # real    0m0.119s
    # make sure none of the names got to be over 31 characters long:
    grep -v "^#" unplaced.agp | cut -f1 | sort | uniq -c | sort -rn
    # not much in that sequence:
    faSize unplaced.fa
    # 803895 bases (62411 N's 741484 real 741484 upper 0 lower)
    #   in 22 sequences in 1 files
    # Total size: mean 36540.7 sd 21518.0 min 20208 (chrUn_GL456368)
    #   max 114452 (chrUn_JH584304) median 28772

########## chrM
    # rename the single FASTA header to chrM, and the AGP object name likewise
    zcat ../genbank/non-nuclear/assembled_chromosomes/FASTA/chrMT.fa.gz \
        | sed -e "s/^>.*/>chrM/" > chrM.fa
    zcat ../genbank/non-nuclear/assembled_chromosomes/AGP/chrMT.comp.agp.gz \
        | sed -e "s/^AY172335\.1/chrM/" > chrM.agp

######################## Unlocalized scaffolds
    # unlocalized.pl reads unlocalized.chr2scaf to learn which chromosome
    # each scaffold accession belongs to, then for every chromosome writes
    # chrN_random.agp and chrN_random.fa with sequences renamed to
    # chrN_<accession>_random.  It dies if an accession keeps a "." after
    # the ".1" suffix is removed, or if an accession is not listed for the
    # chromosome whose file it was found in.
cat << '_EOF_' > unlocalized.pl
#!/bin/env perl

use strict;
use warnings;

my %accToChr;   # accession (with version) -> chromosome name
my %chrNames;   # chromosome name -> number of scaffolds listed for it

open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or
        die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf";
while (my $line = <FH>) {
    next if ($line =~ m/^#/);
    chomp $line;
    my ($chrN, $acc) = split('\s+', $line);
    $accToChr{$acc} = $chrN;
    $chrNames{$chrN} += 1;
}
close (FH);

foreach my $chrN (keys %chrNames) {
    my $agpFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz";
    my $fastaFile =  "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz";

    # AGP: replace the first column with the UCSC name, keep other columns
    open (FH, "zcat $agpFile|") or die "can not read $agpFile";
    open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp";
    while (my $line = <FH>) {
        if ($line =~ m/^#/) {
            print UC $line;
        } else {
            chomp $line;
            my (@a) = split('\t', $line);
            my $acc = $a[0];
            my $accNo1 = $acc;
            # the dot must be escaped: s/.1$// removes ANY character
            # followed by a final "1", so an unversioned accession ending
            # in "11" would be truncated and still pass the check below
            $accNo1 =~ s/\.1$//;
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            # an accession missing from chr2scaf is an error too; test
            # defined first so the comparison never sees undef
            die "ERROR: chrN $chrN not correct for $acc"
                if (!defined($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC "%s", $ucscName;
            for (my $i = 1; $i < scalar(@a); ++$i) {
                printf UC "\t%s", $a[$i];
            }
            printf UC "\n";
        }
    }
    close (FH);
    close (UC);
    printf "chr%s\n", $chrN;

    # FASTA: same renaming applied to the header lines
    open (FH, "zcat $fastaFile|") or die "can not read $fastaFile";
    open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa";
    while (my $line = <FH>) {
        if ($line =~ m/^>/) {
            chomp $line;
            my $acc = $line;
            $acc =~ s/.*gb\|//;     # remove everything through "gb|"
            $acc =~ s/\|.*//;       # remove the trailing "|" and description
            my $accNo1 = $acc;
            $accNo1 =~ s/\.1$//;    # literal ".1" only, as in the AGP pass
            die "ERROR: acc not .1: $acc" if ($accNo1 =~ m/\./);
            die "ERROR: chrN $chrN not correct for $acc"
                if (!defined($accToChr{$acc}) || $accToChr{$acc} ne $chrN);
            my $ucscName = "chr${chrN}_${accNo1}_random";
            printf UC ">$ucscName\n";
        } else {
            print UC $line;
        }
    }
    close (FH);
    close (UC);
}
'_EOF_'
    # << happy emacs
    chmod +x unlocalized.pl
    time ./unlocalized.pl
    # real    0m0.430s
    faSize chr*_random.fa
    # 4530210 bases (25924 N's 4504286 real 4504286 upper 0 lower)
    #   in 22 sequences in 6 files
    # Total size: mean 205918.6 sd 184688.0 min 1976 (chr4_JH584295_random)
    #   max 953012 (chr5_JH584299_random) median 191905
    # verify none of the names are longer than 31 characters:
    grep -h -v "^#" chr*_random.agp | cut -f1 | sort | uniq -c | sort -nr
    # compress all
these fasta and agp files: gzip *.fa *.agp # verify all the sequence is still here after all this rigamarole: time faSize *.fa.gz # 2730871774 bases (78088274 N's 2652783500 real 2652783500 upper 0 # lower) in 66 sequences in 29 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 ############################################################################# # Initial browser build (DONE - 2012-01-06 - Hiram) cd /hive/data/genomes/mm10 cat << '_EOF_' > mm10.config.ra # Config parameters for makeGenomeDb.pl: db mm10 clade mammal genomeCladePriority 40 scientificName Mus musculus commonName Mouse assemblyDate Dec. 2011 assemblyLabel Genome Reference Consortium Mouse Build 38 (GCA_000001635.2) assemblyShortLabel GRCm38 orderKey 1209 mitoAcc none fastaFiles /hive/data/genomes/mm10/ucsc/*.fa.gz agpFiles /hive/data/genomes/mm10/ucsc/*.agp.gz dbDbSpeciesDir mouse taxId 10090 ncbiAssemblyId 327618 ncbiAssemblyName GRCm38 '_EOF_' # << happy emacs time makeGenomeDb.pl -stop=agp mm10.config.ra > agp.log 2>&1 # real 3m4.568s # check the end of agp.log to verify it is OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev \ -continue=db mm10.config.ra > db.log 2>&1 # real 20m51.374s # verify the end of db.log indicates successful ############################################################################# # running repeat masker (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/repeatMasker cd /hive/data/genomes/mm10/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek mm10 > do.log 2>&1 & # real 609m48.767s cat faSize.rmsk.txt # 2730871774 bases (78088274 N's 2652783500 real 1456094545 upper # 1196688955 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %43.82 masked total, %45.11 masked real grep -i 
versi do.log # RepeatMasker version development-$Id: RepeatMasker,v 1.26 2011/09/26 16:19:44 angie Exp $ # April 26 2011 (open-3-3-0) version of RepeatMasker time featureBits -countGaps mm10 rmsk # 1196694219 bases of 2730871774 (43.821%) in intersection # real 0m30.460s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/simpleRepeat cd /hive/data/genomes/mm10/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ mm10 > do.log 2>&1 & # real 16m35.603s # batch failed, one job failed: # ./TrfRun.csh /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # which is the chrM sequence - it has no simple repeats # create an empty output file result: touch /hive/data/genomes/mm10/TrfPart/062/062.lst.bed # go to encodek and create the run.time file to signal this step is done cd /hive/data/genomes/mm10/bed/simpleRepeat/run.cluster para time > run.time # Completed: 70 of 71 jobs # Crashed: 1 jobs # CPU time in finished jobs: 13103s 218.38m 3.64h 0.15d 0.000 y # IO & Wait Time: 163s 2.72m 0.05h 0.00d 0.000 y # Average job time: 190s 3.16m 0.05h 0.00d # Longest finished job: 392s 6.53m 0.11h 0.00d # Submission to last job: 894s 14.90m 0.25h 0.01d # continue procedure: time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ -continue=filter mm10 > filter.log 2>&1 & # real 1m20.021s cat fb.simpleRepeat # 92161833 bases of 2652783500 (3.474%) in intersection # when RepeatMasker is done, add this mask to the sequence: cd /hive/data/genomes/mm10 twoBitMask mm10.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed mm10.2bit # you can safely ignore the 
warning about fields >= 13 twoBitToFa mm10.2bit stdout | faSize stdin > faSize.mm10.2bit.txt cat faSize.mm10.2bit.txt # 2730871774 bases (78088274 N's 2652783500 real 1454267808 upper # 1198515692 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %43.89 masked total, %45.18 masked real # set SymLink in gbdb to this masked sequence rm /gbdb/mm10/mm10.2bit ln -s `pwd`/mm10.2bit /gbdb/mm10/mm10.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/gap cd /hive/data/genomes/mm10/bed/gap time nice -n +19 findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../mm10.unmasked.2bit > findMotif.txt 2>&1 # real 1m0.372s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits -countGaps mm10 -not gap -bed=notGap.bed # 2658879040 bases of 2730871774 (97.364%) in intersection # real 0m13.067s time featureBits -countGaps mm10 allGaps.bed notGap.bed -bed=new.gaps.bed # 6095540 bases of 2730871774 (0.223%) in intersection # real 0m15.177s # what is the highest index in the existing gap table: hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1 # 54 cat << '_EOF_' > mkGap.pl #!/bin/env perl use strict; use warnings; my $ix=`hgsql -N -e "select ix from gap;" mm10 | sort -n | tail -1`; chomp $ix; open (FH,"<new.gaps.bed") or die "can not read new.gaps.bed"; while (my $line = <FH>) { my ($chrom, $chromStart, $chromEnd, $rest) = split('\s+', $line); ++$ix; printf "%s\t%d\t%d\t%d\tN\t%d\tother\tyes\n", $chrom, $chromStart, $chromEnd, $ix, $chromEnd-$chromStart; } close (FH); '_EOF_' # << happy emacs chmod +x ./mkGap.pl ./mkGap.pl > other.bed wc -l other.bed # 384 featureBits -countGaps mm10 other.bed # 6095540 bases of 2730871774 (0.223%) in intersection hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/gap.sql \ 
-noLoad mm10 otherGap other.bed # verify no overlap with gap table: time featureBits -countGaps mm10 gap other.bed # 0 bases of 2730871774 (0.000%) in intersection # real 0m1.281s # verify no errors before adding to the table: time gapToLift -minGap=1 mm10 nonBridged.before.lift \ -bedFile=nonBridged.before.bed > before.gapToLift.txt 2>&1 & # real 0m7.205s # check for warnings in before.gapToLift.txt, should be empty: # -rw-rw-r-- 1 1633 Jan 6 15:20 before.gapToLift.txt # it indicates that there are telomeres adjacent to centromeres # and heterochromatin # starting with this many: hgsql -e "select count(*) from gap;" mm10 # 302 hgsql mm10 -e 'load data local infile "bed.tab" into table gap;' # result count: hgsql -e "select count(*) from gap;" mm10 # 686 # == 302 + 384 # verify we aren't adding gaps where gaps already exist # this would output errors if that were true: gapToLift -minGap=1 mm10 nonBridged.lift -bedFile=nonBridged.bed # same set of warnings as before: telomeres, centromeres and heterochromatin # there should be no errors or other output, checked bridged gaps: hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes ########################################################################## ## WINDOWMASKER (DONE - 2012-02-06 - Hiram) mkdir /hive/data/genomes/mm10/bed/windowMasker cd /hive/data/genomes/mm10/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev mm10 > do.log 2>&1 & # real 167m12.012s # Masking statistics twoBitToFa mm10.wmsk.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1686407708 upper # 966375792 lower) in 66 sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.39 masked total, %36.43 masked real twoBitToFa mm10.wmsk.sdust.2bit stdout | faSize stdin # 2730871774 bases (78088274 N's 2652783500 real 1670424648 upper # 982358852 lower) in 66 
sequences in 1 files # Total size: mean 41376845.1 sd 63617337.3 min 1976 # (chr4_JH584295_random) max 195471971 (chr1) median 184189 # %35.97 masked total, %37.03 masked real hgLoadBed mm10 windowmaskerSdust windowmasker.sdust.bed.gz # Loaded 12655947 elements of size 3 featureBits -countGaps mm10 windowmaskerSdust # 1060447084 bases of 2730871774 (38.832%) in intersection # eliminate the gaps from the masking featureBits mm10 -not gap -bed=notGap.bed # 2652783500 bases of 2652783500 (100.000%) in intersection time nice -n +19 featureBits mm10 windowmaskerSdust notGap.bed \ -bed=stdout | gzip -c > cleanWMask.bed.gz # 982358852 bases of 2652783500 (37.031%) in intersection # real 1m42.449s # reload track to get it clean hgLoadBed mm10 windowmaskerSdust cleanWMask.bed.gz # Loaded 12655987 elements of size 4 time featureBits -countGaps mm10 windowmaskerSdust # 982358852 bases of 2730871774 (35.972%) in intersection # real 1m13.889s # do *not* need to mask with this clean result since RepeatMasker # does a very good job here. Using RM masking instead. 
zcat cleanWMask.bed.gz \ | twoBitMask ../../mm10.unmasked.2bit stdin \ -type=.bed mm10.cleanWMSdust.2bit twoBitToFa mm10.cleanWMSdust.2bit stdout | faSize stdin \ > mm10.cleanWMSdust.faSize.txt cat mm10.cleanWMSdust.faSize.txt # how much does this window masker and repeat masker overlap: time featureBits -countGaps mm10 rmsk windowmaskerSdust # 753614881 bases of 2730871774 (27.596%) in intersection # real 1m42.691s # RM by itself: time featureBits -countGaps mm10 rmsk # 1196694219 bases of 2730871774 (43.821%) in intersection # real 0m30.460s ############################################################################# # PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2012-02-07 - Hiram) ssh encodek mkdir /hive/data/genomes/mm10/bed/linSpecRep cd /hive/data/genomes/mm10/bed/linSpecRep # split the RM output by chromosome name into separate files mkdir rmsk dateRepeats head -3 ../repeatMasker/mm10.sorted.fa.out > rmsk.header.txt headRest 3 ../repeatMasker/mm10.sorted.fa.out \ | splitFileByColumn -ending=.out -col=5 -head=rmsk.header.txt stdin rmsk ls -1S rmsk/* > rmOut.list cat << '_EOF_' > mkLSR #!/bin/csh -fe rm -f dateRepeats/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus /scratch/data/genomes/RepeatMasker/DateRepeats \ $1 -query mouse -comp human -comp rat -comp dog -comp cow mv $1_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus dateRepeats '_EOF_' # << happy emacs chmod +x mkLSR cat << '_EOF_' > template #LOOP ./mkLSR $(path1) {check out line+ dateRepeats/$(file1)_homo-sapiens_rattus_canis-lupus-familiaris_bos-taurus} #ENDLOOP '_EOF_' # << happy emacs gensub2 rmOut.list single template jobList para create jobList para try ... check ... push ... etc... 
para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 1743s 29.05m 0.48h 0.02d 0.000 y # IO & Wait Time: 190s 3.16m 0.05h 0.00d 0.000 y # Average job time: 29s 0.49m 0.01h 0.00d # Longest finished job: 65s 1.08m 0.02h 0.00d # Submission to last job: 160s 2.67m 0.04h 0.00d mkdir notInHuman notInRat notInDog notInCow for F in dateRepeats/chr*.out_homo-sapiens* do B=`basename ${F}` B=${B/.out*/} echo $B /cluster/bin/scripts/extractRepeats 1 ${F} > \ notInHuman/${B}.out.spec /cluster/bin/scripts/extractRepeats 2 ${F} > \ notInRat/${B}.out.spec /cluster/bin/scripts/extractRepeats 3 ${F} > \ notInDog/${B}.out.spec /cluster/bin/scripts/extractRepeats 4 ${F} > \ notInCow/${B}.out.spec done # notInDog, and notInCow ended up being identical. # The notInRat and notInHuman are different # To check identical find . -name "*.out.spec" | \ while read FN; do echo `cat ${FN} | sum -r` ${FN}; done \ | sort -k1,1n | sort -t"/" -k3,3 > check.same # this produces a count of 2 for the sums for Cow and Dog, all the same egrep "Cow|Dog" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # this does not produce a count of 2 for the sums for Cow and Human egrep "Cow|Human" check.same | awk '{print $1}' | sort | uniq -c | sort -rn # Copy to data/genomes staging for cluster replication mkdir /hive/data/genomes/staging/data/genomes/mm10 rsync -a -P ./notInRat/ /hive/data/genomes/staging/data/genomes/mm10/notInRat/ rsync -a -P ./notInHuman/ /hive/data/genomes/staging/data/genomes/mm10/notInHuman/ rsync -a -P ./notInCow/ /hive/data/genomes/staging/data/genomes/mm10/notInOthers/ # We also need the nibs for the lastz runs with lineage specific repeats mkdir /hive/data/genomes/mm10/nib cd /hive/data/genomes/mm10 cut -f1 chrom.sizes | while read C do twoBitToFa -seq=${C} mm10.2bit stdout | faToNib -softMask stdin nib/${C}.nib ls -og nib/$C.nib done # verify one is properly masked: nibFrag -masked nib/chrM.nib 0 16299 + stdout | less # compare to: twoBitToFa -seq=chrM mm10.fa 
stdout | less # Copy to data/genomes staging for cluster replication rsync -a -P ./nib/ /hive/data/genomes/staging/data/genomes/mm10/nib/ ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2012-02-08 - Hiram) # Use -repMatch=650, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 2652783500 / 2897316137 \) \* 1024 # ( 2652783500 / 2897316137 ) * 1024 = 937.574699 # round up to 1000 (mm9 used 912) cd /hive/data/genomes/mm10 time blat mm10.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/mm10.11.ooc -repMatch=1000 # Wrote 27208 overused 11-mers to jkStuff/mm10.11.ooc # real 2m9.568s # at repMatch=900: # Wrote 31822 overused 11-mers to jkStuff/mm10.11.ooc # there are non-bridged gaps, make lift file for genbank hgsql -N -e "select bridge from gap;" mm10 | sort | uniq -c # 191 no # 495 yes cd /hive/data/genomes/mm10/jkStuff gapToLift mm10 mm10.nonBridged.lift -bedFile=mm10.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' mm10.nonBridged.bed | sort -nr | head 116378660 chr2 59120641 175499301 chr2.02 # copy all of this stuff to the klusters: cd /hive/data/genomes/mm10 mkdir /hive/data/genomes/staging/data/genomes/mm10 cp -p jkStuff/mm10.11.ooc jkStuff/mm10.nonBridged.lift chrom.sizes \ mm10.2bit /hive/data/genomes/staging/data/genomes/mm10 # request rsync copy from cluster admin ######################################################################### # AUTO UPDATE GENBANK (DONE - 2012-02-08 - Hiram) # examine the file: /cluster/data/genomes/genbank/data/genomes/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Mus musculus 334577 4853663 26288 # to decide which "native" mrna or ests you want to specify in genbank.conf # of course, mm10 has plenty of everything ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # 
edit etc/genbank.conf to add mm10 just after mm9 and commit to GIT # mm10 mm10.serverGenome = /hive/data/genomes/mm10/mm10.2bit mm10.clusterGenome = /scratch/data/genomes/mm10/mm10.2bit mm10.ooc = /scratch/data/genomes/mm10/mm10.11.ooc mm10.align.unplacedChroms = chr* mm10.lift = /scratch/data/genomes/mm10/mm10.nonBridged.lift mm10.refseq.mrna.native.pslCDnaFilter = ${finished.refseq.mrna.native.pslCDnaFilter} mm10.refseq.mrna.xeno.pslCDnaFilter = ${finished.refseq.mrna.xeno.pslCDnaFilter} mm10.genbank.mrna.native.pslCDnaFilter = ${finished.genbank.mrna.native.pslCDnaFilter} mm10.genbank.mrna.xeno.pslCDnaFilter = ${finished.genbank.mrna.xeno.pslCDnaFilter} mm10.genbank.est.native.pslCDnaFilter = ${finished.genbank.est.native.pslCDnaFilter} mm10.downloadDir = mm10 mm10.refseq.mrna.xeno.load = yes mm10.refseq.mrna.xeno.loadDesc = yes mm10.mgc = yes mm10.genbank.mrna.blatTargetDb = yes # mm10.ccds.ncbiBuild = 37.2 # mm10.upstreamGeneTbl = refGene # mm10.upstreamMaf = multiz30way # /hive/data/genomes/mm10/bed/multiz30way/species.list # end of section added to etc/genbank.conf git commit -m "adding mm10 definitions" genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen # long running job managed in screen cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbAlignStep -initial mm10 & # var/build/logs/2012.02.08-11:38:50.mm10.initalign.log # real 795m52.388s # load database when finished ssh hgwdev cd /cluster/data/genomes/genbank time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm10 & # logFile: var/dbload/hgwdev/logs/2012.02.09-10:05:25.dbload.log # real 114m56.461s # enable daily alignment and update of hgwdev (DONE - 2012-02-09 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add mm10 to: etc/align.dbs etc/hgwdev.dbs git commit -m "Added mm10." 
etc/align.dbs etc/hgwdev.dbs git push make etc-update ############################################################################ # running cpgIsland business (DONE - 2012-02-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/cpgIsland cd /hive/data/genomes/mm10/bed/cpgIsland # use a previous binary for this program ln -s ../../../mm9/bed/cpgIsland/hg3rdParty/cpgIslands/cpglh.exe . mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done ssh swarm cd /hive/data/genomes/mm10/bed/cpgIsland mkdir results cut -f1 ../../chrom.sizes > chr.list cat << '_EOF_' > template #LOOP ./runOne $(root1) {check out exists results/$(root1).cpg} #ENDLOOP '_EOF_' # << happy emacs # the faCount business is to make sure there is enough sequence to # work with in the fasta. cpglh.exe does not like files with too many # N's - it gets stuck. cat << '_EOF_' > runOne #!/bin/csh -fe set C = `faCount hardMaskedFa/$1.fa | egrep -v "^#seq|^total" | awk '{print $2 - $7 }'` if ( $C > 200 ) then ./cpglh.exe hardMaskedFa/$1.fa > /scratch/tmp/$1.$$ mv /scratch/tmp/$1.$$ $2 else touch $2 endif '_EOF_' # << happy emacs chmod +x runOne gensub2 chr.list single template jobList para create jobList para try para check ... 
etc para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 191s 3.19m 0.05h 0.00d 0.000 y # IO & Wait Time: 189s 3.14m 0.05h 0.00d 0.000 y # Average job time: 6s 0.10m 0.00h 0.00d # Longest finished job: 19s 0.32m 0.01h 0.00d # Submission to last job: 51s 0.85m 0.01h 0.00d # Transform cpglh output to bed + catDir results | awk '{ $2 = $2 - 1; width = $3 - $2; printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n", $1, $2, $3, $5,$6, width, $6, width*$7*0.01, 100.0*2*$6/width, $7, $9); }' > cpgIsland.bed # verify longest unique chrom name: cut -f1 cpgIsland.bed | awk '{print length($0)}' | sort -rn | head -1 # 20 # update the length 14 in the template to be 20: sed -e "s/14/20/" $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandExt.sql cd /hive/data/genomes/mm10/bed/cpgIsland hgLoadBed mm10 cpgIslandExt -tab -sqlTable=cpgIslandExt.sql cpgIsland.bed # Loaded 16023 elements of size 10 featureBits mm10 cpgIslandExt # 10495450 bases of 2652783500 (0.396%) in intersection # compare to previous: featureBits mm9 cpgIslandExt # 10496250 bases of 2620346127 (0.401%) in intersection # there should be no output from checkTableCoords: checkTableCoords -verboseBlocks -table=cpgIslandExt mm10 # cleanup, unless you want to move them to the genscan procedure below rm -fr hardMaskedFa ######################################################################### # GENSCAN GENE PREDICTIONS (DONE - 2012-02-09,10 - Hiram) mkdir /hive/data/genomes/mm10/bed/genscan cd /hive/data/genomes/mm10/bed/genscan # use a previously existing genscan binary ln -s ../../../mm9/bed/genscan/hg3rdParty . 
# create hard masked .fa files mkdir -p hardMaskedFa cut -f1 ../../chrom.sizes | while read C do echo ${C} twoBitToFa ../../mm10.2bit:$C stdout \ | maskOutFa stdin hard hardMaskedFa/${C}.fa done # Generate a list file, genome.list, of all the hard-masked contig chunks: find ./hardMaskedFa/ -type f | sed -e 's#^./##' > genome.list wc -l genome.list # 66 genome.list # Run on small cluster (more mem than big cluster). ssh encodek cd /hive/data/genomes/mm10/bed/genscan # Make 3 subdirectories for genscan to put their output files in mkdir gtf pep subopt # Create template file, template, for gensub2. For example (3-line file): cat << '_EOF_' > template #LOOP /cluster/bin/x86_64/gsBig {check in exists+ $(path1)} {check out exists gtf/$(root1).gtf} -trans={check out exists pep/$(root1).pep} -subopt={check out exists subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 #ENDLOOP '_EOF_' # << emacs gensub2 genome.list single template jobList para create jobList para try para check ... etc... para time # Crashed: 2 jobs # CPU time in finished jobs: 171336s 2855.60m 47.59h 1.98d 0.005 y # IO & Wait Time: 261s 4.35m 0.07h 0.00d 0.000 y # Average job time: 2640s 44.00m 0.73h 0.03d # Longest finished job: 22618s 376.97m 6.28h 0.26d # Submission to last job: 28682s 478.03m 7.97h 0.33d # one of the two crashed jobs was just a stray line in the jobList, # somehow a line with the string: '_EOF_' got in there. # as with mm9, chr7 did not work. Break it up into pieces mkdir /hive/data/genomes/mm10/bed/genscan/chr7Split cd /hive/data/genomes/mm10/bed/genscan/chr7Split grep chr7 ../../../jkStuff/mm10.nonBridged.lift | grep -v random \ > chr7.nonBridged.lift faToTwoBit ../hardMaskedFa/chr7.fa chr7.2bit ~/kent/src/hg/utils/lft2BitToFa.pl chr7.2bit chr7.nonBridged.lift \ | sed -e "s/chr7./chr7_/" > chr7.nonBridged.fa faSplit sequence chr7.nonBridged.fa 100 split7/chr7_ ln -s ../../../../mm9/bed/genscan/hg3rdParty . 
echo '#!/bin/sh' > cmdList.sh ls split7 | while read F do echo "/cluster/bin/x86_64/gsBig split7/${F} gtf/${F}.gtf -trans=pep/${F}.pep -subopt=subopt/${F}.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000 &" done >> cmdList.sh echo "wait" >> cmdList.sh chmod +x cmdList.sh mkdir gtf pep subopt time ./cmdList.sh > run.log 2>&1 # about 20 minutes # fix the names in the lift file cat chr7.nonBridged.lift | sed -e "s/chr7./chr7_/" > chr7.lift # the sed mangling will provide unique names for them all, but they # will not be in the strict numerical order that genscan usually produces cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout chr7.lift error stdin \ | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.gtf cat subopt/chr7_*.bed | liftUp -type=.bed stdout chr7.lift error stdin \ | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.subopt.bed cat pep/chr7_*.pep | sed -e "s/chr7_0\([0-4]\)\./chr7.\1/g" > chr7.pep cp -p chr7.pep ../pep cp -p chr7.gtf ../gtf cp -p chr7.subopt.bed ../subopt/chr7.bed find ./gtf -type f | xargs -n 256 endsInLf -zeroOk # Concatenate results: cd /hive/data/genomes/mm10/bed/genscan find ./gtf -type f | xargs cat > genscan.gtf find ./pep -type f | xargs cat > genscan.pep find ./subopt -type f | xargs cat > genscanSubopt.bed # Load into the database (without -genePredExt because no frame info): # Don't load the Pep anymore -- redundant since it's from genomic. 
ssh hgwdev cd /hive/data/genomes/mm10/bed/genscan # to construct a local file with the genePred business: gtfToGenePred genscan.gtf genscan.gp # this produces exactly the same thing and loads the table: ldHgGene -gtf mm10 genscan genscan.gtf # Read 45012 transcripts in 323529 lines in 1 files # 45012 groups 59 seqs 1 sources 1 feature types # 45012 gene predictions hgLoadBed mm10 genscanSubopt genscanSubopt.bed # Read 526572 elements of size 6 from genscanSubopt.bed featureBits mm10 genscan # 55743040 bases of 2652783500 (2.101%) in intersection # previously: featureBits mm9 genscan # 55293837 bases of 2620346127 (2.110%) in intersection ######################################################################### # CREATE MICROSAT TRACK (DONE - 2012-02-09 - Hiram) ssh hgwdev mkdir /cluster/data/genomes/mm10/bed/microsat cd /cluster/data/genomes/mm10/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed mm10 microsat microsat.bed # Read 197237 elements of size 4 from microsat.bed ######################################################################### # BLATSERVERS ENTRY (DONE - 2012-02-09 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm10", "blat13", "17832", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("mm10", "blat13", "17833", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # set default position the same as was mm9 via blat # (DONE - 2012-02-09 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chr12:56694976-56714605" where name="mm10";' \ hgcentraltest ############################################################################ # constructing downloads (DONE - 2012-02-09 - Hiram) cd 
/hive/data/genomes/mm10 # some of the smaller bits are missing the simple repeat results time makeDownloads.pl -allowMissedTrfs -workhorse=hgwdev mm10 # real 41m42.408s # edit the README files in goldenPath/*/README.txt ######################################################################### # create pushQ entry (DONE - 2012-02-09 - Hiram) # first make sure all.joiner is up to date and has this new organism # a keys check should be clean: cd ~/kent/src/hg/makeDb/schema joinerCheck -database=mm10 -keys all.joiner mkdir /hive/data/genomes/mm10/pushQ cd /hive/data/genomes/mm10/pushQ makePushQSql.pl mm10 > mm10.sql 2> stderr.out # check stderr.out for no significant problems, it is common to see: # WARNING: hgwdev does not have /gbdb/mm10/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/mm10/wib/quality.wib # WARNING: hgwdev does not have /gbdb/mm10/bbi/quality.bw # WARNING: mm10 does not have seq # WARNING: mm10 does not have extFile # *** All done! # which are not real problems # if some tables are not identified: # WARNING: Could not tell (from trackDb, all.joiner and hardcoded lists of # supporting and genbank tables) which tracks to assign these tables to: # list of tables will be in the output # put them in manually after loading the pushQ entry scp -p mm10.sql hgwbeta:/tmp ssh hgwbeta cd /tmp hgsql qapushq < mm10.sql ######################################################################### # lifting ensGene track from mm9 (DONE - 2012-02-22 - Hiram) # no gene tracks yet on mm10. 
liftUp mm9 ensGenes to mm10 # history of mm9 ensGene indicates it is the same as v64 release # with v65 being identical mkdir /hive/data/genomes/mm10/bed/ensGene cd /hive/data/genomes/mm10/bed/ensGene hgsql -N -e "select * from ensGene;" mm9 | cut -f2- > mm9.ensGene.gp liftOver -genePred mm9.ensGene.gp \ /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz \ mm10.lifted.ensGene.gp unmapped.ensGene.gp wc -l *.gp # 95651 mm10.lifted.ensGene.gp # 95883 mm9.ensGene.gp # 464 unmapped.ensGene.gp hgLoadGenePred -skipInvalid -genePredExt mm10 ensGene mm10.lifted.ensGene.gp # Warning: skipping 118 invalid genePreds # make a list of what did get loaded: hgsql -N -e "select name from ensGene;" mm10 \ | sort -u > mm10.name.ensGene.txt wc -l mm10.name.ensGene.txt # 95533 mm10.name.ensGene.txt hgsql -N -e "select * from ensPep;" mm9 | sort > mm9.ensPep.tab hgsql -N -e "select * from ensGtp;" mm9 | sort -k2,2 > mm9.ensGtp.tab hgsql -N -e "select * from ensemblToGeneName;" mm9 | sort -k1,1 \ > mm9.ensemblToGeneName.tab hgsql -N -e "select * from ensemblSource;" mm9 | sort -k1,1 \ > mm9.ensemblSource.tab # select out ensGtp records that match with the names in mm10 ensGene: join -1 2 -2 1 -o "1.1,1.2,1.3" mm9.ensGtp.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensGtp.tab wc -l *.ensGtp.tab # 95533 mm10.ensGtp.tab # 95883 mm9.ensGtp.tab # select out ensPep records that match with the names in mm10 ensGene: join -1 1 -2 2 -o "1.1,1.2" mm9.ensPep.tab mm10.ensGtp.tab \ | tr '[ ]' '[\t]' > mm10.ensPep.tab wc -l mm9.ensPep.tab mm10.ensPep.tab # 55798 mm9.ensPep.tab # 55485 mm10.ensPep.tab # select out ensemblSource records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2" mm9.ensemblSource.tab mm10.name.ensGene.txt \ | tr '[ ]' '[\t]' > mm10.ensemblSource.tab wc -l mm9.ensemblSource.tab mm10.ensemblSource.tab # 95883 mm9.ensemblSource.tab # 95533 mm10.ensemblSource.tab # select out ensemblToGeneName records that match the mm10 ensGene names: join -1 1 -2 1 -o "1.1,1.2" 
mm9.ensemblToGeneName.tab \ mm10.name.ensGene.txt | tr '[ ]' '[\t]' > mm10.ensemblToGeneName.tab wc -l mm9.ensemblToGeneName.tab mm10.ensemblToGeneName.tab # 95883 mm9.ensemblToGeneName.tab # 95533 mm10.ensemblToGeneName.tab hgPepPred mm10 tab ensPep mm10.ensPep.tab hgLoadSqlTab mm10 ensGtp ~/kent/src/hg/lib/ensGtp.sql mm10.ensGtp.tab sed -e "s/15/18/" ~/kent/src/hg/lib/ensemblSource.sql > ensemblSource.sql hgLoadSqlTab mm10 ensemblSource ensemblSource.sql mm10.ensemblSource.tab # find sizes for indexes NL=`awk '{print length($1)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` VL=`awk '{print length($2)}' mm10.ensemblToGeneName.tab | sort -rn | head -1` # construct sql definition with appropriate index sizes sed -e "s/ knownTo / ensemblToGeneName /; s/known gene/ensGen/; s/INDEX(name(12)/PRIMARY KEY(name($NL)/; s/value(12)/value($VL)/" \ ~/kent/src/hg/lib/knownTo.sql > ensemblToGeneName.sql hgLoadSqlTab mm10 ensemblToGeneName ensemblToGeneName.sql \ mm10.ensemblToGeneName.tab hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("mm10", "ensGene", "hiram", "65", now(), \ "lifted from mm9 ensGene 65", \ "lifted from mm9 ensGene 65", \ "dec2011" );' hgFixed ######################################################################### # Swap lastz Human hg19 (DONE - 2012-03-08 - Hiram) # original alignment to hg19 cd /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07 cat fb.hg19.chainMm10Link.txt # 1021265143 bases of 2897316137 (35.249%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.hg19.swap cd /hive/data/genomes/mm10/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzMm10.2012-03-07/DEF \ -swap -noLoadChainSplit -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m32.794s cat fb.mm10.chainHg19Link.txt # 1014045890 bases of 2652783500 
(38.226%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.hg19.swap lastz.hg19 ######################################################################### # LASTZ RAT Rn4 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # Specially tuned blastz parameters from Webb Miller BLASTZ_O=600 BLASTZ_E=150 BLASTZ_Y=15000 BLASTZ_T=2 BLASTZ_K=4500 BLASTZ_Q=/scratch/data/blastz/human_chimp.v2.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn4 SEQ2_DIR=/scratch/data/rn4/rn4.2bit SEQ2_LEN=/scratch/data/rn4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzRn4.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn4Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # real 129m48.444s cat fb.mm10.chainRn4Link.txt # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn4.2012-03-08 lastz.rn4 # and the swap mkdir /hive/data/genomes/rn4/bed/blastz.mm10.swap cd /hive/data/genomes/rn4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn4.2012-03-08/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 71m10.645s cat fb.rn4.chainMm10Link.txt # 1449012636 bases of 2571531505 (56.348%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Gorilla gorGor3 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 cat << '_EOF_' > DEF # gorilla vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gorilla GorGor3 SEQ2_DIR=/scratch/data/gorGor3/gorGor3.2bit SEQ2_LEN=/scratch/data/gorGor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10GorGor3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 625m17.180s cat fb.mm10.chainGorGor3Link.txt # 901610588 bases of 2652783500 (33.987%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGorGor3.2012-03-08 lastz.gorGor3 mkdir /hive/data/genomes/gorGor3/bed/blastz.mm10.swap cd /hive/data/genomes/gorGor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGorGor3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 91m3.616s cat fb.gorGor3.chainMm10Link.txt # 969595533 bases of 2822760080 (34.349%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gorGor3/bed ln -s blastz.mm10.swap lastz.mm10 
############################################################################## # LASTZ Gibbon nomLeu1 (DONE - 2012-03-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu1 SEQ2_DIR=/scratch/data/nomLeu1/nomLeu1.2bit SEQ2_LEN=/scratch/data/nomLeu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 556m26.589s cat fb.mm10.chainNomLeu1Link.txt # 905455766 bases of 2652783500 (34.132%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu1.2012-03-08 lastz.nomLeu1 mkdir /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu1.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 66m50.839s cat fb.nomLeu1.chainMm10Link.txt # 892362811 bases of 2756591777 (32.372%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Rhesus rheMac3 (DONE - 2012-03-08 - Hiram) mkdir 
/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cd /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 cat << '_EOF_' > DEF # rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac3 SEQ2_DIR=/scratch/data/rheMac3/rheMac3.2bit SEQ2_LEN=/scratch/data/rheMac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10RheMac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 596m55.622s cat fb.mm10.chainRheMac3Link.txt # 900117108 bases of 2652783500 (33.931%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRheMac3.2012-03-08 lastz.rheMac3 mkdir /hive/data/genomes/rheMac3/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac3.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 69m5.839s cat fb.rheMac3.chainMm10Link.txt # 883164992 bases of 2639145830 (33.464%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rheMac3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Baboon papHam1 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 cat << '_EOF_' > DEF # baboon vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Baboon PapHam1 SEQ2_DIR=/scratch/data/papHam1/papHam1.2bit SEQ2_LEN=/scratch/data/papHam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10PapHam1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1138m52.716s cat fb.mm10.chainPapHam1Link.txt # 890718423 bases of 2652783500 (33.577%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPapHam1.2012-03-09 lastz.papHam1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09 time doRecipBest.pl mm10 papHam1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 899m48.908s mkdir /hive/data/genomes/papHam1/bed/blastz.mm10.swap cd /hive/data/genomes/papHam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapHam1.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 548m15.438s cat fb.papHam1.chainMm10Link.txt # 878016290 bases of 2741867288 (32.023%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/papHam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # Swap ponAbe2 lastz (DONE - 2012-03-09 - Hiram) # original alignment result: cd 
/hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08 cat fb.ponAbe2.chainMm10Link.txt # 946932454 bases of 3093572278 (30.610%) in intersection # and the swap mkdir /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap cd /hive/data/genomes/mm10/bed/blastz.ponAbe2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/ponAbe2/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m38.550s cat fb.mm10.chainPonAbe2Link.txt # 915093866 bases of 2652783500 (34.496%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s blastz.ponAbe2.swap lastz.ponAbe2 ############################################################################## # LASTZ Squirrel monkey saiBol1 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 cat << '_EOF_' > DEF # squirrel monkey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Squirrel monkey SaiBol1 SEQ2_DIR=/hive/data/genomes/saiBol1/saiBol1.2bit SEQ2_LEN=/hive/data/genomes/saiBol1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10SaiBol1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 538m42.643s cat fb.mm10.chainSaiBol1Link.txt # 857872391 bases of 2652783500 (32.339%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln 
-s lastzSaiBol1.2012-03-09 lastz.saiBol1 mkdir /hive/data/genomes/saiBol1/bed/blastz.mm10.swap cd /hive/data/genomes/saiBol1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSaiBol1.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 59m36.306s cat fb.saiBol1.chainMm10Link.txt # 838457857 bases of 2477131095 (33.848%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/saiBol1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Marmoset calJac3 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 cat << '_EOF_' > DEF # marmoset vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Marmoset monkey CalJac3 SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit SEQ2_LEN=/scratch/data/calJac3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10CalJac3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 529m39.657s cat fb.mm10.chainCalJac3Link.txt # 860830771 bases of 2652783500 (32.450%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCalJac3.2012-03-09 lastz.calJac3 mkdir /hive/data/genomes/calJac3/bed/blastz.mm10.swap cd 
/hive/data/genomes/calJac3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCalJac3.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 67m21.635s cat fb.calJac3.chainMm10Link.txt # 861565545 bases of 2752505800 (31.301%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/calJac3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Chimp PanTro4 (DONE - 2012-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 cd /hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 cat << '_EOF_' > DEF # chimp vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Chimp PanTro4 SEQ2_DIR=/hive/data/genomes/panTro4/panTro4.2bit SEQ2_LEN=/hive/data/genomes/panTro4/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10PanTro4 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 682m53.046s cat fb.mm10.chainPanTro4Link.txt # 919836299 bases of 2652783500 (34.674%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPanTro4.2012-03-09 lastz.panTro4 mkdir /hive/data/genomes/panTro4/bed/blastz.mm10.swap cd /hive/data/genomes/panTro4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzPanTro4.2012-03-09/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 73m23.855s cat fb.panTro4.chainMm10Link.txt # 926540065 bases of 2902338967 (31.924%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/panTro4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tarsier tarSyr1 (DONE - 2012-03-10 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 cat << '_EOF_' > DEF # tarsier vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tarsier TarSyr1 SEQ2_DIR=/scratch/data/tarSyr1/tarSyr1.2bit SEQ2_LEN=/scratch/data/tarSyr1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10TarSyr1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2457m45.759s cat fb.mm10.chainTarSyr1Link.txt # 651517559 bases of 2652783500 (24.560%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTarSyr1.2012-03-10 lastz.tarSyr1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10 time doRecipBest.pl mm10 tarSyr1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1176m19.336s mkdir /hive/data/genomes/tarSyr1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tarSyr1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr1.2012-03-10/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 746m30.852s cat fb.tarSyr1.chainMm10Link.txt # 691746721 bases of 2768536343 (24.986%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tarSyr1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # Swap chimp panTro3 to Mm10 (DONE - 2012-03-12 - Hiram) # original alignment on panTro3 cd /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08 cat fb.panTro3.chainMm10Link.txt # 929073028 bases of 2900529764 (32.031%) in intersection # and this swap: mkdir /hive/data/genomes/mm10/bed/blastz.panTro3.swap cd /hive/data/genomes/mm10/bed/blastz.panTro3.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/panTro3/bed/lastzMm10.2012-03-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 68m46.408s cat fb.mm10.chainPanTro3Link.txt # 922491113 bases of 2652783500 (34.774%) in intersection ############################################################################## # LASTZ bushbaby otoGar3 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # bushbaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: bushbaby OtoGar3 
SEQ2_DIR=/hive/data/genomes/otoGar3/otoGar3.2bit SEQ2_LEN=/hive/data/genomes/otoGar3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10OtoGar3 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 757m32.438s cat fb.mm10.chainOtoGar3Link.txt # 790408953 bases of 2652783500 (29.795%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOtoGar3.2012-03-13 lastz.otoGar3 mkdir /hive/data/genomes/otoGar3/bed/blastz.mm10.swap cd /hive/data/genomes/otoGar3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOtoGar3.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 61m18.952s cat fb.otoGar3.chainMm10Link.txt # 776907989 bases of 2359530453 (32.926%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/otoGar3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ mouse lemur micMur1 (DONE - 2012-03-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # mouse lemur vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: mouse lemur MicMur1 
SEQ2_DIR=/scratch/data/micMur1/micMur1.2bit SEQ2_LEN=/scratch/data/micMur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10MicMur1 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 687m41.863s cat fb.mm10.chainMicMur1Link.txt # 706607444 bases of 2652783500 (26.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMicMur1.2012-03-13 lastz.micMur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13 time doRecipBest.pl mm10 micMur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 877m18.105s mkdir /hive/data/genomes/micMur1/bed/blastz.mm10.swap cd /hive/data/genomes/micMur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMicMur1.2012-03-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 116m54.411s cat fb.micMur1.chainMm10Link.txt # 696025630 bases of 1852394361 (37.574%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/micMur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ squirrel speTri2 (DONE - 2012-03-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # squirrel vs mouse 
BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: squirrel SpeTri2 SEQ2_DIR=/hive/data/genomes/speTri2/speTri2.2bit SEQ2_LEN=/hive/data/genomes/speTri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10SpeTri2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 935m27.893s cat fb.mm10.chainSpeTri2Link.txt # 907715417 bases of 2652783500 (34.217%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSpeTri2.2012-03-15 lastz.speTri2 mkdir /hive/data/genomes/speTri2/bed/blastz.mm10.swap cd /hive/data/genomes/speTri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSpeTri2.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m41.819s # real 116m54.411s cat fb.speTri2.chainMm10Link.txt # 906956512 bases of 2311060300 (39.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/speTri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ kangaroo rat dipOrd1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S mm10DipOrd1 mkdir /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to 
something under 100,000 cat << '_EOF_' > DEF # kangaroo rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: kangaroo rat DipOrd1 SEQ2_DIR=/scratch/data/dipOrd1/dipOrd1.2bit SEQ2_LEN=/scratch/data/dipOrd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 867m19.972s cat fb.mm10.chainDipOrd1Link.txt # 516232678 bases of 2652783500 (19.460%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDipOrd1.2012-03-15 lastz.dipOrd1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15 time doRecipBest.pl mm10 dipOrd1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 914m20.405s mkdir /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap cd /hive/data/genomes/dipOrd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDipOrd1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 115m1.497s cat fb.dipOrd1.chainMm10Link.txt # 507580668 bases of 1844961421 (27.512%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dipOrd1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Naked mole-rat hetGla1 (DONE - 2012-03-15 - Hiram) # establish a screen to control this job screen -S 
mm10HetGla1 mkdir /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 cd /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # Naked mole-rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Naked mole-rat HetGla1 SEQ2_DIR=/scratch/data/hetGla1/hetGla1.2bit SEQ2_LEN=/scratch/data/hetGla1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 745m15.097s cat fb.mm10.chainHetGla1Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla1.2012-03-15 lastz.hetGla1 mkdir /hive/data/genomes/hetGla1/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla1.2012-03-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m26.471s cat fb.hetGla1.chainMm10Link.txt # 885195861 bases of 2430064805 (36.427%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ horse equCab2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it 
is screen -S mm10EquCab2 mkdir /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # horse vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: horse EquCab2 SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit SEQ2_LEN=/scratch/data/equCab2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 566m34.024s cat fb.mm10.chainEquCab2Link.txt # 912967841 bases of 2652783500 (34.415%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEquCab2.2012-03-16 lastz.equCab2 mkdir /hive/data/genomes/equCab2/bed/blastz.mm10.swap cd /hive/data/genomes/equCab2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 87m2.261s cat fb.equCab2.chainMm10Link.txt # 901995882 bases of 2428790173 (37.138%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/equCab2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ guinea pig cavPor3 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is 
screen -S mm10CavPor3 mkdir /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # guinea pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: guinea pig CavPor3 SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1523m35.729s cat fb.mm10.chainCavPor3Link.txt # 754642254 bases of 2652783500 (28.447%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCavPor3.2012-03-16 lastz.cavPor3 mkdir /hive/data/genomes/cavPor3/bed/blastz.mm10.swap cd /hive/data/genomes/cavPor3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavPor3.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 80m23.870s cat fb.cavPor3.chainMm10Link.txt # 775452752 bases of 2663369733 (29.115%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cavPor3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what 
it is screen -S mm10VicPac1 mkdir /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # alpaca vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac1 SEQ2_DIR=/scratch/data/vicPac1/vicPac1.2bit SEQ2_LEN=/scratch/data/vicPac1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2049m38.674s cat fb.mm10.chainVicPac1Link.txt # 600477253 bases of 2652783500 (22.636%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzVicPac1.2012-03-16 lastz.vicPac1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16 time doRecipBest.pl mm10 vicPac1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 824m37.107s mkdir /hive/data/genomes/vicPac1/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 159m21.952s cat fb.vicPac1.chainMm10Link.txt # 610885692 bases of 1922910435 (31.769%) in intersection # set sym link to indicate this is the lastz for this genome: cd
/hive/data/genomes/vicPac1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru1 mkdir /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru1 SEQ2_DIR=/scratch/data/turTru1/turTru1.2bit SEQ2_LEN=/scratch/data/turTru1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1484m14.609s cat fb.mm10.chainTurTru1Link.txt # 762961671 bases of 2652783500 (28.761%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru1.2012-03-16 lastz.turTru1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16 time doRecipBest.pl mm10 turTru1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 733m37.272s mkdir /hive/data/genomes/turTru1/bed/blastz.mm10.swap cd /hive/data/genomes/turTru1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 79m38.703s cat fb.turTru1.chainMm10Link.txt # 744359707 bases of 2298444090 (32.385%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tree shrew tupBel1 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TupBel1 mkdir /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tree shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tree shrew TupBel1 SEQ2_DIR=/scratch/data/tupBel1/tupBel1.2bit SEQ2_LEN=/scratch/data/tupBel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1731m30.449s cat fb.mm10.chainTupBel1Link.txt # 524337666 bases of 2652783500 (19.766%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTupBel1.2012-03-16 lastz.tupBel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16 time doRecipBest.pl mm10 tupBel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1090m30.429s mkdir /hive/data/genomes/tupBel1/bed/blastz.mm10.swap cd 
/hive/data/genomes/tupBel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupBel1.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 136m7.163s cat fb.tupBel1.chainMm10Link.txt # 537379661 bases of 2137225476 (25.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tupBel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pig susScr2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr2 mkdir /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr2 SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit SEQ2_LEN=/scratch/data/susScr2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1272m57.727s cat fb.mm10.chainSusScr2Link.txt # 616716602 bases of 2652783500 (23.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr2.2012-03-16 lastz.susScr2 mkdir /hive/data/genomes/susScr2/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m47.465s cat fb.susScr2.chainMm10Link.txt # 656498040 bases of 2231298548 (29.422%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rabbit oryCun2 (DONE - 2012-03-16 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryCun2 mkdir /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 cd /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rabbit vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rabbit OryCun2 SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1412m58.641s cat fb.mm10.chainOryCun2Link.txt # 669778489 bases of 2652783500 (25.248%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryCun2.2012-03-16 lastz.oryCun2 mkdir /hive/data/genomes/oryCun2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oryCun2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryCun2.2012-03-16/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 64m40.959s cat fb.oryCun2.chainMm10Link.txt # 668643668 bases of 2604023284 (25.677%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryCun2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sloth choHof1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChoHof1 mkdir /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sloth vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sloth ChoHof1 SEQ2_DIR=/scratch/data/choHof1/choHof1.2bit SEQ2_LEN=/scratch/data/choHof1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # rebooted hgwdev during first swarm run, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 & # Elapsed time: 65m26s cat 
fb.mm10.chainChoHof1Link.txt # 477994856 bases of 2652783500 (18.019%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChoHof1.2012-03-19 lastz.choHof1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19 time doRecipBest.pl mm10 choHof1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m56.481s mkdir /hive/data/genomes/choHof1/bed/blastz.mm10.swap cd /hive/data/genomes/choHof1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChoHof1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 1613m3.348s cat fb.choHof1.chainMm10Link.txt # 488047499 bases of 2060419685 (23.687%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/choHof1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ megabat pteVam1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10PteVam1 mkdir /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # megabat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: megabat PteVam1 SEQ2_DIR=/scratch/data/pteVam1/pteVam1.2bit SEQ2_LEN=/scratch/data/pteVam1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << 
happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1843m33.186s cat fb.mm10.chainPteVam1Link.txt # 725414059 bases of 2652783500 (27.345%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPteVam1.2012-03-19 lastz.pteVam1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19 time doRecipBest.pl mm10 pteVam1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 743m57.901s mkdir /hive/data/genomes/pteVam1/bed/blastz.mm10.swap cd /hive/data/genomes/pteVam1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPteVam1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 75m35s cat fb.pteVam1.chainMm10Link.txt # 710519911 bases of 1839436660 (38.627%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/pteVam1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ elephant loxAfr3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LoxAfr3 mkdir /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # elephant vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: elephant LoxAfr3 SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1848m11.111s cat fb.mm10.chainLoxAfr3Link.txt # 685029753 bases of 2652783500 (25.823%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLoxAfr3.2012-03-19 lastz.loxAfr3 mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap cd /hive/data/genomes/loxAfr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLoxAfr3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 73m14s cat fb.loxAfr3.chainMm10Link.txt # 674108752 bases of 3118565340 (21.616%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/loxAfr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat4 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat4 mkdir /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # 
QUERY: cat FelCat4 SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit SEQ2_LEN=/scratch/data/felCat4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2010m48.963s cat fb.mm10.chainFelCat4Link.txt # 637531191 bases of 2652783500 (24.033%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat4.2012-03-19 lastz.felCat4 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19 time doRecipBest.pl mm10 felCat4 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1135m12.207s mkdir /hive/data/genomes/felCat4/bed/blastz.mm10.swap cd /hive/data/genomes/felCat4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat4.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 88m12s cat fb.felCat4.chainMm10Link.txt # 616167655 bases of 1990635005 (30.953%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat4/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ panda ailMel1 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AilMel1 mkdir /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 
cat << '_EOF_' > DEF # panda vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: panda AilMel1 SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium # real 1914m15.921s cat fb.mm10.chainAilMel1Link.txt # 821806974 bases of 2652783500 (30.979%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAilMel1.2012-03-19 lastz.ailMel1 mkdir /hive/data/genomes/ailMel1/bed/blastz.mm10.swap cd /hive/data/genomes/ailMel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAilMel1.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 65m50s cat fb.ailMel1.chainMm10Link.txt # 798482731 bases of 2245312831 (35.562%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ailMel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dog canFam3 (DONE - 2012-03-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10CanFam3 mkdir /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 cd /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something 
under 100,000 cat << '_EOF_' > DEF # dog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dog CanFam3 SEQ2_DIR=/hive/data/genomes/canFam3/canFam3.2bit SEQ2_LEN=/hive/data/genomes/canFam3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # forgot to copy to the log time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1883m21.850s cat fb.mm10.chainCanFam3Link.txt # 773114990 bases of 2652783500 (29.144%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCanFam3.2012-03-19 lastz.canFam3 mkdir /hive/data/genomes/canFam3/bed/blastz.mm10.swap cd /hive/data/genomes/canFam3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCanFam3.2012-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # Elapsed time: 63m22s cat fb.canFam3.chainMm10Link.txt # 756678903 bases of 2392715236 (31.624%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/canFam3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov2 mkdir /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable #
number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov2 SEQ2_DIR=/scratch/data/dasNov2/dasNov2.2bit SEQ2_LEN=/scratch/data/dasNov2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=800 BASE=/hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2655m49.904s cat fb.mm10.chainDasNov2Link.txt # 451070039 bases of 2652783500 (17.004%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov2.2012-03-21 lastz.dasNov2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21 time doRecipBest.pl mm10 dasNov2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1163m1.023s mkdir /hive/data/genomes/dasNov2/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 201m9.701s cat fb.dasNov2.chainMm10Link.txt # 461142417 bases of 2371493872 (19.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ microbat myoLuc2 (DONE - 2012-03-21 - Hiram) # establish a screen to control this 
job with a name to indicate what it is screen -S mm10MyoLuc2 mkdir /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # microbat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: microbat MyoLuc2 SEQ2_DIR=/scratch/data/myoLuc2/myoLuc2.2bit SEQ2_LEN=/scratch/data/myoLuc2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1033m38.184s cat fb.mm10.chainMyoLuc2Link.txt # 646292112 bases of 2652783500 (24.363%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMyoLuc2.2012-03-21 lastz.myoLuc2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21 time doRecipBest.pl mm10 myoLuc2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 29m16.249s mkdir /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap cd /hive/data/genomes/myoLuc2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMyoLuc2.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 54m5.607s cat fb.myoLuc2.chainMm10Link.txt # 661704053 bases of 1966419868 (33.650%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/myoLuc2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cow bosTau7 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10BosTau7 mkdir /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau7 SEQ2_DIR=/scratch/data/bosTau7/bosTau7.2bit SEQ2_LEN=/scratch/data/bosTau7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1151m20.445s cat fb.mm10.chainBosTau7Link.txt # 696498363 bases of 2652783500 (26.255%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau7.2012-03-21 lastz.bosTau7 mkdir /hive/data/genomes/bosTau7/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau7.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 77m58.759s cat fb.bosTau7.chainMm10Link.txt # 711923052 bases of 2804673174 (25.383%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/bosTau7/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ sheep oviAri1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OviAri1 mkdir /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # sheep vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: sheep OviAri1 SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 892m33.068s cat fb.mm10.chainOviAri1Link.txt # 406955832 bases of 2652783500 (15.341%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOviAri1.2012-03-21 lastz.oviAri1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21 time doRecipBest.pl mm10 oviAri1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1183m43.488s mkdir /hive/data/genomes/oviAri1/bed/blastz.mm10.swap cd /hive/data/genomes/oviAri1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOviAri1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 30m5.554s cat fb.oviAri1.chainMm10Link.txt # 383499897 bases of 1201271277 (31.925%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oviAri1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ rock hyrax proCap1 (DONE - 2012-03-21 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ProCap1 mkdir /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # rock hyrax vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: rock hyrax ProCap1 SEQ2_DIR=/scratch/data/proCap1/proCap1.2bit SEQ2_LEN=/scratch/data/proCap1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=600 BASE=/hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2859m51.317s cat fb.mm10.chainProCap1Link.txt # 401804601 bases of 2652783500 (15.147%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzProCap1.2012-03-21 lastz.proCap1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21 time doRecipBest.pl mm10 proCap1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1083m57.139s mkdir /hive/data/genomes/proCap1/bed/blastz.mm10.swap cd 
/hive/data/genomes/proCap1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCap1.2012-03-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 165m10.285s cat fb.proCap1.chainMm10Link.txt # 390409777 bases of 2407847681 (16.214%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/proCap1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ pika ochPri2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OchPri2 mkdir /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pika vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pika OchPri2 SEQ2_DIR=/scratch/data/ochPri2/ochPri2.2bit SEQ2_LEN=/scratch/data/ochPri2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2578m43.648s cat fb.mm10.chainOchPri2Link.txt # 385766335 bases of 2652783500 (14.542%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOchPri2.2012-03-22 lastz.ochPri2 # better to have reciprocal best for this one since it is low 
coverage: cd /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22 time doRecipBest.pl mm10 ochPri2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1036m29.080s mkdir /hive/data/genomes/ochPri2/bed/blastz.mm10.swap cd /hive/data/genomes/ochPri2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOchPri2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 103m34.369s cat fb.ochPri2.chainMm10Link.txt # 382959642 bases of 1923624051 (19.908%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ochPri2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ hedgehog eriEur1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EriEur1 mkdir /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # hedgehog vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: hedgehog EriEur1 SEQ2_DIR=/scratch/data/eriEur1/eriEur1.2bit SEQ2_LEN=/scratch/data/eriEur1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3006m41.470s cat fb.mm10.chainEriEur1Link.txt # 261447061 bases 
of 2652783500 (9.856%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEriEur1.2012-03-22 lastz.eriEur1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22 time doRecipBest.pl mm10 eriEur1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1171m41.349s mkdir /hive/data/genomes/eriEur1/bed/blastz.mm10.swap cd /hive/data/genomes/eriEur1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEriEur1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 241m24.183s cat fb.eriEur1.chainMm10Link.txt # 261605017 bases of 2133134836 (12.264%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/eriEur1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tenrec echTel1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EchTel1 mkdir /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tenrec vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tenrec EchTel1 SEQ2_DIR=/scratch/data/echTel1/echTel1.2bit SEQ2_LEN=/scratch/data/echTel1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl 
-verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 3047m28.723s cat fb.mm10.chainEchTel1Link.txt # 290413150 bases of 2652783500 (10.947%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEchTel1.2012-03-22 lastz.echTel1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22 time doRecipBest.pl mm10 echTel1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1201m39.275s mkdir /hive/data/genomes/echTel1/bed/blastz.mm10.swap cd /hive/data/genomes/echTel1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEchTel1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 269m52.619s cat fb.echTel1.chainMm10Link.txt # 298082139 bases of 2111581369 (14.117%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ shrew sorAra1 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SorAra1 mkdir /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: shrew SorAra1 
SEQ2_DIR=/scratch/data/sorAra1/sorAra1.2bit SEQ2_LEN=/scratch/data/sorAra1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2600m22.528s cat fb.mm10.chainSorAra1Link.txt # 248874412 bases of 2652783500 (9.382%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSorAra1.2012-03-22 lastz.sorAra1 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22 time doRecipBest.pl mm10 sorAra1 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1074m22.651s mkdir /hive/data/genomes/sorAra1/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra1.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 141m38.806s cat fb.sorAra1.chainMm10Link.txt # 248692550 bases of 1832864697 (13.569%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra1/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ wallaby macEug2 (DONE - 2012-03-22 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MacEug2 mkdir /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF 
# wallaby vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: wallaby MacEug2 SEQ2_DIR=/scratch/data/macEug2/macEug2.2bit SEQ2_LEN=/scratch/data/macEug2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2893m50.341s cat fb.mm10.chainMacEug2Link.txt # 115481931 bases of 2652783500 (4.353%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMacEug2.2012-03-22 lastz.macEug2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22 time doRecipBest.pl mm10 macEug2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 1032m58.798s mkdir /hive/data/genomes/macEug2/bed/blastz.mm10.swap cd /hive/data/genomes/macEug2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMacEug2.2012-03-22/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 130m7.404s cat fb.macEug2.chainMm10Link.txt # 112811810 bases of 2536076957 (4.448%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/macEug2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ RAT Rn5 (DONE - 2012-03-23 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Rn5 mkdir 
/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cd /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 cat << '_EOF_' > DEF # mouse vs rat BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # From tuning experiment between mouse chr12:15000000-25000000 and # rat chr6:38000000-48000000 BLASTZ_O=600 BLASTZ_E=55 BLASTZ_Y=5000 BLASTZ_T=2 BLASTZ_K=3000 BLASTZ_L=3000 BLASTZ_Q=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/mouse_rat_2.q BLASTZ_ABRIDGE_REPEATS=1 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_SMSK=/scratch/data/mm10/notInRat SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn5 SEQ2_DIR=/hive/data/genomes/rn5/rn5.2bit SEQ2_LEN=/hive/data/genomes/rn5/chrom.sizes SEQ2_SMSK=/hive/data/genomes/rn5/bed/linSpecRep/notInMouse SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzRn5.2012-03-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S rn5Mm10 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > do.log 2>&1 & # broken lastz run when SMSK files did not exist for some of the # Rn5 contigs - made empty files for those and completed, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > cat.log 2>&1 & # real 285m28.458s cat fb.mm10.chainRn5Link.txt # 1786721927 bases of 2652783500 (67.353%) in intersection # FYI: rn4 was: # 1449612208 bases of 2652783500 (54.645%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzRn5.2012-03-23 lastz.rn5 # and the swap mkdir /hive/data/genomes/rn5/bed/blastz.mm10.swap cd 
/hive/data/genomes/rn5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn5.2012-03-23/DEF \ -swap -bigClusterHub=swarm -chainMinScore=5000 -chainLinearGap=medium \ -noLoadChainSplit -syntenicNet -workhorse=hgwdev \ -smallClusterHub=encodek > swap.log 2>&1 & # real 121m21.029s cat fb.rn5.chainMm10Link.txt # 1808154679 bases of 2572853723 (70.278%) in intersection # FYI, rn4 was: # 1449012636 bases of 2571531505 (56.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/rn5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ Manatee triMan1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TriMan1 mkdir /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # manatee vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: manatee TriMan1 SEQ2_DIR=/hive/data/genomes/triMan1/triMan1.2bit SEQ2_LEN=/hive/data/genomes/triMan1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1455m24.772s cat fb.mm10.chainTriMan1Link.txt # 704207702 bases of 2652783500 (26.546%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s 
lastzTriMan1.2012-03-29 lastz.triMan1 mkdir /hive/data/genomes/triMan1/bed/blastz.mm10.swap cd /hive/data/genomes/triMan1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTriMan1.2012-03-29/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m33.530s cat fb.triMan1.chainMm10Link.txt # 682557025 bases of 2769099677 (24.649%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/triMan1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Opossum monDom5 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MonDom5 mkdir /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. 
opossum BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Opossum monDom5 SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit SEQ2_LEN=/scratch/data/monDom5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # Can't do this when there is only a single small set of chroms time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1792m40.071s cat fb.mm10.chainMonDom5Link.txt # 254245903 bases of 2652783500 (9.584%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMonDom5.2012-03-29 lastz.monDom5 # and for the swap mkdir /hive/data/genomes/monDom5/bed/blastz.mm10.swap cd /hive/data/genomes/monDom5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMonDom5.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 73m49.230s cat fb.monDom5.chainMm10Link.txt # 252291401 bases of 3501660299 (7.205%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/monDom5/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz Tasmanian Devil sarHar1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SarHar1 mkdir
/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. tasmanian devil BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tasmanian devil sarHar1 SEQ2_DIR=/scratch/data/sarHar1/sarHar1.2bit SEQ2_LEN=/scratch/data/sarHar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1208m55.866s cat fb.mm10.chainSarHar1Link.txt # 224935746 bases of 2652783500 (8.479%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSarHar1.2012-03-29 lastz.sarHar1 # and for the swap mkdir /hive/data/genomes/sarHar1/bed/blastz.mm10.swap cd /hive/data/genomes/sarHar1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSarHar1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 45m53.015s cat fb.sarHar1.chainMm10Link.txt # 231249436 bases of 2931539702 (7.888%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sarHar1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz budgerigar melUnd1 (DONE - 2012-03-29 
- Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelUnd1 mkdir /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. budgerigar BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: budgerigar melUnd1 SEQ2_DIR=/hive/data/genomes/melUnd1/melUnd1.2bit SEQ2_LEN=/hive/data/genomes/melUnd1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 883m58.198s cat fb.mm10.chainMelUnd1Link.txt # 95217653 bases of 2652783500 (3.589%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelUnd1.2012-03-29 lastz.melUnd1 # and for the swap mkdir /hive/data/genomes/melUnd1/bed/blastz.mm10.swap cd /hive/data/genomes/melUnd1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelUnd1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 9m9.260s cat fb.melUnd1.chainMm10Link.txt # 79867911 bases of 1086614815 (7.350%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melUnd1/bed ln -s blastz.mm10.swap 
lastz.mm10 ######################################################################### # lastz platypus ornAna1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OrnAna1 mkdir /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. platypus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: platypus ornAna1 SEQ2_DIR=/scratch/data/ornAna1/ornAna1.2bit SEQ2_LEN=/scratch/data/ornAna1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=400 BASE=/hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1264m1.056s cat fb.mm10.chainOrnAna1Link.txt # 141873792 bases of 2652783500 (5.348%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOrnAna1.2012-03-29 lastz.ornAna1 # and for the swap mkdir /hive/data/genomes/ornAna1/bed/blastz.mm10.swap cd /hive/data/genomes/ornAna1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOrnAna1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 49m45.308s cat fb.ornAna1.chainMm10Link.txt # 135101083 bases of 1842236818 (7.334%) in intersection # set 
sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/ornAna1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz turtle chrPic1 (DONE - 2012-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ChrPic1 mkdir /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cd /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 cat << '_EOF_' > DEF # Mouse vs. turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turtle chrPic1 SEQ2_DIR=/hive/data/genomes/chrPic1/chrPic1.2bit SEQ2_LEN=/hive/data/genomes/chrPic1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 1243m2.518s cat fb.mm10.chainChrPic1Link.txt # 125499965 bases of 2652783500 (4.731%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzChrPic1.2012-03-29 lastz.chrPic1 # and for the swap mkdir /hive/data/genomes/chrPic1/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic1.2012-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 
& # real 19m26.835s cat fb.chrPic1.chainMm10Link.txt # 118436838 bases of 2158289746 (5.488%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/chrPic1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz chicken galGal4 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GalGal4 mkdir /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal4 SEQ2_DIR=/hive/data/genomes/galGal4/galGal4.2bit SEQ2_LEN=/hive/data/genomes/galGal4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 109m21.068s # broken swarm cluster, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 57m24.155s cat fb.mm10.chainGalGal4Link.txt # 97510773 bases of 2652783500 (3.676%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGalGal4.2012-04-02 
lastz.galGal4 # and for the swap mkdir /hive/data/genomes/galGal4/bed/blastz.mm10.swap cd /hive/data/genomes/galGal4/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal4.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 95m50.996s cat fb.galGal4.chainMm10Link.txt # 83660034 bases of 1032854810 (8.100%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/galGal4/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebra finch taeGut1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TaeGut1 mkdir /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebra finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebra finch taeGut1 SEQ2_DIR=/scratch/data/taeGut1/taeGut1.2bit SEQ2_LEN=/scratch/data/taeGut1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=5 BASE=/hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 106m11.612s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat 
`pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 29m11.090s cat fb.mm10.chainTaeGut1Link.txt # 95469341 bases of 2652783500 (3.599%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTaeGut1.2012-04-02 lastz.taeGut1 # and for the swap mkdir /hive/data/genomes/taeGut1/bed/blastz.mm10.swap cd /hive/data/genomes/taeGut1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTaeGut1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 37m17.483s cat fb.taeGut1.chainMm10Link.txt # 89312133 bases of 1222864691 (7.304%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/taeGut1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz lizard anoCar2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10AnoCar2 mkdir /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
lizard BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: lizard anoCar2 SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 103m17.133s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 43m2.183s cat fb.mm10.chainAnoCar2Link.txt # 88356459 bases of 2652783500 (3.331%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzAnoCar2.2012-04-02 lastz.anoCar2 # and for the swap mkdir /hive/data/genomes/anoCar2/bed/blastz.mm10.swap cd /hive/data/genomes/anoCar2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAnoCar2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 97m50.599s cat fb.anoCar2.chainMm10Link.txt # 84865552 bases of 1701353770 (4.988%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/anoCar2/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz turkey melGal1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelGal1 mkdir /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turkey melGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 101m17.902s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 20m47.771s cat fb.mm10.chainMelGal1Link.txt # 93132953 bases of 2652783500 (3.511%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzMelGal1.2012-04-02 lastz.melGal1 # and for the swap mkdir /hive/data/genomes/melGal1/bed/blastz.mm10.swap cd /hive/data/genomes/melGal1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMelGal1.2012-04-02/DEF \ -workhorse=hgwdev 
-smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 88m39.591s cat fb.melGal1.chainMm10Link.txt # 76848161 bases of 935922386 (8.211%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/melGal1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz frog xenTro3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenTro3 mkdir /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: frog xenTro3 SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 99m10.611s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 37m52.678s cat fb.mm10.chainXenTro3Link.txt # 82900338 bases of 2652783500 (3.125%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzXenTro3.2012-04-02 lastz.xenTro3 # and for the swap mkdir /hive/data/genomes/xenTro3/bed/blastz.mm10.swap cd /hive/data/genomes/xenTro3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenTro3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 53m19.485s cat fb.xenTro3.chainMm10Link.txt # 90345130 bases of 1358334882 (6.651%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/xenTro3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz coelacanth latCha1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10LatCha1 mkdir /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
coelacanth BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: coelacanth latCha1 SEQ2_DIR=/hive/data/genomes/latCha1/latCha1.2bit SEQ2_LEN=/hive/data/genomes/latCha1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 95m34.477s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 214m7.324s cat fb.mm10.chainLatCha1Link.txt # 72036116 bases of 2652783500 (2.715%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzLatCha1.2012-04-02 lastz.latCha1 # and for the swap mkdir /hive/data/genomes/latCha1/bed/blastz.mm10.swap cd /hive/data/genomes/latCha1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzLatCha1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 14m44.600s cat fb.latCha1.chainMm10Link.txt # 73798131 bases of 2183592768 (3.380%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/latCha1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz atlantic cod gadMor1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GadMor1 mkdir /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. atlantic cod BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: atlantic cod gadMor1 SEQ2_DIR=/hive/data/genomes/gadMor1/gadMor1.2bit SEQ2_LEN=/hive/data/genomes/gadMor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=700 BASE=/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 91m23.642s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 39m41.194s cat fb.mm10.chainGadMor1Link.txt # 45795692 bases of 2652783500 (1.726%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGadMor1.2012-04-02 lastz.gadMor1 # and for the swap mkdir /hive/data/genomes/gadMor1/bed/blastz.mm10.swap cd /hive/data/genomes/gadMor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzGadMor1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 62m58.963s cat fb.gadMor1.chainMm10Link.txt # 41406507 bases of 608038597 (6.810%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gadMor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz nile tilapia oreNil1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil1 mkdir /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil1 SEQ2_DIR=/scratch/data/oreNil1/oreNil1.2bit SEQ2_LEN=/scratch/data/oreNil1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 89m6.727s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 24m3.960s cat fb.mm10.chainOreNil1Link.txt # 
51915568 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil1.2012-04-02 lastz.oreNil1 # and for the swap mkdir /hive/data/genomes/oreNil1/bed/blastz.mm10.swap cd /hive/data/genomes/oreNil1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 90m55.298s cat fb.oreNil1.chainMm10Link.txt # 49709461 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz stickleback gasAcu1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10GasAcu1 mkdir /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
stickleback BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: stickleback gasAcu1 SEQ2_DIR=/scratch/data/gasAcu1/gasAcu1.2bit SEQ2_LEN=/scratch/data/gasAcu1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=1 BASE=/hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 87m5.963s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 9m49.199s cat fb.mm10.chainGasAcu1Link.txt # 53469711 bases of 2652783500 (2.016%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGasAcu1.2012-04-02 lastz.gasAcu1 # and for the swap mkdir /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap cd /hive/data/genomes/gasAcu1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGasAcu1.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 12m58.072s cat fb.gasAcu1.chainMm10Link.txt # 48802831 bases of 446627861 (10.927%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/gasAcu1/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz fugu fr3 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10Fr3 mkdir /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. fugu BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: fugu fr3 SEQ2_DIR=/scratch/data/fr3/fr3.2bit SEQ2_LEN=/scratch/data/fr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFr3.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 84m37.070s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 171m16.627s cat fb.mm10.chainFr3Link.txt # 47460021 bases of 2652783500 (1.789%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFr3.2012-04-02 lastz.fr3 # and for the swap mkdir /hive/data/genomes/fr3/bed/blastz.mm10.swap cd /hive/data/genomes/fr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFr3.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap 
-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m13.151s cat fb.fr3.chainMm10Link.txt # 42586058 bases of 350961831 (12.134%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/fr3/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz tetraodon tetNig2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TetNig2 mkdir /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. tetraodon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tetraodon tetNig2 SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # NOTE: elapsed time of this first run was not recorded; continuing from the cat step (presumably the same broken swarm as the other 2012-04-02 runs): time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 13m21.638s cat fb.mm10.chainTetNig2Link.txt # 46035322 bases of 2652783500 (1.735%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTetNig2.2012-04-02 lastz.tetNig2 
# and for the swap mkdir /hive/data/genomes/tetNig2/bed/blastz.mm10.swap cd /hive/data/genomes/tetNig2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTetNig2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m24.115s cat fb.tetNig2.chainMm10Link.txt # 41242926 bases of 302314788 (13.642%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/tetNig2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz zebrafish danRer7 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DanRer7 mkdir /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebrafish danRer7 SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit SEQ2_LEN=/scratch/data/danRer7/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 80m32.118s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ 
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 40m27.762s cat fb.mm10.chainDanRer7Link.txt # 69028912 bases of 2652783500 (2.602%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDanRer7.2012-04-02 lastz.danRer7 # and for the swap mkdir /hive/data/genomes/danRer7/bed/blastz.mm10.swap cd /hive/data/genomes/danRer7/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer7.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 109m49.939s cat fb.danRer7.chainMm10Link.txt # 72001768 bases of 1409770109 (5.107%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/danRer7/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz medaka oryLat2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OryLat2 mkdir /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. 
medaka BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: medaka oryLat2 SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 78m53.408s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # real 113m29.462s cat fb.mm10.chainOryLat2Link.txt # 51344841 bases of 2652783500 (1.936%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOryLat2.2012-04-02 lastz.oryLat2 # and for the swap mkdir /hive/data/genomes/oryLat2/bed/blastz.mm10.swap cd /hive/data/genomes/oryLat2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOryLat2.2012-04-02/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m52.846s cat fb.oryLat2.chainMm10Link.txt # 45954178 bases of 700386597 (6.561%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oryLat2/bed ln -s blastz.mm10.swap lastz.mm10 
######################################################################### # lastz lamprey petMar1 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10PetMar1 mkdir /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 cd /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 cat << '_EOF_' > DEF # Mouse vs. lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: lamprey petMar1 SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit SEQ2_LEN=/scratch/data/petMar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 77m3.923s # broken swarm, continuing: time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust -continue=cat `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > cat.log 2>&1 & # missing qRepeats specification rm axtChain/mm10.petMar1.net time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -qRepeats=windowmaskerSdust -continue=load `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > load.log 2>&1 & # real 6m31.527s cat fb.mm10.chainPetMar1Link.txt # 29205053 bases of 2652783500 (1.101%) in intersection # set sym link to indicate this is the lastz for this genome: cd 
/hive/data/genomes/mm10/bed ln -s lastzPetMar1.2012-04-02 lastz.petMar1 # and for the swap mkdir /hive/data/genomes/petMar1/bed/blastz.mm10.swap cd /hive/data/genomes/petMar1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPetMar1.2012-04-02/DEF \ -qRepeats=windowmaskerSdust -workhorse=hgwdev \ -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 17m40.196s cat fb.petMar1.chainMm10Link.txt # 26274715 bases of 831696438 (3.159%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/petMar1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### ## 60-Way Multiz (DONE - 2011-09-28 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way cd /hive/data/genomes/mm10/bed/multiz60way # from the 62-way in the source tree, do not need aliMis1 and croPor1: /cluster/bin/phast/tree_doctor --prune ailMis1,croPor1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/62way.nh > 60way.nh # note, newer assemblies: susScr3, dasNov3, felCat5, hetGla2, turTru2, # nomLeu2, oreNil2 # what that looks like: cat 60way.nh # (((((((((((((((((((hg19:0.006550,panTro4:0.006840):0.002220, # gorGor3:0.008964):0.009693,ponAbe2:0.018940):0.003471, # nomLeu2:0.022270):0.012040,(rheMac3:0.007991, # papHam1:0.008042):0.029610):0.021830,(calJac3:0.030000, # saiBol1:0.040000):0.039650):0.052090,tarSyr1:0.111400):0.020520, # (micMur1:0.085600,otoGar3:0.119400):0.020520):0.015494, # tupBel1:0.186203):0.004937,(((((mm10:0.084509,rn5:0.091589):0.197773, # dipOrd1:0.211609):0.022992,(hetGla2:0.100000, # cavPor3:0.125629):0.100000):0.010150,speTri2:0.148468):0.025746, # (oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313):0.020593, # (((susScr3:0.120000,(vicPac1:0.087275,(turTru2:0.064688, # (oviAri1:0.100000,bosTau7:0.100000):0.023592):0.025153):0.020335):0.020000, # 
((equCab2:0.109397,(felCat5:0.098612, # (canFam3:0.052458,ailMel1:0.050000):0.050000):0.049845):0.006219, # (myoLuc2:0.142540,pteVam1:0.113399):0.033706):0.004508):0.011671, # (eriEur1:0.221785,sorAra1:0.269562):0.056393):0.021227):0.023664, # ((((loxAfr3:0.082242,proCap1:0.155358):0.026990,echTel1:0.245936):0.010000, # triMan1:0.100000):0.049697,(dasNov3:0.116664, # choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686, # (sarHar1:0.100000,macEug2:0.072008):0.050000):0.215100):0.071664, # ornAna1:0.456592):0.109504,(((((melGal1:0.100000,galGal4:0.065536):0.100000, # taeGut1:0.171542):0.199223,melUnd1:0.100000):0.155143, # anoCar2:0.539241):0.122371,chrPic1:0.200000):0.010000):0.050000, # xenTro3:0.855573):0.100000,latCha1:0.855573):0.311354, # ((((((tetNig2:0.224159,fr3:0.203847):0.097590,oreNil2:0.200000):0.097590, # gasAcu1:0.316413):0.030000,oryLat2:0.511970):0.030000, # gadMor1:0.350000):0.225640,danRer7:0.730752):0.147949):0.526688, # petMar1:0.526688); # rearrange to get mm10 on top: cat << '_EOF_' > mm10.60way.nh (((((((((((((((mm10:0.084509,rn5:0.091589):0.197773,dipOrd1:0.211609):0.022992, (hetGla2:0.1,cavPor3:0.125629):0.1):0.01015,speTri2:0.148468):0.025746,(oryCun2:0.114227,ochPri2:0.201069):0.101463):0.015313, (((((((((hg19:0.00655,panTro4:0.00684):0.00222,gorGor3:0.008964):0.009693,ponAbe2:0.01894):0.003471, nomLeu2:0.02227):0.01204,(rheMac3:0.007991,papHam1:0.008042):0.02961):0.02183, (calJac3:0.03,saiBol1:0.04):0.03965):0.05209,tarSyr1:0.1114):0.02052,(micMur1:0.0856,otoGar3:0.1194):0.02052):0.015494, tupBel1:0.186203):0.004937):0.020593, ((susScr3:0.12,(vicPac1:0.087275,(turTru2:0.064688, (oviAri1:0.1,bosTau7:0.1):0.023592):0.025153):0.020335):0.01, ((((felCat5:0.098612, (canFam3:0.052458,ailMel1:0.05):0.05):0.049845,equCab2:0.109397):0.006219, (myoLuc2:0.14254,pteVam1:0.113399):0.033706):0.004508,(eriEur1:0.221785, sorAra1:0.269562):0.056393):0.021227):0.01):0.013664,((((loxAfr3:0.082242,proCap1:0.155358):0.02699, 
echTel1:0.245936):0.01,triMan1:0.1):0.049697,(dasNov3:0.116664, choHof1:0.096357):0.053145):0.006717):0.234728,(monDom5:0.125686,(sarHar1:0.1, macEug2:0.072008):0.05):0.2151):0.071664,ornAna1:0.456592):0.109504, (((((melGal1:0.1,galGal4:0.065536):0.1,taeGut1:0.171542):0.199223,melUnd1:0.1):0.155143,anoCar2:0.539241):0.122371, chrPic1:0.2):0.01):0.05,xenTro3:0.855573):0.1,latCha1:0.855573):0.311354, ((((((tetNig2:0.224159,fr3:0.203847):0.09759,oreNil2:0.2):0.09759,gasAcu1:0.316413):0.03, oryLat2:0.51197):0.03,gadMor1:0.35):0.22564,danRer7:0.730752):0.147949):0.526688,petMar1:0.526688); '_EOF_' # << happy emacs # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.60way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" mm10.60way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | sed -e 's/X__trop/X._trop/' > mm10.60way.commonNames.nh # (((((((((((((((Mouse:0.084509,Rat:0.091589):0.197773, # Kangaroo_rat:0.211609):0.022992,(Naked_mole:0.1, # Guinea_pig:0.125629):0.1):0.01015,Squirrel:0.148468):0.025746, # (Rabbit:0.114227,Pika:0.201069):0.101463):0.015313, # (((((((((Human:0.00655,Chimp:0.00684):0.00222,Gorilla:0.008964):0.009693, # Orangutan:0.01894):0.003471,Gibbon:0.02227):0.01204, # (Chinese_rhesus:0.007991,Baboon:0.008042):0.02961):0.02183, # (Marmoset:0.03,Squirrel_monkey:0.04):0.03965):0.05209, # Tarsier:0.1114):0.02052,(Mouse_lemur:0.0856, # Bushbaby:0.1194):0.02052):0.015494,Tree_shrew:0.186203):0.004937):0.020593, # ((Pig:0.12,(Alpaca:0.087275,(Dolphin:0.064688, # 
(Sheep:0.1,Cow:0.1):0.023592):0.025153):0.020335):0.01, # ((((Cat:0.098612,(Dog:0.052458,Panda:0.05):0.05):0.049845, # Horse:0.109397):0.006219,(Microbat:0.14254, # Megabat:0.113399):0.033706):0.004508,(Hedgehog:0.221785, # Shrew:0.269562):0.056393):0.021227):0.01):0.013664, # ((((Elephant:0.082242,Rock_hyrax:0.155358):0.02699, # Tenrec:0.245936):0.01,Manatee:0.1):0.049697, # (Armadillo:0.116664,Sloth:0.096357):0.053145):0.006717):0.234728, # (Opossum:0.125686,(Tasmanian_devil:0.1, # Wallaby:0.072008):0.05):0.2151):0.071664,Platypus:0.456592):0.109504, # (((((Turkey:0.1,Chicken:0.065536):0.1,Zebra_finch:0.171542):0.199223, # Budgerigar:0.1):0.155143,Lizard:0.539241):0.122371, # Painted_turtle:0.2):0.01):0.05,X._tropicalis:0.855573):0.1, # Coelacanth:0.855573):0.311354,((((((Tetraodon:0.224159, # Fugu:0.203847):0.09759,Nile_tilapia:0.2):0.09759, # Stickleback:0.316413):0.03,Medaka:0.51197):0.03, # Atlantic_cod:0.35):0.22564,Zebrafish:0.730752):0.147949):0.526688, # Lamprey:0.526688); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/mm10_60way.png /cluster/bin/phast/all_dists mm10.60way.nh | grep mm10 \ | sed -e "s/mm10^I//" | sort -k2n > 60way.distances.txt # Use this output to create the table below head 60way.distances.txt # rn5 0.176098 # speTri2 0.463892 # micMur1 0.483034 # dipOrd1 0.493891 # vicPac1 0.504686 # hetGla2 0.505274 # hg19 0.505328 # gorGor3 0.505522 # panTro4 0.505618 # nomLeu2 0.505664 cat << '_EOF_' > sizeStats.pl #!/usr/bin/env perl use strict; use warnings; open (FH, "<60way.distances.txt") or die "can not read 60way.distances.txt"; my $count = 0; while (my $line = <FH>) { chomp $line; my ($D, $dist) = split('\s+', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." . $chain . 
"Link.txt"; my $chainLinkMeasure = `awk '{print \$5}' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm10/fb.${D}.chainMm10Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '{print \$5}' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='$D';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %02d %.4f (%% %06.3f) (%% %06.3f) - %s %s\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); '_EOF_' # << happy emacs chmod +x ./sizeStats.pl ./sizeStats.pl # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainAnoCar2Link # N distance on mm10 on other other species # 01 0.1761 (% 67.353) (% 70.278) - Rat rn5 # 02 0.4639 (% 34.217) (% 39.244) - Squirrel speTri2 # 03 0.4830 (% 26.636) (% 37.574) - Mouse lemur micMur1 # 04 0.4939 (% 19.460) (% 27.512) - Kangaroo rat dipOrd1 # 05 0.5047 (% 22.636) (% 31.769) - Alpaca vicPac1 # 06 0.5053 (% 32.753) (% 37.989) - Naked mole rat hetGla2 # 07 0.5053 (% 38.226) (% 35.249) - Human hg19 # 08 0.5055 (% 33.987) (% 34.349) - Gorilla gorGor3 # 09 0.5056 (% 34.674) (% 31.924) - Chimp panTro4 # 10 0.5057 (% 34.031) (% 32.274) - Gibbon nomLeu2 # 11 0.5058 (% 34.496) (% 30.610) - Orangutan ponAbe2 # 12 0.5073 (% 30.267) (% 33.492) - Dolphin turTru2 # 13 0.5088 (% 24.560) (% 24.986) - Tarsier tarSyr1 # 14 0.5090 (% 33.931) (% 33.464) - Chinese rhesus rheMac3 # 15 0.5090 (% 33.577) (% 32.023) - Baboon papHam1 # 16 0.5168 (% 29.795) (% 32.926) - Bushbaby otoGar3 # 17 0.5171 (% 25.685) (% 29.445) - Pig susScr3 # 18 0.5192 (% 32.450) (% 
31.301) - Marmoset calJac3 # 19 0.5284 (% 34.415) (% 37.138) - Horse equCab2 # 20 0.5292 (% 32.339) (% 33.848) - Squirrel monkey saiBol1 # 21 0.5309 (% 28.447) (% 29.115) - Guinea pig cavPor3 # 22 0.5470 (% 18.019) (% 23.687) - Sloth choHof1 # 23 0.5472 (% 26.546) (% 24.649) - Manatee triMan1 # 24 0.5476 (% 19.766) (% 25.144) - Tree shrew tupBel1 # 25 0.5569 (% 25.248) (% 25.677) - Rabbit oryCun2 # 26 0.5599 (% 27.345) (% 38.627) - Megabat pteVam1 # 27 0.5662 (% 26.255) (% 25.383) - Cow bosTau7 # 28 0.5662 (% 15.341) (% 31.925) - Sheep oviAri1 # 29 0.5664 (% 25.823) (% 21.616) - Elephant loxAfr3 # 30 0.5673 (% 25.201) (% 21.066) - Armadillo dasNov3 # 31 0.5675 (% 29.725) (% 32.244) - Cat felCat5 # 32 0.5689 (% 30.979) (% 35.562) - Panda ailMel1 # 33 0.5713 (% 29.144) (% 31.624) - Dog canFam3 # 34 0.5891 (% 24.363) (% 33.650) - Microbat myoLuc2 # 35 0.6395 (% 15.147) (% 16.214) - Rock hyrax proCap1 # 36 0.6437 (% 14.542) (% 19.908) - Pika ochPri2 # 37 0.6865 (% 09.856) (% 12.264) - Hedgehog eriEur1 # 38 0.7031 (% 10.947) (% 14.117) - Tenrec echTel1 # 39 0.7343 (% 09.382) (% 13.569) - Shrew sorAra1 # 40 0.9626 (% 04.353) (% 04.448) - Wallaby macEug2 # 41 0.9663 (% 09.584) (% 07.205) - Opossum monDom5 # 42 0.9906 (% 08.479) (% 07.888) - Tasmanian devil sarHar1 # 43 1.0166 (% 04.731) (% 05.488) - Painted turtle chrPic1 # 44 1.1537 (% 05.348) (% 07.334) - Platypus ornAna1 # 45 1.1942 (% 03.589) (% 07.350) - Budgerigar melUnd1 # 46 1.4589 (% 03.676) (% 08.100) - Chicken galGal4 # 47 1.4649 (% 03.599) (% 07.304) - Zebra finch taeGut1 # 48 1.4782 (% 03.331) (% 04.988) - Lizard anoCar2 # 49 1.4934 (% 03.511) (% 08.211) - Turkey melGal1 # 50 1.7122 (% 03.125) (% 06.651) - X. 
tropicalis xenTro3 # 51 1.8122 (% 02.715) (% 03.380) - Coelacanth latCha1 # 52 1.9916 (% 01.726) (% 06.810) - Atlantic cod gadMor1 # 53 1.9992 (% 01.957) (% 06.091) - Nile tilapia oreNil2 # 54 2.0180 (% 02.016) (% 10.927) - Stickleback gasAcu1 # 55 2.1006 (% 01.789) (% 12.134) - Fugu fr3 # 56 2.1209 (% 01.735) (% 13.642) - Tetraodon tetNig2 # 57 2.1467 (% 02.602) (% 05.107) - Zebrafish danRer7 # 58 2.1835 (% 01.936) (% 06.561) - Medaka oryLat2 # 59 2.3214 (% 01.101) (% 03.159) - Lamprey petMar1 # None of this concern for distances matters in building the first step, the # maf files. # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.60way.nh > tmp.nh echo `cat tmp.nh` > tree-commas.nh echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # bash shell syntax here ... cd /hive/data/genomes/mm10/bed/multiz60way export H=/hive/data/genomes/mm10/bed mkdir mafLinks for G in `sed -e "s/mm10 //" species.list` do mkdir mafLinks/$G if [ -s ${H}/lastz.${G}/mafRBestNet/chr1.maf.gz ]; then echo "$G - recipBest" ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafSynNet/chr1.maf.gz ]; then echo "$G - synNet" ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G else if [ -s ${H}/lastz.${G}/mafNet/chr1.maf.gz ]; then echo "$G - mafNet" ln -s ${H}/lastz.$G/mafNet/*.maf.gz ./mafLinks/$G else echo "missing directory lastz.${G}/*Net" fi fi fi done # verify the alignment type is correct: for D in `grep -v mm10 /hive/users/hiram/bigWays/mm10.60way/ordered.list` do ls -l mafLinks/$D/chr1.maf.gz | awk '{print $NF}' done # compare to the list at: # http://genomewiki.ucsc.edu/index.php/Mm10_Genome_size_statistics # need to split these things up into smaller pieces for # efficient kluster run. 
cd /hive/data/genomes/mm10/bed/multiz60way mkdir mafSplit cd mafSplit # mafSplitPos splits on gaps or repeat areas that will not have # any chains, approx 5 Mbp intervals, gaps at least 10,000 mafSplitPos -minGap=10000 mm10 5 stdout | sort -u \ | sort -k1,1 -k2,2n > mafSplit.bed # There is a splitRegions.pl script here (copied from previous hg19 46way) # that can create a custom track from this mafSplit.bed file. # Take a look at that in the browser and see if it looks OK, # check the number of sections on each chrom to verify none are # too large. Despite the claim above, it does appear that some # areas are split where actual chains exist. ./splitRegions.pl mafSplit.bed > splitRegions.ct # to see the sizes of the regions: grep "^chr" splitRegions.ct | awk '{print $3-$2,$0}' | sort -rn | less # run a kluster job to split them all ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/mafSplit cat << '_EOF_' > runOne #!/bin/csh -ef set G = $1 set C = $2 mkdir -p $G pushd $G > /dev/null if ( -s ../../mafLinks/${G}/${C}.maf.gz ) then if ( -s mm10_${C}.00.maf ) then /bin/rm -f mm10_${C}.*.maf endif /cluster/bin/x86_64/mafSplit ../mafSplit.bed mm10_ ../../mafLinks/${G}/${C}.maf.gz /bin/gzip mm10_${C}.*.maf else /bin/touch mm10_${C}.00.maf /bin/gzip mm10_${C}.00.maf endif popd > /dev/null '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ $(root1)/mm10_$(root2).00.maf.gz} #ENDLOOP '_EOF_' # << happy emacs for G in `sed -e "s/mm10 //" ../species.list` do echo $G done > species.list cut -f 1 ../../../chrom.sizes > chr.list gensub2 species.list chr.list template jobList para -ram=8g create jobList para try ... check ... push ... etc... 
# Completed: 3894 of 3894 jobs # CPU time in finished jobs: 18929s 315.49m 5.26h 0.22d 0.001 y # IO & Wait Time: 62908s 1048.46m 17.47h 0.73d 0.002 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 346s 5.77m 0.10h 0.00d # Submission to last job: 471s 7.85m 0.13h 0.01d # construct a list of all possible maf file names. # they do not all exist in each of the species directories find . -type f | grep "maf.gz" | wc -l # 19733 find . -type f | grep ".maf.gz$" | xargs -L 1 basename | sort -u > maf.list wc -l maf.list # 336 maf.list mkdir /hive/data/genomes/mm10/bed/multiz60way/splitRun cd /hive/data/genomes/mm10/bed/multiz60way/splitRun mkdir maf run cd run mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21/autoMZ penn # set the db and pairs directories here cat > autoMultiz.csh << '_EOF_' #!/bin/csh -ef set db = mm10 set c = $1 set result = $2 set run = `/bin/pwd` set tmp = /scratch/tmp/$db/multiz.$c set pairs = /hive/data/genomes/mm10/bed/multiz60way/mafSplit /bin/rm -fr $tmp /bin/mkdir -p $tmp /bin/cp -p ../../tree.nh ../../species.list $tmp pushd $tmp > /dev/null foreach s (`/bin/sed -e "s/$db //" species.list`) set in = $pairs/$s/$c set out = $db.$s.sing.maf if (-e $in.gz) then /bin/zcat $in.gz > $out if (! 
-s $out) then echo "##maf version=1 scoring=autoMZ" > $out endif else if (-e $in) then /bin/ln -s $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($run/penn $path); rehash $run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \ > /dev/null popd > /dev/null /bin/rm -f $result /bin/cp -p $tmp/$c $result /bin/rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz.csh cat << '_EOF_' > template #LOOP ./autoMultiz.csh $(root1) {check out line+ /hive/data/genomes/mm10/bed/multiz60way/splitRun/maf/$(root1)} #ENDLOOP '_EOF_' # << happy emacs ln -s ../../mafSplit/maf.list maf.list ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/splitRun/run # the tac reverses the list to get the small jobs first gensub2 maf.list single template stdout | tac > jobList para -ram=8g create jobList # Completed: 336 of 336 jobs # CPU time in finished jobs: 2828651s 47144.19m 785.74h 32.74d 0.090 y # IO & Wait Time: 200533s 3342.21m 55.70h 2.32d 0.006 y # Average job time: 9015s 150.26m 2.50h 0.10d # Longest finished job: 47029s 783.82m 13.06h 0.54d # Submission to last job: 48982s 816.37m 13.61h 0.57d # put the split maf results back together into a single maf file # eliminate duplicate comments ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/splitRun mkdir ../maf # the sed edits take out partitioning name information from the comments # so the multiple parts will condense to smaller number of lines # this takes almost 2 hours of time, resulting in a bit over 150 Gb, # almost all chrom files over 1 Gb, up to almost 10 Gb for chr2 # HOWEVER, this is actually not necessary to maintain these comments, # they are lost during the mafAddIRows cat << '_EOF_' >> runOne #!/bin/csh -fe set C = $1 if ( -s ../maf/${C}.maf.gz ) then rm -f ../maf/${C}.maf.gz endif head -q -n 1 maf/mm10_${C}.*.maf | sort -u > ../maf/${C}.maf grep -h "^#" maf/mm10_${C}.*.maf | egrep -v "maf version=1|eof maf" | \ sed -e "s#${C}.[0-9][0-9]*#${C}#g; s#_MZ_[^ ]* # #g;" \ | 
sort -u >> ../maf/${C}.maf grep -h -v "^#" `ls maf/mm10_${C}.*.maf | sort -t. -k2,2n` >> ../maf/${C}.maf tail -q -n 1 maf/mm10_${C}.*.maf | sort -u >> ../maf/${C}.maf '_EOF_' # << happy emacs chmod +x runOne cat << '_EOF_' >> template #LOOP runOne $(root1) {check out exists+ ../maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 ../../../chrom.sizes > chr.list ssh encodek cd /hive/data/genomes/mm10/bed/multiz60way/splitRun gensub2 chr.list single template jobList para -ram=8g create jobList para try ... check ... push ... etc ... # Completed: 62 of 66 jobs # Crashed: 4 jobs # CPU time in finished jobs: 461s 7.68m 0.13h 0.01d 0.000 y # IO & Wait Time: 17863s 297.72m 4.96h 0.21d 0.001 y # Average job time: 296s 4.93m 0.08h 0.00d # Longest finished job: 1144s 19.07m 0.32h 0.01d # Submission to last job: 1156s 19.27m 0.32h 0.01d # these four have empty results: # chrUn_GL456383 # chrUn_GL456389 # chrUn_GL456390 # chrUn_GL456396 # Load into database ssh hgwdev mkdir -p /gbdb/mm10/multiz60way cd /hive/data/genomes/mm10/bed/multiz60way/maf ln -s `pwd`/*.maf /gbdb/mm10/multiz60way # this generates an immense multiz60way.tab file in the directory # where it is running. Best to run this over in scratch. # This is going to take all day. 
cd /scratch/tmp time nice -n +19 hgLoadMaf mm10 multiz60way # Loaded 56185270 mafs in 66 files from /gbdb/mm10/multiz60way # real 72m45.513s # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab time cat /gbdb/mm10/multiz60way/*.maf \ | nice -n +19 hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin # Created 12012784 summary blocks from 1074134156 components and # 56185270 mafs from stdin # real 104m2.107s wc -l multiz60way*.tab # 56185270 multiz60way.tab # 12012784 multiz60waySummary.tab # 68198054 total # -rw-rw-r-- 1 2857704841 Apr 18 10:49 multiz60way.tab # -rw-rw-r-- 1 567210414 Apr 18 17:28 multiz60waySummary.tab rm multiz60way*.tab ####################################################################### # GAP ANNOTATE MULTIZ60WAY MAF AND LOAD TABLES (DONE - 2012-05-31 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file. mkdir -p /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno cd /hive/data/genomes/mm10/bed/multiz60way/anno # check for N.bed files everywhere: for DB in `cat ../species.list` do if [ ! 
-s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" cd /hive/data/genomes/${DB} twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/mm10/bed/multiz60way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S mm10 # use a screen to control this longish job ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/anno mkdir result # NEXT TIME: this template should have a check out exists+ statement cat << '_EOF_' > template #LOOP mafAddIRows -nBeds=nBeds $(path1) /hive/data/genomes/mm10/mm10.2bit {check out line+ result/$(file1)} #ENDLOOP '_EOF_' # << happy emacs ls ../maf/*.maf > maf.list # the tac puts the short jobs first gensub2 maf.list single template stdout | tac > jobList # limit jobs to one per node with the ram=8g requirement para -ram=8g create jobList para try ... check ... push ... # Completed: 46 of 66 jobs # CPU time in finished jobs: 350s 5.83m 0.10h 0.00d 0.000 y # IO & Wait Time: 603s 10.06m 0.17h 0.01d 0.000 y # Average job time: 21s 0.35m 0.01h 0.00d # Longest finished job: 54s 0.90m 0.01h 0.00d # Submission to last job: 113s 1.88m 0.03h 0.00d # a number of these jobs did not finish due to memory limitations. # The jobs would sit on the nodes appearing to occupy 8 Gb of memory, # but did not see any swapping or CPU time accumulation. 
Stop the # batch and run the rest manually on hgwdev: #!/bin/sh export maxMem=188743680 ulimit -S -m $maxMem -v $maxMem mafAddIRows -nBeds=nBeds ../maf/chrX.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chrX.maf & mafAddIRows -nBeds=nBeds ../maf/chr9.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr9.maf & mafAddIRows -nBeds=nBeds ../maf/chr8.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr8.maf & mafAddIRows -nBeds=nBeds ../maf/chr7.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr7.maf & wait mafAddIRows -nBeds=nBeds ../maf/chr6.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr6.maf & mafAddIRows -nBeds=nBeds ../maf/chr5.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr5.maf & mafAddIRows -nBeds=nBeds ../maf/chr4.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr4.maf & mafAddIRows -nBeds=nBeds ../maf/chr3.maf /hive/data/genomes/mm10/mm10.2bit hgwdev/chr3.maf & wait ... etc ... # the run time for those 20 jobs: # real 159m49.217s # verify all result files have some content, look for 0 size files: find . -type f -size 0 # should see none # combine into one file (realized after this, that we do *not* need # this single file. Individual files are OK.) head -q -n 1 result/chrM.maf > mm10.60way.maf time for F in hgwdev/*.maf result/*.maf do grep -h -v "^#" ${F} done >> mm10.60way.maf # real 1082m47.484s -> 18 hours ! 
# -rw-rw-r-- 1 261567878241 Jun 8 10:30 mm10.60way.maf du -hsc mm10.60way.maf # 244G mm10.60way.maf # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/chrM.maf >> mm10.60way.maf # How about an official end marker: echo "##eof maf" >> mm10.60way.maf # construct symlinks to get the individual maf files into gbdb: mkdir /gbdb/mm10/multiz60way/maf ln -s `pwd`/result/*.maf `pwd`/hgwdev/*.maf /gbdb/mm10/multiz60way/maf/ # Load into database rm /gbdb/mm10/multiz60way/*.maf # remove previous results cd /scratch/tmp time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm10/multiz60way/maf \ mm10 multiz60way # Loaded 58087742 mafs in 66 files from /gbdb/mm10/multiz60way/maf # real 868m28.108s time (cat /gbdb/mm10/multiz60way/maf/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 multiz60waySummary stdin) # -rw-rw-r-- 1 3009209972 Jun 9 03:23 multiz60way.tab # -rw-rw-r-- 1 591235982 Jun 11 18:34 multiz60waySummary.tab rm multiz60way*.tab ####################################################################### # MULTIZ60WAY MAF FRAMES (DONE - 2012-05-30 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/frames cd /hive/data/genomes/mm10/bed/multiz60way/frames # survey all the genomes to find out what kinds of gene tracks they have cat << '_EOF_' > showGenes.csh #!/bin/csh -fe foreach db (`cat ../species.list`) echo -n "${db}: " set tables = `hgsql $db -N -e "show tables like '%Gene%'"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "mgcGenes" || $table == "knownGene" || \ $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='$db'"` set orgId = `hgsql hg19 -N -e \ "select id from organism where name='$orgName'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql hg19 -N -e "select 
count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end '_EOF_' # << happy emacs chmod +x ./showGenes.csh time ./showGenes.csh > showGenes.txt # real 9m11.678s # rearrange that output to create five sections, and place these names # in .list files here: # 1. knownGene: hg19 # 2. refGene: bosTau7 danRer7 galGal4 mm10 rheMac3 rn5 susScr3 xenTro3 # 3. ensGene: ailMel1 anoCar2 calJac3 cavPor3 choHof1 dipOrd1 echTel1 # equCab2 eriEur1 fr3 gasAcu1 gorGor3 loxAfr3 melGal1 # micMur1 monDom5 myoLuc2 ochPri2 ornAna1 oryCun2 oryLat2 # panTro4 ponAbe2 proCap1 pteVam1 sorAra1 taeGut1 tarSyr1 # tetNig2 tupBel1 vicPac1 # 4. xenoRefGene: canFam3 chrPic1 dasNov3 felCat5 hetGla2 latCha1 macEug2 # nomLeu2 otoGar3 oviAri1 papHam1 petMar1 saiBol1 sarHar1 # triMan1 # 5. genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 mkdir genes # 1. knownGene: hg19 hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" hg19 \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/hg19.gp.gz # 2. refGene, want the full extended genePred: for DB in `cat refGene.list` do hgsql -N -e "select * from refGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 3. ensGene, want the full extended genePred: for DB in `cat ensGene.list` do hgsql -N -e "select * from ensGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 4. xenoRefGene, want the full extended genePred: for DB in `cat xenoRG.list` do hgsql -N -e "select * from xenoRefGene" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # 5. 
genscan: gadMor1 melUnd1 oreNil2 speTri2 turTru2 # this was done in error the first time, mistakenly using # the xenoRefGene table instead of genscan for DB in `cat genscan.list` do hgsql -N -e "select * from genscan" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /scratch/tmp/${DB}.tmp.gz mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz echo "${DB} done" done # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/ailMel1.gp.gz: 19204 # genes/anoCar2.gp.gz: 17766 # genes/bosTau7.gp.gz: 12958 # genes/calJac3.gp.gz: 20843 # genes/canFam3.gp.gz: 20652 # genes/cavPor3.gp.gz: 18631 # genes/choHof1.gp.gz: 12403 # genes/chrPic1.gp.gz: 19433 # genes/danRer7.gp.gz: 13902 # genes/dasNov3.gp.gz: 29551 # genes/dipOrd1.gp.gz: 15784 # genes/echTel1.gp.gz: 16499 # genes/equCab2.gp.gz: 20403 # genes/eriEur1.gp.gz: 11712 # genes/felCat5.gp.gz: 19512 # genes/fr3.gp.gz: 18014 # genes/gadMor1.gp.gz: 27572 # genes/galGal4.gp.gz: 4892 # genes/gasAcu1.gp.gz: 20631 # genes/gorGor3.gp.gz: 20759 # genes/hetGla2.gp.gz: 25749 # genes/hg19.gp.gz: 20718 # genes/latCha1.gp.gz: 18786 # genes/loxAfr3.gp.gz: 19986 # genes/macEug2.gp.gz: 26006 # genes/melGal1.gp.gz: 14050 # genes/melUnd1.gp.gz: 15296 # genes/micMur1.gp.gz: 16240 # genes/mm10.gp.gz: 20985 # genes/monDom5.gp.gz: 19188 # genes/myoLuc2.gp.gz: 19685 # genes/nomLeu2.gp.gz: 22996 # genes/ochPri2.gp.gz: 15970 # genes/oreNil2.gp.gz: 18636 # genes/ornAna1.gp.gz: 17728 # genes/oryCun2.gp.gz: 18921 # genes/oryLat2.gp.gz: 19576 # genes/otoGar3.gp.gz: 24061 # genes/oviAri1.gp.gz: 17890 # genes/panTro4.gp.gz: 18647 # genes/papHam1.gp.gz: 27842 # genes/petMar1.gp.gz: 11089 # genes/ponAbe2.gp.gz: 19895 # genes/proCap1.gp.gz: 16043 # genes/pteVam1.gp.gz: 16966 # genes/rheMac3.gp.gz: 5580 # genes/rn5.gp.gz: 16393 # genes/saiBol1.gp.gz: 23419 # genes/sarHar1.gp.gz: 20694 # genes/sorAra1.gp.gz: 13156 # genes/speTri2.gp.gz: 22377 # genes/susScr3.gp.gz: 
3771 # genes/taeGut1.gp.gz: 17354 # genes/tarSyr1.gp.gz: 13615 # genes/tetNig2.gp.gz: 19539 # genes/triMan1.gp.gz: 19514 # genes/tupBel1.gp.gz: 15407 # genes/turTru2.gp.gz: 28375 # genes/vicPac1.gp.gz: 11754 # genes/xenTro3.gp.gz: 8447 # kluster job to annotate each maf file screen -S mm10 # manage long running procedure with screen ssh swarm cd /hive/data/genomes/mm10/bed/multiz60way/frames cat << '_EOF_' > runOne #!/bin/csh -fe set C = $1 set G = $2 cat ../maf/${C}.maf | genePredToMafFrames mm10 stdin stdout \ ${G} genes/${G}.gp.gz | gzip > parts/${C}.${G}.mafFrames.gz '_EOF_' # << happy emacs chmod +x runOne # older instructions excluded mm10 from the gene.list # this was a mistake. mm10 can be annotated too. # Mistakenly did this the first run through, had to manually # do the mm10 genes separately on hgwdev after this was done ls ../maf | sed -e "s/.maf//" > chr.list ls genes | sed -e "s/.gp.gz//" > gene.list cat << '_EOF_' > template #LOOP runOne $(root1) $(root2) {check out exists+ parts/$(root1).$(root2).mafFrames.gz} #ENDLOOP '_EOF_' # << happy emacs mkdir parts gensub2 chr.list gene.list template jobList para -ram=8g create jobList para try ... check ... 
push # Completed: 3960 of 3960 jobs # CPU time in finished jobs: 85610s 1426.83m 23.78h 0.99d 0.003 y # IO & Wait Time: 2030956s 33849.27m 564.15h 23.51d 0.064 y # Average job time: 534s 8.91m 0.15h 0.01d # Longest finished job: 3877s 64.62m 1.08h 0.04d # Submission to last job: 12974s 216.23m 3.60h 0.15d # collect all results into one file: cd /hive/data/genomes/mm10/bed/multiz60way/frames find ./parts -type f | while read F do zcat ${F} done | sort -k1,1 -k2,2n > multiz60wayFrames.bed # -rw-rw-r-- 1 1164299719 May 30 11:28 multiz60wayFrames.bed # verify there are frames on everything: cut -f4 multiz60wayFrames.bed | sort | uniq -c | sort -n \ > annotation.survey.txt # should be 60 species: wc -l annotation.survey.txt # 60 annotation.survey.txt # and the minimum numbers: head annotation.survey.txt # 43900 susScr3 # 59839 rheMac3 # 153246 petMar1 # 162501 choHof1 # ... etc ... # load the resulting file ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/frames time gzip multiz60wayFrames.bed # real 0m51.826s # reloading this table 2012-10-11 with more accurate frames: time hgLoadMafFrames mm10 multiz60wayFrames multiz60wayFrames.bed.gz # real 3m2.449s time featureBits -countGaps mm10 multiz60wayFrames # 57707702 bases of 2730871774 (2.113%) in intersection # real 1m45.141s # reload table to fix frames problems 2014-03-19 - Hiram time featureBits -countGaps mm10 multiz60wayFrames # 79955378 bases of 2730871774 (2.928%) in intersection # enable the trackDb entries: # frames multiz60wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 60-way (DONE - 2012-05-31 - 2012-06-12 - Hiram) mkdir /hive/data/genomes/mm10/bed/multiz60way/4d cd /hive/data/genomes/mm10/bed/multiz60way/4d # the annotated maf's are in: ../anno/result/*.maf # using ensGene for mm10, only transcribed genes and nothing # from the randoms and other misc. 
hgsql mm10 -Ne \ "select * from ensGene WHERE cdsEnd > cdsStart;" | cut -f 2-20 \ | egrep -E -v "chrM|chrUn|random|_hap" > ensGene.gp wc -l *.gp # 55423 ensGene.gp genePredSingleCover ensGene.gp stdout | sort > ensGeneNR.gp wc -l ensGeneNR.gp # 22457 ensGeneNR.gp ssh encodek mkdir /hive/data/genomes/mm10/bed/multiz60way/4d/run cd /hive/data/genomes/mm10/bed/multiz60way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name cat << '_EOF_' > 4d.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set r = "/hive/data/genomes/mm10/bed/multiz60way" set c = $1 set infile = $r/anno/result/$2 set outfile = $3 cd /scratch/tmp # 'clean' maf perl -wpe 's/^s ([^.]+)\.\S+/s $1/' $infile > $c.maf awk -v C=$c '$2 == C {print}' $r/4d/ensGeneNR.gp | sed -e "s/\t$c\t/\tmm10.$c\t/" > $c.gp set NL=`wc -l $c.gp| gawk '{print $1}'` if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $c.maf -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $r/4d/run/$outfile else echo "" > $r/4d/run/$outfile endif rm -f $c.gp $c.maf $c.ss '_EOF_' # << happy emacs chmod +x 4d.csh ls -1S /hive/data/genomes/mm10/bed/multiz60way/anno/result/*.maf \ | sed -e "s#.*multiz60way/anno/result/##" \ > maf.list cat << '_EOF_' > template #LOOP 4d.csh $(root1) $(path1) {check out line+ ../mfa/$(root1).mfa} #ENDLOOP '_EOF_' # << happy emacs # the tac puts the quick jobs at the front gensub2 maf.list single template stdout | tac > jobList para create jobList para try ... 
check para -maxJob=5 push para time # Completed: 66 of 66 jobs # CPU time in finished jobs: 13176s 219.60m 3.66h 0.15d 0.000 y # IO & Wait Time: 31790s 529.84m 8.83h 0.37d 0.001 y # Average job time: 681s 11.36m 0.19h 0.01d # Longest finished job: 2883s 48.05m 0.80h 0.03d # Submission to last job: 2925s 48.75m 0.81h 0.03d # combine mfa files ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # most interesting, this did not identify files of size 1: # find ./mfa -type f -size 1 ls -og mfa | awk '$3 == 1' | awk '{print $NF}' > empty.list sed -e "s#^#mfa/##" empty.list | xargs rm -f #want comma-less species.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 60 # use phyloFit to create tree model (output is phyloFit.mod) time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ../tree-commas.nh 4d.all.mfa # real 98m59.203s mv phyloFit.mod all.mod grep TREE all.mod #TREE: 
(((((((((((((((mm10:0.0855383,rn5:0.0922719):0.202381,dipOrd1:0.210819):0.0258471,(hetGla2:0.0917322,cavPor3:0.136876):0.0994271):0.00910944,speTri2:0.145483):0.0274969,(oryCun2:0.109639,ochPri2:0.200966):0.102067):0.0141654,(((((((((hg19:0.00674057,panTro4:0.00692231):0.00309904,gorGor3:0.00918625):0.00954082,ponAbe2:0.0191843):0.00356049,nomLeu2:0.0218207):0.0116848,(rheMac3:0.00814945,papHam1:0.0079848):0.0289473):0.0208338,(calJac3:0.0342405,saiBol1:0.0333221):0.0359171):0.0594469,tarSyr1:0.137467):0.011091,(micMur1:0.0918138,otoGar3:0.127231):0.0351527):0.0153171,tupBel1:0.18879):0.0042463):0.0214646,((susScr3:0.121641,(vicPac1:0.109818,(turTru2:0.0635753,(oviAri1:0.0392493,bosTau7:0.0315816):0.0939861):0.0203711):0.00368417):0.0444758,((((felCat5:0.0897448,(canFam3:0.0888602,ailMel1:0.0767935):0.021837):0.05011,equCab2:0.109367):0.00605998,(myoLuc2:0.137144,pteVam1:0.114013):0.0339604):0.00395001,(eriEur1:0.226934,sorAra1:0.270619):0.0628319):0.00292667):0.0291403):0.0231397,((((loxAfr3:0.078841,proCap1:0.160295):0.00825096,echTel1:0.266786):0.0031636,triMan1:0.0685675):0.0736043,(dasNov3:0.112086,choHof1:0.0974658):0.0535724):0.00739115):0.245967,(monDom5:0.139913,(sarHar1:0.132596,macEug2:0.111778):0.0294309):0.21273):0.0770867,ornAna1:0.50425):0.135096,(((((melGal1:0.067697,galGal4:0.05253):0.13729,taeGut1:0.202681):0.00899388,melUnd1:0.127774):0.216078,anoCar2:0.575186):0.0128221,chrPic1:0.201659):0.137011):0.113527,xenTro3:0.943162):0.0646458,latCha1:0.596956):0.463611,((((((tetNig2:0.223213,fr3:0.198755):0.263107,oreNil2:0.33649):0.0139699,gasAcu1:0.314841):0.0573697,oryLat2:0.430105):0.185668,gadMor1:0.562778):0.169352,danRer7:0.753326):0.117017):0.501088,petMar1:0.501088); # four different subset lists: paste glire.list euarchontoglires.list placental.list all.list # mm10 mm10 mm10 mm10 # rn5 rn5 rn5 rn5 # dipOrd1 dipOrd1 dipOrd1 dipOrd1 # hetGla2 hetGla2 hetGla2 hetGla2 # cavPor3 cavPor3 cavPor3 cavPor3 # speTri2 speTri2 speTri2 speTri2 # oryCun2 
oryCun2 oryCun2 oryCun2 # ochPri2 ochPri2 ochPri2 ochPri2 # tupBel1 tupBel1 tupBel1 # hg19 hg19 hg19 # gorGor3 gorGor3 gorGor3 # panTro4 panTro4 panTro4 # nomLeu2 nomLeu2 nomLeu2 # ponAbe2 ponAbe2 ponAbe2 # tarSyr1 tarSyr1 tarSyr1 # rheMac3 rheMac3 rheMac3 # papHam1 papHam1 papHam1 # otoGar3 otoGar3 otoGar3 # calJac3 calJac3 calJac3 # micMur1 micMur1 micMur1 # saiBol1 saiBol1 saiBol1 # equCab2 equCab2 # vicPac1 vicPac1 # turTru2 turTru2 # susScr3 susScr3 # bosTau7 bosTau7 # oviAri1 oviAri1 # pteVam1 pteVam1 # myoLuc2 myoLuc2 # felCat5 felCat5 # canFam3 canFam3 # ailMel1 ailMel1 # eriEur1 eriEur1 # sorAra1 sorAra1 # choHof1 choHof1 # dasNov3 dasNov3 # proCap1 proCap1 # echTel1 echTel1 # triMan1 triMan1 # loxAfr3 loxAfr3 # macEug2 # sarHar1 # monDom5 # ornAna1 # galGal4 # taeGut1 # melGal1 # melUnd1 # anoCar2 # chrPic1 # xenTro3 # latCha1 # gadMor1 # gasAcu1 # fr3 # oreNil2 # tetNig2 # danRer7 # oryLat2 # petMar1 # on organisms that do not have all species in all files, the file names # need to be filtered. 
Using this perl script to extract from # the full mfa files, only the subset of species from the four lists: cat << '_EOF_' > filterMfa.pl #!/usr/bin/env perl use strict; use warnings; my $argc = scalar(@ARGV); if ($argc != 1) { printf STDERR "usage: filterMfa.pl <subset.list>\n"; exit 255; } my %dbList; my $file = shift; open (FH, "<$file") or die "can not read $file"; printf STDERR "using list: $file\n"; while (my $db = <FH>) { chomp $db; $dbList{$db} = 1; } close (FH); # output directory name: strip only the trailing ".list" suffix, then append "Mfa" my $dirName = $file; $dirName =~ s/\.list$//; $dirName .= "Mfa"; my @mfaFileList = split('\n', `ls mfa/*.mfa`); for (my $i = 0; $i < scalar(@mfaFileList); ++$i) { my $file = $mfaFileList[$i]; my $chr = $file; $chr =~ s#^mfa/##; # printf STDERR "processing: %s into %s/%s\n", $file, $dirName, $chr; open (FH, "<$file") or die "can not read $file"; open (OF, ">$dirName/$chr") or die "can not write to $dirName/$chr"; my $inGroup = 0; while (my $line = <FH>) { if ($line =~ m/^> /) { chomp $line; my ($faHead, $faDbName) = split('\s+', $line); if (exists($dbList{$faDbName})) { $inGroup = 1; printf OF "> %s\n", $faDbName; } else { $inGroup = 0; } } elsif ($inGroup) { printf OF "%s", $line; } } close (FH); close (OF); } '_EOF_' # << happy emacs chmod +x filterMfa.pl mkdir glireMfa euarchontogliresMfa placentalMfa vertebrateMfa # extract each set from the full mfa files, run msa_view on # each subset and construct .nh tree for that subset for N in glire euarchontoglires placental vertebrate do ./filterMfa.pl ${N}.list /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ${N}.list|xargs echo`" ${N}Mfa/*.mfa \ | sed s/"> "/">"/ > 4d.${N}.mfa /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/tree_doctor \ --no-branchlen --prune-all-but="`cat ${N}.list|xargs echo`" \ ../tree-commas.nh > tree-commas.${N}.nh done ### XXX ### MOST INTERESTING, this phyloFit operation was repeated ### to verify that the full 60 species vertebrate operation produced the ### same result as the
original "all" subset. This phyloFit appears to ### produce a different result each time ? # use phyloFit to create tree model (output is phyloFit.mod) for N in glire euarchontoglires placental vertebrate do time nice -n +19 \ /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree ./tree-commas.${N}.nh 4d.${N}.mfa mv phyloFit.mod ${N}.mod grep TREE ${N}.mod | sed 's/TREE\:\ //' > ${N}.Nway.nh done # real 0m15.747s # real 4m5.526s # real 20m45.982s # real 141m21.248s ####################################################################### # phastCons 60-way (DONE - 2012-06-12, 2012-08-21 - Hiram) # was unable to split the full chrom MAF files, now working on the # maf files as they were split up during multiz # split 60way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh encodek mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/ss mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split cd /hive/data/genomes/mm10/bed/multiz60way/cons/msa.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/cons/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs chmod +x doSplit.csh cat << '_EOF_' > template #LOOP doSplit.csh $(root1) {check out line+ $(root1).done} #ENDLOOP '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list gensub2 maf.list single template 
jobList para -ram=8g create jobList para try ... check ... etc # Completed: 64 of 66 jobs # Crashed: 2 jobs # CPU time in finished jobs: 347730s 5795.49m 96.59h 4.02d 0.011 y # IO & Wait Time: 102813s 1713.56m 28.56h 1.19d 0.003 y # Average job time: 7040s 117.33m 1.96h 0.08d # Longest finished job: 42666s 711.10m 11.85h 0.49d # Submission to last job: 150336s 2505.60m 41.76h 1.74d # finish the last two on hgwdev with more memory. # linux data memory, in 1024-byte units export M=188743680 ulimit -S -m $M -v $M ./doSplit.csh chr1 chr1.done & ./doSplit.csh chr2 chr2.done wait # real 864m53.235s # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh swarm mkdir -p /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons cd /hive/data/genomes/mm10/bed/multiz60way/cons/run.cons # there are going to be several different phastCons runs using # this same script. They trigger off of the current working directory # $cwd:t which is the "grp" in this script. 
It is one of: # all glire euarchontoglires primate placental cat << '_EOF_' > doPhast.csh #!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set f = $2 set len = $3 set cov = $4 set rho = $5 set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/multiz60way/cons set tmp = $cons/tmp/$f mkdir -p $tmp set ssSrc = $cons/ss set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$c/$f.ss $tmp else ln -s $ssSrc/$c/$f.ss $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --not-informative `cat $grp.non-inf` \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp else $PHASTBIN/phastCons $f.ss $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \ --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp endif popd > /dev/null mkdir -p pp/$c bed/$c sleep 4 touch pp/$c bed/$c rm -f pp/$c/$f.pp rm -f bed/$c/$f.bed mv $tmp/$f.pp pp/$c mv $tmp/$f.bed bed/$c rm -fr $tmp '_EOF_' # << happy emacs chmod a+x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.cons/doPhast.csh $(root1) $(file1) 45 0.3 0.3 {check out line+ pp/$(root1)/$(file1).pp} #ENDLOOP '_EOF_' # << happy emacs ls -1S ../ss/chr*/chr* | sed -e "s/.ss$//" > ss.list # Create parasol batch and run it ############################ run for all species cd /hive/data/genomes/mm10/bed/multiz60way/cons mkdir all cd all cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 36286s 604.77m 10.08h 0.42d 0.001 y # IO & Wait Time: 10101s 168.35m 2.81h 0.12d 0.000 y # Average job time: 148s 2.46m 0.04h 0.00d # Longest finished job: 219s 3.65m 0.06h 0.00d # Submission to last job: 4383s 73.05m 1.22h 0.05d # create Most Conserved track cd /hive/data/genomes/mm10/bed/multiz60way/cons/all cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # -rw-rw-r-- 1 230642249 Jun 15 11:48 tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # -rw-rw-r-- 1 236425914 Jun 15 11:52 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgLoadBed mm10 phastConsElements60way mostConserved.bed # Read 6748481 elements of size 5 from mostConserved.bed # real 2m20.950s # Try for 5% overall cov, and 70% CDS cov featureBits mm10 -enrichment refGene:cds phastConsElements60way # --rho 0.3 --expected-length 45 --target-coverage 0.3 # refGene:cds 1.281%, phastConsElements60way 6.517%, # both 0.913%, cover 71.29%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60way # ensGene:cds 1.357%, phastConsElements60way 6.517%, both 0.942%, cover # 69.39%, enrich 10.65x # real 0m54.109s time featureBits mm10 -enrichment knownGene:cds phastConsElements60way # knownGene:cds 1.325%, phastConsElements60way 6.517%, both 0.930%, # cover 70.18%, enrich 10.77x # real 0m50.472s # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/mm10/bed/multiz60way/cons/all mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ |
gzip -c > downloads/${D}.phastCons60way.wigFix.gz done # real 102m58.496s # encode those files into wiggle data time (zcat downloads/*.wigFix.gz \ | wigEncode stdin phastCons60way.wig phastCons60way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m32.980s du -hsc *.wi? # 1.8G phastCons60way.wib # 298M phastCons60way.wig # 2.1G total # encode into a bigWig file: # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit, set 180 Gb here: sizeG=188743680 export sizeG ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60way.bw) # real 27m1.039s # -rw-rw-r-- 1 4671685725 Jun 18 10:24 phastCons60way.bw bigWigInfo phastCons60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,510,917 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149660 min: 0.000000 max: 1.000000 std: 0.282516 # if you wanted to use the bigWig file, loading bigWig table: # but we don't use the bigWig file mkdir /gbdb/mm10/bbi ln -s `pwd`/phastCons60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60way; \ create table phastCons60way (fileName varchar(255) not null); \ insert into phastCons60way values ("/gbdb/mm10/bbi/phastCons60way.bw");' # Load gbdb and database with wiggle. 
ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all ln -s `pwd`/phastCons60way.wib /gbdb/mm10/multiz60way/phastCons60way.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60way phastCons60way.wig # real 0m54.546s wigTableStats.sh mm10 phastCons60way # db.table min max mean count sumData # mm10.phastCons60way 0 1 0.14966 1929686275 2.88797e+08 # stdDev viewLimits # 0.282516 viewLimits=0:1 # Create histogram to get an overview of all the data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/all time nice -n +19 hgWiggle -doHistogram -db=mm10 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ phastCons60way > histogram.data 2>&1 # real 7m37.212s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60way track" set xlabel " phastCons60way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Glires # setup glire-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/glire cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # glire-only: get the glire only tree from the 4d directory cp -p ../../4d/glire.mod ./glire.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/glire.list > glire.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 glire.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > glire.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g 
create jobList para try ... check ... push ... etc. # Completed: 314 of 314 jobs # CPU time in finished jobs: 12411s 206.85m 3.45h 0.14d 0.000 y # IO & Wait Time: 117850s 1964.16m 32.74h 1.36d 0.004 y # Average job time: 415s 6.91m 0.12h 0.00d # Longest finished job: 658s 10.97m 0.18h 0.01d # Submission to last job: 796s 13.27m 0.22h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire time nice -n +19 hgLoadBed mm10 phastConsElements60wayGlire \ mostConserved.bed # Loaded 1336504 elements of size 6 # real 0m13.672s # verify coverage time featureBits mm10 phastConsElements60wayGlire # 117058023 bases of 2652783500 (4.413%) in intersection # real 0m15.041s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayGlire # refGene:cds 1.282%, phastConsElements60wayGlire 4.413%, # both 0.944%, cover 73.60%, enrich 16.68x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayGlire # knownGene:cds 1.325%, phastConsElements60wayGlire 4.413%, # both 0.957%, cover 72.22%, enrich 16.37x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; 
s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayGlire.wig phastCons60wayGlire.wib) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 10m26.712s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig -verbose=2 stdin ../../../../chrom.sizes \ phastCons60wayGlire.bw > bigWig.log 2>&1) & # real 52m17.108s grep VmPeak bigWig.log # pid=5552: VmPeak: 20926360 kB bigWigInfo phastCons60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,631,413,425 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.142675 min: 0.000000 max: 1.000000 std: 0.252347 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayGlire; \ create table phastCons60wayGlire \ (fileName varchar(255) not null); \ insert into phastCons60wayGlire values ("/gbdb/mm10/bbi/phastCons60wayGlire.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/glire ln -s `pwd`/phastCons60wayGlire.wib \ /gbdb/mm10/multiz60way/phastCons60wayGlire.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayGlire phastCons60wayGlire.wig # real 0m56.786s wigTableStats.sh mm10 phastCons60wayGlire # db.table min max mean count sumData # mm10.phastCons60wayGlire 0 1 0.142675 1929686275 2.75318e+08 # stdDev viewLimits # 0.252347 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000
-hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayGlire > histogram.data 2>&1 # real 4m28.743s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayGlire track" set xlabel " phastCons60wayGlire score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for Euarchontoglires # setup euarchontoglires-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires # euarchontoglires-only: get the euarchontoglires only tree from the 4d directory cp -p ../../4d/euarchontoglires.mod ./euarchontoglires.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/euarchontoglires.list > euarchontoglires.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 euarchontoglires.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > euarchontoglires.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc.
# Completed: 314 of 314 jobs # CPU time in finished jobs: 17421s 290.36m 4.84h 0.20d 0.001 y # IO & Wait Time: 37430s 623.83m 10.40h 0.43d 0.001 y # Average job time: 175s 2.91m 0.05h 0.00d # Longest finished job: 343s 5.72m 0.10h 0.00d # Submission to last job: 2403s 40.05m 0.67h 0.03d cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m32.945s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m19.122s featureBits mm10 mostConserved.bed # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m21.506s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires time nice -n +19 hgLoadBed mm10 phastConsElements60wayEuarchontoGlires \ mostConserved.bed # Loaded 2327130 elements of size 6 # real 0m24.591s # verify coverage time featureBits mm10 phastConsElements60wayEuarchontoGlires # 127113541 bases of 2652783500 (4.792%) in intersection # real 0m18.857s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayEuarchontoGlires # refGene:cds 1.282%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.929%, cover 72.46%, enrich 15.12x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayEuarchontoGlires # knownGene:cds 1.325%, phastConsElements60wayEuarchontoGlires 4.792%, # both 0.943%, cover 71.16%, enrich 14.85x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" 
find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayEuarchontoGlires.wig phastCons60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m49.080s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayEuarchontoGlires.bw \ > bigWig.log 2>&1 ) & # real 26m0.111s bigWigInfo phastCons60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,411,704,465 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.133253 min: 0.000000 max: 1.000000 std: 0.256320 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayEuarchontoGlires; \ create table phastCons60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phastCons60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phastCons60wayEuarchontoGlires.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/euarchontoglires ln -s `pwd`/phastCons60wayEuarchontoGlires.wib \ /gbdb/mm10/multiz60way/phastCons60wayEuarchontoGlires.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayEuarchontoGlires phastCons60wayEuarchontoGlires.wig # real 0m50.676s time wigTableStats.sh mm10 phastCons60wayEuarchontoGlires # db.table min max mean count sumData # mm10.phastCons60wayEuarchontoGlires 0 1 0.133253 1929686275
2.57137e+08 # stdDev viewLimits # 0.25632 viewLimits=0:1 # real 0m21.964s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayEuarchontoGlires > histogram.data 2>&1 # real 3m31.112s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayEuarchontoGlires track" set xlabel " phastCons60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################## ### Create a phastCons data set for primate ***### This was constructed ### and examined, but not used in the release # setup primate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/primate cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # primate-only: get the primate only tree from the 4d directory cp -p ../../4d/primate.mod ./primate.mod # and all the others become the non-informative list for phastCons to ignore cat ../../4d/glire.list ../../4d/placental.list ../../4d/vertebrate.list \ | grep -v mm10 | sort | xargs echo | sed -e "s/ /,/g" \ > primate.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -ram=8g create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 13884s 231.39m 3.86h 0.16d 0.000 y # IO & Wait Time: 130791s 2179.86m 36.33h 1.51d 0.004 y # Average job time: 461s 7.68m 0.13h 0.01d # Longest finished job: 741s 12.35m 0.21h 0.01d # Submission to last job: 910s 15.17m 0.25h 0.01d cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m27.199s featureBits mm10 mostConserved.bed # 112908553 bases of 2652783500 (4.256%) in intersection # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate time nice -n +19 hgLoadBed mm10 phastConsElements60wayPrimate \ mostConserved.bed # Loaded 1119924 elements of size 6 # real 0m17.423s # verify coverage featureBits mm10 phastConsElements60wayPrimate # 112908553 bases of 2652783500 (4.256%) in intersection # real 0m13.684s # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayPrimate # refGene:cds 1.281%, phastConsElements60wayPrimate 4.256%, # both 0.897%, cover 69.98%, enrich 16.44x featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPrimate # knownGene:cds 1.325%, phastConsElements60wayPrimate 4.256%, # both 0.909%, cover 68.64%, enrich 16.13x featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPrimate # ensGene:cds 1.357%, phastConsElements60wayPrimate 4.256%, both 0.913%, # cover 67.30%, enrich 15.81x # Create the downloads .pp files, from which the phastCons wiggle data # is calculated # sort by chromName, chromStart so that items are in numerical order cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate mkdir downloads for D in `ls -d pp/chr* | 
sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz done # Create merged posterior probability file and wiggle track data files zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPrimate.wig phastCons60wayPrimate.wib # Converted stdin, upper limit 1.00, lower limit 0.00 # real 12m22.465s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phastCons60wayPrimate.bw # real 31m44.517s bigWigInfo phastCons60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,431,379,060 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.093847 min: 0.000000 max: 1.000000 std: 0.233892 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPrimate; \ create table phastCons60wayPrimate \ (fileName varchar(255) not null); \ insert into phastCons60wayPrimate values ("/gbdb/mm10/bbi/phastCons60wayPrimate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/primate ln -s `pwd`/phastCons60wayPrimate.wib \ /gbdb/mm10/multiz60way/phastCons60wayPrimate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPrimate phastCons60wayPrimate.wig # real 1m24.188s wigTableStats.sh mm10 phastCons60wayPrimate # db.table min max mean count sumData # mm10.phastCons60wayPrimate 0 1 0.0938475 1929686275 1.81096e+08 # 0.233892 viewLimits=0:1 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0
-verbose=2 \ -db=mm10 phastCons60wayPrimate > histogram.data 2>&1 # real 7m3.198s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPrimate track" set xlabel " phastCons60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Placental # setup placental-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/placental cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # placental-only: get the placental only tree from the 4d directory cp -p ../../4d/placental.mod ./placental.mod # and all the others become the non-informative list for phastCons to ignore sort ../../4d/placental.list > placental.list sort ../../4d/vertebrate.list > vertebrate.list comm -13 placental.list vertebrate.list | xargs echo \ | sed -e "s/ /,/g" > placental.non-inf gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 314 of 314 jobs # CPU time in finished jobs: 27853s 464.21m 7.74h 0.32d 0.001 y # IO & Wait Time: 128981s 2149.69m 35.83h 1.49d 0.004 y # Average job time: 499s 8.32m 0.14h 0.01d # Longest finished job: 785s 13.08m 0.22h 0.01d # Submission to last job: 5970s 99.50m 1.66h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed # real 0m44.170s featureBits mm10 mostConserved.bed # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m54.927s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental time nice -n +19 hgLoadBed mm10 phastConsElements60wayPlacental \ mostConserved.bed # Loaded 5257437 elements of size 6 # real 0m56.788s # verify coverage, should be the same as the file measured above time featureBits mm10 phastConsElements60wayPlacental # 144041584 bases of 2652783500 (5.430%) in intersection # real 0m39.537s # --rho 0.3 --expected-length 45 --target-coverage 0.3 time featureBits mm10 -enrichment refGene:cds phastConsElements60wayPlacental # refGene:cds 1.282%, phastConsElements60wayPlacental 5.430%, # both 0.920%, cover 71.73%, enrich 13.21x # real 0m39.833s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayPlacental # knownGene:cds 1.325%, phastConsElements60wayPlacental 5.430%, # both 0.934%, cover 70.47%, enrich 12.98x # real 0m44.567s time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayPlacental # ensGene:cds 1.357%, phastConsElements60wayPlacental 5.430%, # both 0.941%, cover 69.32%, enrich 12.77x # real 0m43.093s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayPlacental.wig \ phastCons60wayPlacental.wib > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.237s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayPlacental.bw > bigWig.log 2>&1) & # real 25m18.556s bigWigInfo phastCons60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,271,676,156 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.135703 min: 0.000000 max: 1.000000 std: 0.266432 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayPlacental; \ create table phastCons60wayPlacental \ (fileName varchar(255) not null); \ insert into phastCons60wayPlacental values ("/gbdb/mm10/bbi/phastCons60wayPlacental.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/placental ln -s `pwd`/phastCons60wayPlacental.wib \ /gbdb/mm10/multiz60way/phastCons60wayPlacental.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayPlacental phastCons60wayPlacental.wig # real 0m41.999s time wigTableStats.sh mm10
phastCons60wayPlacental # db.table min max mean count sumData # mm10.phastCons60wayPlacental 0 1 0.135703 1929686275 2.61864e+08 # stdDev viewLimits # 0.266432 # viewLimits=0:1 # real 0m21.723s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayPlacental > histogram.data 2>&1 # real 2m39.659s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayPlacental track" set xlabel " phastCons60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### ### Create a phastCons data set for Vertebrate # setup vertebrate-only run ssh swarm mkdir /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # vertebrate-only: get the vertebrate only tree from the 4d directory cp -p ../../4d/vertebrate.mod ./vertebrate.mod # they are all in this one, no need for non-informative list gensub2 ../run.cons/ss.list single ../run.cons/template jobList para create jobList para try ... check ... push ... etc. 
# Completed: 313 of 314 jobs # Crashed: 1 jobs # CPU time in finished jobs: 36058s 600.97m 10.02h 0.42d 0.001 y # IO & Wait Time: 125496s 2091.59m 34.86h 1.45d 0.004 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 912s 15.20m 0.25h 0.01d # Submission to last job: 2681s 44.68m 0.74h 0.03d # the one failed job was completed manually on hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate # create Most Conserved track cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/${C} 2> /dev/null | while read D do cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m44.506s /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed > mostConserved.bed time featureBits mm10 mostConserved.bed # 172842314 bases of 2652783500 (6.516%) in intersection # real 1m23.298s # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate time nice -n +19 hgLoadBed mm10 phastConsElements60wayVertebrate \ mostConserved.bed # Read 6747163 elements of size 5 from mostConserved.bed # real 1m15.122s # verify coverage featureBits mm10 phastConsElements60wayVertebrate # 172842314 bases of 2652783500 (6.516%) in intersection # --rho 0.3 --expected-length 45 --target-coverage 0.3 featureBits mm10 -enrichment refGene:cds phastConsElements60wayVertebrate # refGene:cds 1.282%, phastConsElements60wayVertebrate 6.516%, # both 0.914%, cover 71.26%, enrich 10.94x time featureBits mm10 -enrichment ensGene:cds phastConsElements60wayVertebrate # ensGene:cds 1.357%, phastConsElements60wayVertebrate 6.516%, # both 0.942%, cover 69.39%, enrich 10.65x # real 0m51.139s time featureBits mm10 -enrichment knownGene:cds phastConsElements60wayVertebrate # knownGene:cds 1.325%, phastConsElements60wayVertebrate 6.516%, # both 0.930%, cover 70.18%, enrich 10.77x # real 0m51.545s # Create the downloads .pp files, from which the phastCons wiggle data # is 
calculated # sort by chromName, chromStart so that items are in numerical order # for wigEncode cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate mkdir downloads for D in `ls -d pp/chr* | sed -e 's#pp/##'` do echo "working: $D" find ./pp/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz done # Create merged posterior probability file and wiggle track data files time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phastCons60wayVertebrate.wig \ phastCons60wayVertebrate.wib > wigEncode.log 2>&1 ) & # Converted stdin, upper limit 1.00, lower limit 0.00 # real 9m48.554s # encode to bigWig # (warning wigToBigWig process grows to about 36 Gb) # in bash, to avoid the 32 Gb memory limit: export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes \ phastCons60wayVertebrate.bw > bigWig.log 2>&1) & # real 25m8.630s bigWigInfo phastCons60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,333,348,984 primaryIndexSize: 100,774,056 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.149646 min: 0.000000 max: 1.000000 std: 0.282502 # if desired to use the bigWig file, loading bigWig table: ln -s `pwd`/phastCons60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phastCons60wayVertebrate; \ create table phastCons60wayVertebrate \ (fileName varchar(255) not null); \ insert into phastCons60wayVertebrate values ("/gbdb/mm10/bbi/phastCons60wayVertebrate.bw");' ## load table with wiggle data ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz60way/cons/vertebrate ln -s `pwd`/phastCons60wayVertebrate.wib \ /gbdb/mm10/multiz60way/phastCons60wayVertebrate.wib time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phastCons60wayVertebrate phastCons60wayVertebrate.wig # real 0m45.432s time
wigTableStats.sh mm10 phastCons60wayVertebrate # db.table min max mean count sumData # mm10.phastCons60wayVertebrate 0 1 0.149646 1929686275 2.8877e+08 # stdDev viewLimits # 0.282502 viewLimits=0:1 # real 0m22.224s # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ -db=mm10 phastCons60wayVertebrate > histogram.data 2>&1 # real 2m52.041s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phastCons60wayVertebrate track" set xlabel " phastCons60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # phyloP conservation for 60-way (DONE - 2012-06-15 - 2012-08-21 - Hiram) # # Vertebrate, Glire, Primate, Placental # # split SS files into 1M chunks, this business needs smaller files # to complete # many of these jobs run too much memory to finish on a kluster node # can run all of this on hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP mkdir ss run.split cd run.split cat << '_EOF_' > doSplit.csh #!/bin/csh -ef set c = $1 set MAF = /hive/data/genomes/mm10/bed/multiz60way/anno/result/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/ss/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $2 ) then exit 0 endif if ( -s $2.running ) then exit 0 endif date >> $2.running rm -fr $WINDOWS mkdir $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) 
then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \ $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000 endif popd > /dev/null date >> $2 rm -f $2.running '_EOF_' # << happy emacs # do the easy ones first to see some immediate results ls -1S -r ../../anno/result | sed -e "s/.maf//;" > maf.list cat << '_EOF_' > template #LOOP ./doSplit.csh $(root1) $(root1).done #ENDLOOP '_EOF_' # << happy emacs gensub2 maf.list single template jobList # copy the jobList to runEm.sh, edit to make all the commands run in # the background, with wait statements every few commands to run # a small number of these at once, no more than four at once with # the large chroms, the small randoms can run a bunch at once, they # finish quickly. time ./runEm.sh # about 11h30m # run phyloP with score=LRT ssh swarm cd /cluster/data/mm10/bed/multiz60way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/all/all.mod 0.525 > all.mod grep BACKGROUND ../../cons/glire/glire.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.531 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/glire/glire.mod 0.531 > glire.mod grep BACKGROUND ../../cons/primate/primate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.509 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/primate/primate.mod 0.509 > primate.mod grep BACKGROUND ../../cons/euarchontoglires/euarchontoglires.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.518 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/euarchontoglires/euarchontoglires.mod 0.518 \ > euarchontoglires.mod grep BACKGROUND ../../cons/placental/placental.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 
/cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/placental/placental.mod 0.525 > placental.mod grep BACKGROUND ../../cons/vertebrate/vertebrate.mod \ | awk '{printf "%0.3f\n", $3 + $4}' # 0.525 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../cons/vertebrate/vertebrate.mod 0.525 > vertebrate.mod cat << '_EOF_' > doPhyloP.csh #!/bin/csh -fex set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set ssFile = $1:t echo "ssFile: $ssFile" set out = $2 set cName = $f:h echo "cName: $cName" set n = $f:r:e set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/multiz60way/consPhyloP set tmp = $cons/tmp/$grp/$f rm -fr $tmp mkdir -p $tmp set ssSrc = "$cons/ss/$cName/$ssFile" set useGrp = "$grp.mod" ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null echo source: $ssSrc.ss $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \ -i SS $useGrp $ssSrc.ss > $ssFile.wigFix popd > /dev/null mkdir -p $out:h sleep 4 mv $tmp/$ssFile.wigFix $out rm -fr $tmp '_EOF_' # << happy emacs chmod +x doPhyloP.csh # Create list of chunks find ../ss -type f | sed -e "s/.ss$//; s#../ss/##;" > ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix cat << '_EOF_' > template #LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP '_EOF_' # << happy emacs ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2708 of 2708 jobs # CPU time in finished jobs: 1832980s 30549.67m 509.16h 21.22d 0.058 y # IO & Wait Time: 217434s 3623.90m 60.40h 2.52d 0.007 y # Average job time: 757s 12.62m 0.21h 0.01d # Longest finished job: 1458s 24.30m 0.41h 0.02d # Submission to last job: 3647s 60.78m 1.01h 0.04d # missed chrM in the original run: ../run.phyloP/doPhyloP.csh chrM/chrM.1-16296 wigFix/chrM/chrM.1-16296.wigFix ssh hgwdev cd /cluster/data/mm10/bed/multiz60way/consPhyloP/run.phyloP/all mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phyloP60way.wigFix.gz done # real 38m15.538s zcat downloads/*.wigFix.gz \ | wigEncode stdin phyloP60way.wig phyloP60way.wib > wigEncode.log 2>&1 & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 27m53.384s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60way.bw) # real 30m10.440s bigWigInfo phyloP60way.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,533,501,426 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169761 min: -20.000000 max: 7.532000 std: 0.942744 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60way.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayAll; \ create table phyloP60wayAll \ (fileName varchar(255) not null); \ insert into phyloP60wayAll values ("/gbdb/mm10/bbi/phyloP60way.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60way.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayAll phyloP60way.wig # real 1m16.934s wigTableStats.sh mm10 phyloP60wayAll # db.table min max mean count sumData # mm10.phyloP60wayAll -20 7.532 0.169761 
1929686275 3.27586e+08 # stdDev viewLimits # 0.942744 viewLimits=-4.54396:4.88348 # that range is: 4.54396+4.88348 = 9.42744 for -hBinSize=0.0942744 below # to get 1,000 bins # (NOTE: 9.42744/1000 = 0.00942744; the 0.0942744 used below is 10x that, # so the viewLimits range is covered by only 100 of the 1,000 bins) # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.0942744 -hBinCount=1000 -hMinVal=-4.54396 -verbose=2 \ -db=mm10 phyloP60wayAll > histogram.data 2>&1 # real 5m58.309s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60way track, all 60 vertebrates" set xlabel " phyloP60way score, all 60 vertebrates" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.2] set xrange [-2:2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the glire ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 206723s 3445.39m 57.42h 2.39d 0.007 y # IO & Wait Time: 256366s 4272.76m 71.21h 2.97d 0.008 y # Average job time: 171s 2.85m 0.05h 0.00d # Longest finished job: 487s 8.12m 0.14h 0.01d # Submission to last job: 1926s 32.10m 0.54h 0.02d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/glire mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.glire.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayGlire.wig phyloP60wayGlire.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.17, lower limit -4.35 # real 20m31.753s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayGlire.bw) & # real 37m9.063s bigWigInfo phyloP60wayGlire.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,158,091,915 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.073187 min: -4.346000 max: 1.165000 std: 0.602992 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayGlire.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayGlire; \ create table phyloP60wayGlire \ (fileName varchar(255) not null); \ insert into phyloP60wayGlire values ("/gbdb/mm10/bbi/phyloP60wayGlire.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayGlire.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayGlire phyloP60wayGlire.wig # real 0m58.536s wigTableStats.sh mm10 phyloP60wayGlire # db.table min max mean count # mm10.phyloP60wayGlire -4.346 1.165 0.0731873 
1929686275 1.41229e+08 # stdDev viewLimits # 0.602992 viewLimits=-2.94177:1.165 # that range is: 4.346+1.165 = 5.511 -> hBinSize=0.005511 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.005511 -hBinCount=1000 -hMinVal=-4.346 -verbose=2 \ -db=mm10 phyloP60wayGlire > histogram.data 2>&1 # real 8m23.088s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayGlire track" set xlabel " phyloP60wayGlire score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ################### Running the euarchontoglires ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 542547s 9042.45m 150.71h 6.28d 0.017 y # IO & Wait Time: 75914s 1265.23m 21.09h 0.88d 0.002 y # Average job time: 228s 3.80m 0.06h 0.00d # Longest finished job: 430s 7.17m 0.12h 0.00d # Submission to last job: 4149s 69.15m 1.15h 0.05d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/euarchontoglires mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.euarchontoglires.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayEuarchontoGlires.wig phyloP60wayEuarchontoGlires.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 1.75, lower limit -12.70 # real 10m52.064s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayEuarchontoGlires.bw) & # real 26m47.912s bigWigInfo phyloP60wayEuarchontoGlires.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 3,970,501,521 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.078739 min: -12.704000 max: 1.753000 std: 0.689759 # if you wanted to use the bigWig file, loading bigWig table: ln -s `pwd`/phyloP60wayEuarchontoGlires.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayEuarchontoGlires; \ create table phyloP60wayEuarchontoGlires \ (fileName varchar(255) not null); \ insert into phyloP60wayEuarchontoGlires values ("/gbdb/mm10/bbi/phyloP60wayEuarchontoGlires.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayEuarchontoGlires.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayEuarchontoGlires 
phyloP60wayEuarchontoGlires.wig # real 0m51.777s time wigTableStats.sh mm10 phyloP60wayEuarchontoGlires # db.table min max mean count # mm10.phyloP60wayEuarchontoGlires -12.704 1.753 0.0787387 1929686275 # sumData stdDev viewLimits # 1.51941e+08 0.689759 viewLimits=-3.37006:1.753 # real 0m26.197s # that range is: 12.704+1.753 = 14.457 -> hBinSize=0.014457 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.014457 -hBinCount=1000 -hMinVal=-12.704 -verbose=2 \ -db=mm10 phyloP60wayEuarchontoGlires > histogram.data 2>&1 # real 3m22.205s # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayEuarchontoGlires track" set xlabel " phyloP60wayEuarchontoGlires score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.15] set xrange [-2:1.2] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the primate ####################### ### ***### This was constructed ### and examined, but not used in the release mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/primate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para -ram=8g create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 307901s 5131.68m 85.53h 3.56d 0.010 y # IO & Wait Time: 42937s 715.62m 11.93h 0.50d 0.001 y # Average job time: 130s 2.16m 0.04h 0.00d # Longest finished job: 234s 3.90m 0.07h 0.00d # Submission to last job: 5975s 99.58m 1.66h 0.07d cd /cluster/data/mm10/bed/multiz60way/consPhyloP/run.phyloP/primate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.primate.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPrimate.wig phyloP60wayPrimate.wib \ > wigEncode.log 2>&1) & # real 9m37.055s # Converted stdin, upper limit 0.93, lower limit -10.63 export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPrimate.bw) & # real 24m18.842s bigWigInfo phyloP60wayPrimate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 2,715,332,211 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.060017 min: -10.633000 max: 0.930000 std: 0.518027 # loading bigWig table: ln -s `pwd`/phyloP60wayPrimate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPrimate; \ create table phyloP60wayPrimate \ (fileName varchar(255) not null); \ insert into phyloP60wayPrimate values ("/gbdb/mm10/bbi/phyloP60wayPrimate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPrimate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPrimate phyloP60wayPrimate.wig # real 0m45.837s wigTableStats.sh mm10 phyloP60wayPrimate # db.table min max mean count sumData stdDev viewLimits # mm10.phyloP60wayPrimate 
-10.633 0.93 0.0600168 1929686275 1.15814e+08 # stdDev viewLimits # 0.518027 viewLimits=-2.53012:0.93 # that range is: 10.633+0.93 = 11.563 for the hBinSize=0.11563 # (NOTE: 11.563/1000 = 0.011563; the 0.11563 used below is 10x that, # so the data range is covered by only 100 of the 1,000 bins) # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.11563 -hBinCount=1000 -hMinVal=-10.633 -verbose=2 \ -db=mm10 phyloP60wayPrimate > histogram.data 2>&1 # real 4m36.379s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPrimate track" set xlabel " phyloP60wayPrimate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.472] set xrange [-2.5:1.0] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the placental ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ...
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1188036s 19800.60m 330.01h 13.75d 0.038 y # IO & Wait Time: 209859s 3497.65m 58.29h 2.43d 0.007 y # Average job time: 516s 8.60m 0.14h 0.01d # Longest finished job: 1672s 27.87m 0.46h 0.02d # Submission to last job: 6336s 105.60m 1.76h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/placental mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.placental.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayPlacental.wig phyloP60wayPlacental.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 3.30, lower limit -20.00 # real 11m54.289s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayPlacental.bw \ > bigWig.log 2>&1) & # real 28m4.576s bigWigInfo phyloP60wayPlacental.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,423,832,009 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.109489 min: -20.000000 max: 3.296000 std: 0.810657 # loading bigWig table if that is what you wanted to do: ln -s `pwd`/phyloP60wayPlacental.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayPlacental; \ create table phyloP60wayPlacental \ (fileName varchar(255) not null); \ insert into phyloP60wayPlacental values ("/gbdb/mm10/bbi/phyloP60wayPlacental.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayPlacental.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayPlacental phyloP60wayPlacental.wig # real 0m50.284s wigTableStats.sh mm10 
phyloP60wayPlacental # db.table min max mean count sumData # mm10.phyloP60wayPlacental -20 3.296 0.109489 1929686275 2.11279e+08 # stdDev viewLimits # 0.810657 viewLimits=-3.9438:3.296 # that range is: 20+3.296 = 23.296 for hBinSize=0.023296 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.023296 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayPlacental > histogram.data 2>&1 # real 3m24.650s # to see yrange: grep -v "^#" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayPlacental track" set xlabel " phyloP60wayPlacental score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.084] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ###################### Running the vertebrate ####################### mkdir /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList para create jobList para try ... check ... push ... etc ... 
para time # Completed: 2709 of 2709 jobs # CPU time in finished jobs: 1825414s 30423.56m 507.06h 21.13d 0.058 y # IO & Wait Time: 211040s 3517.34m 58.62h 2.44d 0.007 y # Average job time: 752s 12.53m 0.21h 0.01d # Longest finished job: 1530s 25.50m 0.42h 0.02d # Submission to last job: 6045s 100.75m 1.68h 0.07d cd /hive/data/genomes/mm10/bed/multiz60way/consPhyloP/vertebrate mkdir downloads for D in `ls -d wigFix/chr* | sed -e 's#wigFix/##'` do echo "working: $D" find ./wigFix/${D} -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/${D}.phastCons60way.vertebrate.wigFix.gz XXX - copy and paste error, should have been phyloP60way and not phastCons done time (zcat downloads/chr*.wigFix.gz \ | wigEncode stdin phyloP60wayVertebrate.wig phyloP60wayVertebrate.wib \ > wigEncode.log 2>&1) & # Converted stdin, upper limit 7.53, lower limit -20.00 # real 12m2.774s export sizeG=188743680 ulimit -d $sizeG ulimit -v $sizeG time (zcat downloads/chr*.wigFix.gz \ | wigToBigWig stdin ../../../../chrom.sizes phyloP60wayVertebrate.bw \ > bigWig.log 2>&1) & # real 27m6.791s bigWigInfo phyloP60wayVertebrate.bw version: 4 isCompressed: yes isSwapped: 0 primaryDataSize: 4,529,467,614 primaryIndexSize: 100,775,272 zoomLevels: 10 chromCount: 59 basesCovered: 1,929,686,275 mean: 0.169653 min: -20.000000 max: 7.532000 std: 0.942808 # loading bigWig table: ln -s `pwd`/phyloP60wayVertebrate.bw /gbdb/mm10/bbi hgsql mm10 -e 'drop table if exists phyloP60wayVertebrate; \ create table phyloP60wayVertebrate \ (fileName varchar(255) not null); \ insert into phyloP60wayVertebrate values ("/gbdb/mm10/bbi/phyloP60wayVertebrate.bw");' # loading the wiggle table: ln -s `pwd`/phyloP60wayVertebrate.wib /gbdb/mm10/multiz60way time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm10/multiz60way mm10 \ phyloP60wayVertebrate phyloP60wayVertebrate.wig # real 0m56.535s time wigTableStats.sh mm10 phyloP60wayVertebrate # db.table 
min max mean count sumData stdDev viewLimits # mm10.phyloP60wayVertebrate -20 7.532 0.169653 1929686275 3.27377e+08 # stdDev viewLimits # 0.942808 viewLimits=-4.54439:4.88369 # real 0m25.320s # that range is: 20+7.532 = 27.532 for hBinSize=0.027532 # Create histogram to get an overview of all the data time nice -n +19 hgWiggle -doHistogram \ -hBinSize=0.027532 -hBinCount=1000 -hMinVal=-20 -verbose=2 \ -db=mm10 phyloP60wayVertebrate > histogram.data 2>&1 # real 3m26.565s # to see yrange: egrep -v "^#|udcfileOpen" histogram.data | ave -col=5 stdin # create plot of histogram: cat << '_EOF_' | gnuplot > histo.png set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse Mm10 Histogram phyloP60wayVertebrate track" set xlabel " phyloP60wayVertebrate score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.1123] set xrange [-2.5:2.5] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines '_EOF_' # << happy emacs display histo.png & ######################################################################### # construct download files for 60-way (DONE - 2012-06-27 - 2012-08-21 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/alignments mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental mkdir 
/usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way mkdir /hive/data/genomes/mm10/bed/multiz60way/downloads cd /hive/data/genomes/mm10/bed/multiz60way/downloads mkdir multiz60way phastCons60way phyloP60way cd multiz60way mkdir maf alignments cd maf time cp -p ../../../anno/result/chr*.maf . # real 735m35.723s time gzip *.maf # real 700m23.340s md5sum *.maf.gz > md5sum.txt ln -s `pwd`/*.maf.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way/maf cd .. du -hsc maf # 24G maf du -hsc ../../anno/result/ # 244G ../../anno/result/ ln -s ../../mm10.60way.nh . ln -s ../../mm10.60way.commonNames.nh . ln -s `pwd`/*.nh \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phastCons60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phastCons cd glire ln -s ../../../cons/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/glire # real 5m50.001s cd ../euarchontoglire ln -s ../../../cons/euarchontoglires/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & # real 1m14.103s ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/euarchontoglire cd ../primate ln -s ../../../cons/primate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/primate # real 5m39.288s cd ../placental ln -s ../../../cons/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/placental # real 5m9.762s cd ../vertebrate ln -s ../../../cons/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/vertebrate # real 0m45.408s cd ../mm10.60way.phastCons ln -s ../../../cons/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way/mm10.60way.phastCons # real 6m11.158s cd .. 
ln -s ../../cons/all/all.mod mm10.60way.phastCons.mod ln -s ../../cons/glire/glire.mod mm10.60way.phastCons.glire.mod ln -s ../../cons/primate/primate.mod mm10.60way.phastCons.primate.mod ln -s ../../cons/euarchontoglires/euarchontoglires.mod mm10.60way.phastCons.euarchontoglire.mod ln -s ../../cons/placental/placental.mod mm10.60way.phastCons.placental.mod ln -s ../../cons/vertebrate/vertebrate.mod mm10.60way.phastCons.vertebrate.mod ln -s ../../cons/all/phastCons60way.bw mm10.60way.phastCons.bw ln -s ../../cons/glire/phastCons60wayGlire.bw \ mm10.60way.phastCons60wayGlire.bw ln -s ../../cons/placental/phastCons60wayPlacental.bw \ mm10.60way.phastCons60wayPlacental.bw ln -s ../../cons/euarchontoglires/phastCons60wayEuarchontoGlires.bw \ mm10.60way.phastCons60wayEuarchontoGlire.bw ln -s ../../cons/primate/phastCons60wayPrimate.bw \ mm10.60way.phastCons60wayPrimate.bw ln -s ../../cons/vertebrate/phastCons60wayVertebrate.bw \ mm10.60way.phastCons60wayVertebrate.bw time md5sum *.mod *.bw > md5sum.txt # real 20m11.260s # obtain the README.txt from hg19/phastCons46way and update for this # situation ln -s `pwd`/*.mod `pwd`/*.bw `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons60way ##################################################################### cd /hive/data/genomes/mm10/bed/multiz60way/downloads/phyloP60way mkdir glire euarchontoglire primate placental vertebrate mm10.60way.phyloP60way cd glire ln -s ../../../consPhyloP/glire/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/glire # real 6m5.733s cd ../euarchontoglire ln -s ../../../consPhyloP/euarchontoglires/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/euarchontoglire # real 5m40.272s cd ../primate ln -s ../../../consPhyloP/primate/downloads/chr*.gz . 
time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/primate # real 7m22.623s cd ../placental ln -s ../../../consPhyloP/placental/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/placental # real 7m39.269s cd ../vertebrate ln -s ../../../consPhyloP/vertebrate/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/vertebrate cd ../mm10.60way.phyloP60way ln -s ../../../consPhyloP/all/downloads/chr*.gz . time md5sum *.gz > md5sum.txt & ln -s `pwd`/*.gz `pwd`/md5sum.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way/mm10.60way.phyloP60way # real 8m5.777s cd .. ln -s ../../consPhyloP/run.phyloP/all.mod mm10.60way.phyloP60way.mod ln -s ../../consPhyloP/run.phyloP/glire.mod ./mm10.phyloP.glire.mod ln -s ../../consPhyloP/run.phyloP/placental.mod ./mm10.phyloP.placental.mod ln -s ../../consPhyloP/run.phyloP/euarchontoglires.mod ./mm10.phyloP.euarchontoglire.mod ln -s ../../consPhyloP/run.phyloP/primate.mod ./mm10.phyloP.primate.mod ln -s ../../consPhyloP/run.phyloP/vertebrate.mod ./mm10.60way.vertebrate.mod ln -s ../../consPhyloP/all/phyloP60way.bw mm10.60way.phyloP60way.bw ln -s ../../consPhyloP/glire/phyloP60wayGlire.bw \ mm10.60way.phyloP60wayGlire.bw ln -s ../../consPhyloP/vertebrate/phyloP60wayVertebrate.bw \ mm10.60way.phyloP60wayVertebrate.bw ln -s ../../consPhyloP/placental/phyloP60wayPlacental.bw \ mm10.60way.phyloP60wayPlacental.bw ln -s ../../consPhyloP/euarchontoglires/phyloP60wayEuarchontoGlires.bw \ mm10.60way.phyloP60wayEuarchontoglire.bw ln -s ../../consPhyloP/primate/phyloP60wayPrimate.bw \ mm10.60way.phyloP60wayPrimate.bw time md5sum *.mod *.bw > md5sum.txt & # real 20m17.082s # obtain the README.txt from hg19/phyloP46way and update for this # situation ln -s 
`pwd`/*.mod `pwd`/*.bw `pwd`/md5sum.txt `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP60way ########################################################################### ## create upstream refGene maf files cd /hive/data/genomes/mm10/bed/multiz60way/downloads/maf # bash script #!/bin/sh for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm10 refGene:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm10 multiz60way \ stdin stdout \ -orgs=/hive/data/genomes/mm10/bed/multiz60way/species.list \ | gzip -c > upstream${S}.maf.gz echo "done upstream${S}.maf.gz" done # real 199m45.558s md5sum *.nh *.maf.gz > md5sum.txt # real 27m59.778s # obtain the README.txt from hg19/multiz46way and update for this # situation ln -s `pwd`/*.nh `pwd`/*.maf.gz `pwd`/*.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz60way ############################################################################# # hgPal downloads (DONE - 2012-07-05 - 2012-07-09 - Hiram) # FASTA from 60-way for refGene ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal cd /hive/data/genomes/mm10/bed/multiz60way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz60way export gp=refGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 & # real 93m34.376s mz=multiz60way gp=refGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m16.821s zcat exonNuc/*.gz | gzip -c > 
$gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=refGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ ######################################################################### # lastz nile tilapia oreNil2 (DONE - 2012-04-02 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil2 mkdir /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cd /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 cat << '_EOF_' > DEF # Mouse vs. nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: nile tilapia oreNil2 SEQ2_DIR=/hive/data/genomes/oreNil2/oreNil2.2bit SEQ2_LEN=/hive/data/genomes/oreNil2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 108m51.232s cat fb.mm10.chainOreNil2Link.txt # 51909908 bases of 2652783500 (1.957%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzOreNil2.2012-04-11 lastz.oreNil2 # and for the swap mkdir /hive/data/genomes/oreNil2/bed/blastz.mm10.swap cd 
/hive/data/genomes/oreNil2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOreNil2.2012-04-11/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 9m8.213s cat fb.oreNil2.chainMm10Link.txt # 49704887 bases of 816084674 (6.091%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/oreNil2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # LASTZ pig susScr3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SusScr3 mkdir /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: pig SusScr3 SEQ2_DIR=/hive/data/genomes/susScr3/susScr3.2bit SEQ2_LEN=/hive/data/genomes/susScr3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1086m29.992s cat fb.mm10.chainSusScr3Link.txt # 681359766 bases of 2652783500 (25.685%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSusScr3.2012-04-13 lastz.susScr3 mkdir /hive/data/genomes/susScr3/bed/blastz.mm10.swap cd 
/hive/data/genomes/susScr3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 104m56.258s cat fb.susScr3.chainMm10Link.txt # 743574150 bases of 2525294057 (29.445%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/susScr3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ armadillo dasNov3 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DasNov3 mkdir /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # armadillo vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: armadillo DasNov3 SEQ2_DIR=/hive/data/genomes/dasNov3/dasNov3.2bit SEQ2_LEN=/hive/data/genomes/dasNov3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=200 BASE=/hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1125m34.124s cat fb.mm10.chainDasNov3Link.txt # 668529920 bases of 2652783500 (25.201%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzDasNov3.2012-04-13 lastz.dasNov3 # better to have reciprocal best for 
this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13 time doRecipBest.pl mm10 dasNov3 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 116m51.114s mkdir /hive/data/genomes/dasNov3/bed/blastz.mm10.swap cd /hive/data/genomes/dasNov3/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDasNov3.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 150m51.653s cat fb.dasNov3.chainMm10Link.txt # 695161920 bases of 3299882059 (21.066%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/dasNov3/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ cat felCat5 (DONE - 2012-04-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat5 mkdir /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cat FelCat5 SEQ2_DIR=/hive/data/genomes/felCat5/felCat5.2bit SEQ2_LEN=/hive/data/genomes/felCat5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1029m54.494s cat fb.mm10.chainFelCat5Link.txt 
# 788544084 bases of 2652783500 (29.725%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzFelCat5.2012-04-13 lastz.felCat5 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13 time doRecipBest.pl mm10 felCat5 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 106m30.011s mkdir /hive/data/genomes/felCat5/bed/blastz.mm10.swap cd /hive/data/genomes/felCat5/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFelCat5.2012-04-13/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m25.850s cat fb.felCat5.chainMm10Link.txt # 762344436 bases of 2364296207 (32.244%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/felCat5/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ naked mole rat hetGla2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job screen -S mm10HetGla2 mkdir /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # naked mole rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: naked mole rat HetGla2 SEQ2_DIR=/hive/data/genomes/hetGla2/hetGla2.2bit SEQ2_LEN=/hive/data/genomes/hetGla2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 
doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 690m7.626s cat fb.mm10.chainHetGla2Link.txt # 853221843 bases of 2652783500 (32.163%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzHetGla2.2012-04-14 lastz.hetGla2 mkdir /hive/data/genomes/hetGla2/bed/blastz.mm10.swap cd /hive/data/genomes/hetGla2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzHetGla2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.hetGla2.chainMm10Link.txt # 879356778 bases of 2314771103 (37.989%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/hetGla2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ dolphin turTru2 (DONE - 2012-04-14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10TurTru2 mkdir /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # dolphin vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: dolphin TurTru2 SEQ2_DIR=/hive/data/genomes/turTru2/turTru2.2bit SEQ2_LEN=/hive/data/genomes/turTru2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time 
nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 624m36.508s cat fb.mm10.chainTurTru2Link.txt # 802921354 bases of 2652783500 (30.267%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzTurTru2.2012-04-14 lastz.turTru2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14 time doRecipBest.pl mm10 turTru2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 44m47.753s mkdir /hive/data/genomes/turTru2/bed/blastz.mm10.swap cd /hive/data/genomes/turTru2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTurTru2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 124m17.088s cat fb.turTru2.chainMm10Link.txt # 781169007 bases of 2332402443 (33.492%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/turTru2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ Gibbon nomLeu2 (DONE - 2012-04-14 - Hiram) screen -S mm10NomLeu2 mkdir /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cd /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 cat << '_EOF_' > DEF # gibbon vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Gibbon NomLeu2 SEQ2_DIR=/hive/data/genomes/nomLeu2/nomLeu2.2bit SEQ2_LEN=/hive/data/genomes/nomLeu2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14 
TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10NomLeu2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 621m38.251s cat fb.mm10.chainNomLeu2Link.txt # 902774780 bases of 2652783500 (34.031%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzNomLeu2.2012-04-14 lastz.nomLeu2 mkdir /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap cd /hive/data/genomes/nomLeu2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNomLeu2.2012-04-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 92m24.775s cat fb.nomLeu2.chainMm10Link.txt # 889660339 bases of 2756609047 (32.274%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/nomLeu2/bed ln -s blastz.mm10.swap lastz.mm10 ##################################################################### # tRNAs track (DONE 2012-04-02 Chin) # # Please refer to the generic tRNA track build documentation # ~/kent/src/hg/makeDb/doc/tRNAsTrack.txt # for details about how the track was built. 
############################################################################## # orfeome 2012-03-16 (markd) enabled ORFeome tracks in etc/genbank.conf and reload genbank ############################################################################ # construct liftOver to mm9 (DONE - 2012-04-30 - Hiram) screen -S 10 # manage this longish running job in a screen mkdir /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 cd /hive/data/genomes/mm10/bed/blat.mm9.2012-04-30 # check it with -debug first to see if it is going to work: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -debug -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # if that is OK, then run it: time doSameSpeciesLiftOver.pl -buildDir=`pwd` -bigClusterHub=swarm \ -ooc=/scratch/data/mm10/mm10.11.ooc \ -dbHost=hgwdev -workhorse=hgwdev mm10 mm9 > do.log 2>&1 # real 95m21.635s # verify this file exists: og -L /gbdb/mm10/liftOver/mm10ToMm9.over.chain.gz # -rw-rw-r-- 1 535855 Feb 9 12:07 /gbdb/mm9/liftOver/mm9ToMm10.over.chain.gz # and try out the conversion on genome-test from mm10 to mm9 ############################################################################ # EXONIPHY MM10, lifted from hg19 (DONE - braney 2012-05-29) # needed for ucscGenes building # create a syntenic liftOver chain file cd /cluster/data/hg19/bed/lastz.mm10/axtChain time nice -n +19 netFilter -syn hg19.mm10.net.gz \ | netChainSubset -verbose=0 stdin hg19.mm10.all.chain.gz stdout \ | chainStitchId stdin stdout | gzip -c > hg19.mm10.syn.chain.gz #real 2m38.915s #user 3m29.458s #sys 0m16.033s # slightly smaller than the ordinary liftOver chain file: -rw-rw-r-- 1 78419424 Mar 7 18:40 hg19.mm10.over.chain.gz -rw-rw-r-- 1 74588027 May 29 12:29 hg19.mm10.syn.chain.gz # exoniphyMm10.gp is prepared as follows mkdir /cluster/data/mm10/bed/exoniphy cd /cluster/data/mm10/bed/exoniphy hgsql hg19 -e "select * from exoniphy" -N | cut -f 2-16 > exoniphyHg19.gp time nice -n +19 liftOver 
-genePred exoniphyHg19.gp \ /cluster/data/hg19/bed/lastz.mm10/axtChain/hg19.mm10.syn.chain.gz \ exoniphyMm10.gp unmapped # real 16m0.334s # user 15m46.462s # sys 0m7.115s wc -l * # 186601 exoniphyHg19.gp # 178821 exoniphyMm10.gp # 15560 unmapped cd /cluster/data/mm10/bed/exoniphy nice -n +19 hgLoadGenePred -genePredExt mm10 exoniphy exoniphyMm10.gp nice -n +19 featureBits mm10 exoniphy # 26795543 bases of 2652783500 (1.010%) in intersection nice -n +19 featureBits mm9 exoniphy # 25931742 bases of 2620346127 (0.990%) in intersection ############################################################################## # LASTZ cow bosTau6 (DONE - 2012-06-19 - Chin) # establish a screen to control this job with a name to indicate # what it is screen -S mm10BosTau6 mkdir /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 cd /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # cow vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cow BosTau6 SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 212m21.604s cat fb.mm10.chainBosTau6Link.txt # 700039696 bases of 2652783500 (26.389%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzBosTau6.2012-06-19 lastz.bosTau6 # swap mkdir /hive/data/genomes/bosTau6/bed/blastz.mm10.swap cd 
/hive/data/genomes/bosTau6/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau6.2012-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 72m13.925s cat fb.bosTau6.chainMm10Link.txt # 688651806 bases of 2649682029 (25.990%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/bosTau6/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # lastz Medium Ground Finch geoFor1 (DONE - 2012-07-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10 mkdir /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cd /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 cat << '_EOF_' > DEF # Mouse vs. medium ground finch BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Medium Ground Finch GeoFor1 SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 251m4.194s cat fb.mm10.chainGeoFor1Link.txt # 93984241 bases of 2652783500 (3.543%) in intersection # set sym link to indicate this is the lastz 
for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzGeoFor1.2012-07-29 lastz.geoFor1 # and for the swap mkdir /hive/data/genomes/geoFor1/bed/blastz.mm10.swap cd /hive/data/genomes/geoFor1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGeoFor1.2012-07-29/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 10m0.875s cat fb.geoFor1.chainMm10Link.txt # 80273915 bases of 1041286029 (7.709%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/geoFor1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # construct assembly fragments table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/bed/assemblyFrags cd /hive/data/genomes/mm10/bed/assemblyFrags zgrep -h -v "^#" "${F}" zgrep -h -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/AGP/*.comp.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e 's/CM000994.2/chr1/; s/CM000995.2/chr2/; s/CM000996.2/chr3/; s/CM000997.2/chr4/; s/CM000998.2/chr5/; s/CM000999.2/chr6/; s/CM001000.2/chr7/; s/CM001001.2/chr8/; s/CM001002.2/chr9/; s/CM001003.2/chr10/; s/CM001004.2/chr11/; s/CM001005.2/chr12/; s/CM001006.2/chr13/; s/CM001007.2/chr14/; s/CM001008.2/chr15/; s/CM001009.2/chr16/; s/CM001010.2/chr17/; s/CM001011.2/chr18/; s/CM001012.2/chr19/; s/CM001013.2/chrX/; s/CM001014.2/chrY/;' > chr.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' \ | awk '{printf "%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ | sed -e "s#GL456233.1#chrX_GL456233_random#; s#GL456216.1#chr4_GL456216_random#; s#JH584299.1#chr5_JH584299_random#; s#JH584301.1#chrY_JH584301_random#; s#JH584300.1#chrY_JH584300_random#; s#JH584303.1#chrY_JH584303_random#; 
s#JH584302.1#chrY_JH584302_random#; s#JH584298.1#chr5_JH584298_random#; s#JH584297.1#chr5_JH584297_random#; s#JH584296.1#chr5_JH584296_random#; s#JH584295.1#chr4_JH584295_random#; s#JH584294.1#chr4_JH584294_random#; s#JH584293.1#chr4_JH584293_random#; s#JH584292.1#chr4_JH584292_random#; s#GL456354.1#chr5_GL456354_random#; s#GL456350.1#chr4_GL456350_random#; s#GL456221.1#chr1_GL456221_random#; s#GL456219.1#chr7_GL456219_random#; s#GL456213.1#chr1_GL456213_random#; s#GL456212.1#chr1_GL456212_random#; s#GL456211.1#chr1_GL456211_random#; s#GL456210.1#chr1_GL456210_random#;" > chrUL.asmFrag.bed zgrep -h -v "^#" ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/*.agp.gz \ | awk '$5 != "N"' | sed -e 's/\.1\t/\t/' \ | awk '{printf "chrUn_%s\t%d\t%d\t%s\t0\t%s\n", $1,$2-1,$3,$6,$9}' \ > chrUn.asmFrag.bed cat chr.asmFrag.bed chrUL.asmFrag.bed chrUn.asmFrag.bed > mm10.asmFrag.bed # add the chrM identity echo -e "chrM\t0\t1629\tAY172335.1\t0\t+" >> mm10.asmFrag.bed hgLoadBed mm10 assemblyFrags mm10.asmFrag.bed featureBits mm10 assemblyFrags # 2652769048 bases of 2652783500 (99.999%) in intersection # should be silent when all chr names are correct: checkTableCoords mm10 assemblyFrags ######################################################################### # construct ucscToEnsembl table (DONE - 2012-09-11 - Hiram) mkdir /hive/data/genomes/mm10/ensembl cd /hive/data/genomes/mm10/ensembl wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.toplevel.fa.gz' wget --timestamping \ 'ftp://ftp.ensembl.org/pub/release-68/fasta/mus_musculus/dna/Mus_musculus.GRCm38.68.dna.nonchromosomal.fa.gz' faCount *.fa.gz > faCount.txt egrep -v "total|seq" faCount.txt | awk '{print $1,$2}' \ | sort -u | sort -k2nr | sed -e "s/ /\t/" > ensembl.chrom.sizes mkdir /hive/data/genomes/mm10/bed/ucscToEnsembl cd /hive/data/genomes/mm10/bed/ucscToEnsembl awk '{printf "%d\t%s\n", $2,$1}' ../../chrom.sizes | sort > sizes.chrom.ucsc awk '{printf
"%d\t%s\n", $2,$1}' ../../ensembl/ensembl.chrom.sizes \ | sort > sizes.chrom.ensembl join sizes.chrom.ucsc sizes.chrom.ensembl \ | awk '{printf "%s\t%s\n", $2,$3}' > ucscToEnsembl.tab cut -f1 ucscToEnsembl.tab | awk '{print length($1)}' | sort -rn | head -1 # 20 cat << '_EOF_' > ucscToEnsembl.sql # UCSC to Ensembl chr name translation CREATE TABLE ucscToEnsembl ( ucsc varchar(255) not null, # UCSC chromosome name ensembl varchar(255) not null, # Ensembl chromosome name #Indices PRIMARY KEY(ucsc(20)) ); '_EOF_' hgLoadSqlTab mm10 ucscToEnsembl ucscToEnsembl.sql ucscToEnsembl.tab ######################################################################### # GRC Incident database (DONE - 2012-09-21 - Hiram) # updated the automatic scripts to include the build of this track # on Mm10 # this procedure is run as a cron job in Hiram's account: # 43 09 * * * /hive/data/outside/grc/incidentDb/runUpdate.sh makeItSo # using the two scripts there: runUpdate.sh and update.sh # which are checked into the source tree as files: # src/hg/utils/automation/grcIncidentUpdate.sh # src/hg/utils/automation/grcRunIncidentUpdate.sh # they fetch the XML files from NCBI, convert them to SQL text # files, construct a bigBed file, and push it to genomewiki if # it is an update from previous # the table in the dataBase is: grcIncidentDb # which is the URL to the bb file, a single row: # http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb # construct the table after running the script once manually: hgBbiDbLink mm10 grcIncidentDb \ "http://genomewiki.ucsc.edu/images/a/a4/Mm10.grcIncidentDb.bb" ######################################################################### # GRCm38.p1 patch 1 (DONE - 2012-09-21 - Hiram) mkdir /hive/data/genomes/mm10/bed/patch1 cd /hive/data/genomes/mm10/bed/patch1 rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_mammals/Mus_musculus/GRCm38.p1/ ./genbank/ # slight modifications to this script from hg19 patch9 work: ./gatherNames.pl
genbank > ucscNames.patch1.txt # examine the names for sanity: awk '{print $NF}' ucscNames.patch1.txt | sort # and they should not be longer than 31 characters: awk '{print $NF}' ucscNames.patch1.txt | sort | awk '{print length($0)}' \ | sort -n | tail # script from hg19 patch9, update the variable patchName ./mkTables.pl patches.chrom.sizes ucscNames.patch1.txt genbank/PATCHES/alt_scaffolds/AGP/alt.scaf.agp.gz # output to stdout is the contents of alt.scaf.agp.gz # constructs ctgPos.txt chromInfo.txt gap.txt gold.txt # script from hg19 patch9, update the variable patchName ./mkCtgPos2.pl ucscNames.patch1.txt patches.chrom.sizes > ctgPos2.txt cp -p ../patch5/mkHapLocate.pl . ./mkHapLocate.pl ctgPos.txt \ PATCHES/alt_scaffolds/alt_scaffold_placement.txt \ > haplotypeLocations.bed cp -p haplotypeLocations.bed altSequence.bed ./mkFasta.pl ucscNames.patch1.txt > mm10.patch1.fa # the build of mm10Patch1 can be seen in mm10Patch1.txt egrep -v "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqPatchesP1.tab # no haplotypes yet, this is nothing: egrep "32,32,190" altSequence.bed \ | awk '{printf "%s\t%d\t%d\t%s\t%d\t%s\n", $1,$2,$3,$4,$5,$6}' \ > altSeqHaplotypesP1.tab # verify none lost wc -l altSequence.bed altSeqPatchesP1.tab altSeqHaplotypesP1.tab # 9 altSequence.bed # 9 altSeqPatchesP1.tab # 0 altSeqHaplotypesP1.tab # not necessary, there are none yet: hgLoadBed mm10 altSeqHaplotypesP1 altSeqHaplotypesP1.tab # Loaded 75 elements of size 6 hgLoadBed mm10 altSeqPatchesP1 altSeqPatchesP1.tab # Read 9 elements of size 6 from altSeqPatchesP1.tab # these tables are part of mouse/mm10/altSeqComposite1.ra ############################################################################## # Haplotype track (WORKING - 2012-10-01 - Hiram) # Warning: these are all actually alternate scaffolds from OTHER mouse strains # These haplotypes are NOT from mm10. Probably the table should have been called NonMm10Haplotypes! 
# The directory after genbank/ identifies the strain, e.g. 129S2_SvPas #../../../mm10/genbank/129S2_SvPas/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/129P2_OlaHsd/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/NOD_ShiLtJ/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/A_J/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/CAST_Ei/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/129X1_SvJ/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/AKR_J/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/RIII/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/129S6_SvEvTac/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/129S7_SvEvBrd-Hprt-b-m2/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/BALB_c/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/129S1_SvImJ/alt_scaffolds/alt_scaffold_placement.txt #../../../mm10/genbank/NOD_MrkTac/alt_scaffolds/alt_scaffold_placement.txt cat << '_EOF_' > mkBedFile.pl #!/usr/bin/env perl use strict; use warnings; my $debug = 1; sub usage() { print STDERR "usage: ./mkBedFile.pl ../../mm10/genbank > mm10Haplotypes.bed\n"; print STDERR "expecting the Mus_musculus/GRCm38.p1/ hierarchy in ./genbank from NCBI\n"; exit 255; } my $argc = scalar(@ARGV); if ($argc != 1) { usage; } my $patchDir = shift; if ( ! 
-d $patchDir ) { print STDERR "ERROR: given directory $patchDir is not a directory or does not exist"; usage; } my %glSize; my %ctgToChr; my %ctgToFastaName; # my $fasta = "$patchDir/PATCHES/alt_scaffolds/FASTA/alt.scaf.fa.gz"; my @placeList = split('\n',`find $patchDir -type f | grep placement.txt | grep alt_scaffolds | grep -v UNKNOWN`); for (my $i = 0; $i < scalar(@placeList); ++$i) { printf STDERR "# %s\n", $placeList[$i]; open (FH, "grep -v '^#' $placeList[$i]|") or die "can not read $placeList[$i]"; while (my $line = <FH>) { # printf STDERR "%s", $line; chomp $line; my @a = split('\s+', $line); next if ($a[11] eq "na"); $a[8] = "+" if ($a[8] eq "b"); my $descr = sprintf("<B>Region name: </B>%s", $a[7]); printf "chr%s\t%d\t%d\t%s\t0\t%s\t%s\t%s\n", $a[5], $a[11], $a[12], $a[0], $a[8], $a[3], $descr; } close (FH); } '_EOF_' # << happy emacs chmod +x mkBedFile.pl ./mkBedFile.pl > mm10Haplotypes.bedDetail cat << '_EOF_' > mm10Haplotypes.sql CREATE TABLE mm10Haplotypes ( chrom varchar(255) not null, # Reference sequence chromosome or scaffold chromStart int unsigned not null, # Start position in chromosome chromEnd int unsigned not null, # End position in chromosome name varchar(255) not null, # Short Name of item score int unsigned, # Score from 0-1000 strand char(1), # + or - id varchar(255) not null, # ID to bed used in URL to link back description longblob not null, # Long description of item for the details page #Indices INDEX(chrom, chromStart) ); '_EOF_' hgLoadSqlTab mm10 mm10Haplotypes mm10Haplotypes.sql mm10Haplotypes.bedDetail # trackDb entry: track mm10Haplotypes shortLabel Alt. 
strains longLabel Alternate mouse strains, mapped to reference as haplotypes group varRep priority 111 visibility hide type bedDetail 8 url http://www.ncbi.nlm.nih.gov/nuccore/$$ urlLabel NCBI Nucleotide: ########################################################################## ## CYTOBAND - ideogram track (DONE - 2012-10-19 - Hiram) ssh hgwdev mkdir -p /hive/data/outside/ncbi/ideogram/2012-10 cd /hive/data/outside/ncbi/ideogram/2012-10 # fetch all the ideogram files: rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/pub/gdp/ ./ mkdir /hive/data/genomes/mm10/bed/cytoband cd /hive/data/genomes/mm10/bed/cytoband # Create bed file $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl \ /hive/data/outside/ncbi/ideogram/2012-10/ideogram_10090_GCF_000000055.19_NA_V2 ## can now verify before load: $HOME/kent/src/utils/ncbi/cytoBandVerify.pl # everything checks out OK on 21 chroms # Load the bed file hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \ mm10 cytoBand cytoBand.bed # Read 403 elements of size 5 from cytoBand.bed # Make cytoBandIdeo track for ideogram gif on hgTracks page. # For mouse cytoBandIdeo is just a replicate of the cytoBand track. 
hgsql -e "drop table cytoBandIdeo;" mm10 hgsql mm10 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;" ########################################################################## # CYTOBANDIDEO update - (DONE - 2013-02-27 - kuhn) # adding rows for chroms with no cytology # this is just for navigation/orientation on those chroms set db=mm10 set sql=~/kent/src/hg/lib/cytoBandIdeo.sql # make backup of existing table hgsql -e "CREATE TABLE cytoBandIdeoCopy SELECT * FROM cytoBandIdeo" $db # dump existing table hgsql -N -e "SELECT * FROM cytoBandIdeo" $db > $db.cytoBandIdeo # find chroms already covered hgsql -N -e 'SELECT chrom FROM cytoBandIdeo' $db \ | sort -u > $db.coveredNames # make cytoBand records for chroms not already covered hgsql -N -e 'SELECT chrom, size FROM chromInfo' $db \ | grep -wvf $db.coveredNames \ | awk '{print $1"\t0\t"$2"\t\tgneg"}' > $db.cytoBandNew # check wc -l $db.* # combine and sort cat $db.cytoBandNew $db.cytoBandIdeo > $db.cytoBandIdeoFull bedSort $db.cytoBandIdeoFull $db.cytoBandIdeoFull # replace existing table hgsql -e "DROP TABLE cytoBandIdeo" $db hgLoadSqlTab $db cytoBandIdeo $sql $db.cytoBandIdeoFull # check and then drop copy ########################################################################## # lastz Lamprey petMar2 (DONE - 2012-10-17 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S petMar2 mkdir /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19 cd /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19 cat << '_EOF_' > DEF # Mouse vs.
Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Lamprey PetMar2 SEQ2_DIR=/hive/data/genomes/petMar2/petMar2.2bit SEQ2_LEN=/hive/data/genomes/petMar2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=60 BASE=/hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -qRepeats=windowmaskerSdust \ -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 & # real 218m29.078s cat fb.mm10.chainPetMar2Link.txt # 28262565 bases of 2652783500 (1.065%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzPetMar2.2012-10-19 lastz.petMar2 # and for the swap mkdir /hive/data/genomes/petMar2/bed/blastz.mm10.swap cd /hive/data/genomes/petMar2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPetMar2.2012-10-19/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 & # real 7m2.754s cat fb.petMar2.chainMm10Link.txt # 20923095 bases of 647368134 (3.232%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/petMar2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # lastz White Rhino cerSim1 (DONE - 2012-10-23 - Hiram) # establish a screen to control this job with a name to indicate what it is screen
-S mm10CerSim1 mkdir /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 cd /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 cat << '_EOF_' > DEF # Mouse vs. White Rhino BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: White Rhino CerSim1 SEQ2_DIR=/hive/data/genomes/cerSim1/cerSim1.2bit SEQ2_LEN=/hive/data/genomes/cerSim1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 992m45.890s cat fb.mm10.chainCerSim1Link.txt # 942281365 bases of 2652783500 (35.520%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzCerSim1.2012-10-23 lastz.cerSim1 # and for the swap mkdir /hive/data/genomes/cerSim1/bed/blastz.mm10.swap cd /hive/data/genomes/cerSim1/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCerSim1.2012-10-23/DEF \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -swap -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 62m44s cat fb.cerSim1.chainMm10Link.txt # 926131511 bases of 2366858012 (39.129%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/cerSim1/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # QPCR PRIMERS (DONE - 2012-12-10 - Chin) # The track name is changed to 
"qPCR Primers" # Reload table with new track_mouse.BED (2013-01-28) # Download mkdir /hive/data/outside/Weizmann/qPcrPrimers cd /hive/data/outside/Weizmann/qPcrPrimers wget http://www.weizmann.ac.il/complex/compphys/software/Amit/primers/mouse/track_mouse.BED mkdir -p /hive/data/genomes/mm10/bed/qPcrPrimers cat track_mouse.BED | grep -v track \ > /hive/data/genomes/mm10/bed/qPcrPrimers/qPcrPrimers_mm10.bed cd /hive/data/genomes/mm10/bed/qPcrPrimers hgLoadBed -bedDetail -tab -renameSqlTable \ -sqlTable=$HOME/kent/src/hg/lib/bedDetail.sql \ mm10 qPcrPrimers qPcrPrimers_mm10.bed # Reading qPcrPrimers_mm10.bed # Read 518230 elements of size 14 from qPcrPrimers_mm10.bed # Sorted # Creating table definition for qPcrPrimers # Saving bed.tab # Loading mm10 # NULL descrition column hgsql mm10 -ne "UPDATE qPcrPrimers SET description = NULL;" ######################################################################### # DBSNP B137 / SNP137 (DONE 12/20/12 angie) # Redmine #7043 mkdir -p /hive/data/outside/dbSNP/137/mouse cd /hive/data/outside/dbSNP/137/mouse # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). # Then click into that directory and look for file names like # b(1[0-9][0-9])_*_([0-9]+_[0-9]) # -- use the first num for build and the second num_num for buildAssembly. # jkStuff/liftContigs.lft maps NCBI contig names to chroms; use that for liftUp. # # Some trial and error was required to get the config.ra just right -- # the b* filenames don't include buildAssembly! 
# patch contigs needed to be filtered out: cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/genomes/mm10/jkStuff/liftContigs.lft EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log # Script ended with feedback about needing refAssemblyLabel because dbSNP # mapped to more than one assembly; add the label that clearly corresponds to # mm10, GRCm38, to config.ra and try again: cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/outside/dbSNP/137/mouse/suggested.lft refAssemblyLabel GRCm38 EOF ~/kent/src/hg/utils/automation/doDbSnp.pl -continue=loadDbSnp \ config.ra >>& do.log & tail -f do.log # Script ended with feedback about unrecognized NT_* contigs from dbSNP. # Inspect the script-generated suggested.lft for liftUp; it's usually right. # For contigs that are labeled as part of GRCm38 but not liftable to mm10, # listed in script-generated cantLiftUpSeqNames.txt, do some entrez # nucleotide searches for contig IDs and convince yourself that they're all # for alt assembly sequences that we don't include in mm10 (e.g. patches, # other strains). Then tell the script to filter out those contigs: cut -f 2 cantLiftUpSeqNames.txt > ignoreAltAssemblyContigs.txt cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 137 buildAssembly liftUp /hive/data/outside/dbSNP/137/mouse/suggested.lft refAssemblyLabel GRCm38 ignoreDbSnpContigsFile /hive/data/outside/dbSNP/137/mouse/ignoreAltAssemblyContigs.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl -continue=loadDbSnp \ config.ra >>& do.log & tail -f do.log # ... #MultipleAlignments 1667342 This variant aligns in more than one location. #ObservedMismatch 4561144 UCSC reference allele does not match any observed allele from dbSNP. # # *** All done! # That is an unusually high count of ObservedMismatch... follow up with dbSNP. 
############################################################################# # FILTER SNP137 (DONE 12/21/12 angie) # Redmine #7043 # Make several tracks that are filtered subsets of snp137: # First, filter out the multiply-aligned and/or weight >1 SNPs -> snp137Mult # Second, siphon off the common variants -> snp137Common # Third, take the (uniquely mapped, not known to be common) variants # w/dbSNP's "clinically-assoc" flag -> snp137Flagged cd /hive/data/outside/dbSNP/137/mouse zcat snp137.bed.gz \ | perl -we \ '$minTotal2N = 10; \ ($multCount, $comCount, $flagCount, $miscCount) = (0,0,0,0); \ open($mult, "| gzip -c > snp137Mult.bed.gz") || die; \ open($common, "| gzip -c > snp137Common.bed.gz") || die; \ open($flagged, "| gzip -c > snp137Flagged.bed.gz") || die; \ open($misc, "| gzip -c > snp137Misc.bed.gz") || die; \ while (<>) { \ @w = split("\t"); \ if ($w[16] > 1 || $w[17] =~ /MultipleAlignments/) { \ print $mult $_; \ $multCount++; \ } else { \ my ($alleleFreqCount, $nStr, $freqStr) = ($w[20], $w[22], $w[23]); \ my @alNs = split(",", $nStr); die unless scalar(@alNs) == $alleleFreqCount; \ my @freqs = split(",", $freqStr); die unless scalar(@freqs) == $alleleFreqCount; \ my ($total2N, $maxAlleleFreq) = (0, 0); \ for (my $i = 0; $i < $alleleFreqCount; $i++) { \ $total2N += $alNs[$i]; \ $maxAlleleFreq = $freqs[$i] if ($freqs[$i] > $maxAlleleFreq); \ } \ if ($alleleFreqCount >= 2 && $total2N >= $minTotal2N && $maxAlleleFreq <= 0.99) { \ print $common $_; \ $comCount++; \ } elsif($w[24] =~ /clinically-assoc/) { \ print $flagged $_; \ $flagCount++; \ } else { \ print $misc $_; \ $miscCount++; \ } \ } \ } \ close($mult); close($common); close($flagged); close($misc); \ print "snp137Mult: $multCount\nsnp137Common: $comCount\nsnp137Flagged: $flagCount\n" . \ "leftover: $miscCount\n";' #snp137Mult: 1671771 #snp137Common: 2709532 #snp137Flagged: 0 #leftover: 66537658 # It's expected for snp137Flagged to be empty because that's for human SNPs. 
# Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp137$subset -sqlTable=snp137.sql snp137$subset.bed.gz end ############################################################################ # DBSNP CODING ANNOTATIONS (137) (DONE 12/21/12 angie) # Redmine #7043 cd /hive/data/outside/dbSNP/137/mouse # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #1884989 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 371388 3 (coding-synon) #1301099 8 (cds-reference -- ignored) # 3465 41 (nonsense) # 199148 42 (missense) # 319 43 (stop-loss) # 7422 44 (frameshift) # 2148 45 (cds-indel) # In b137, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... 
# probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut;' \ ncbiCodingAnnotations.txt \ | liftUp snp137CodingDbSnp.bed /hive/data/outside/dbSNP/137/mouse/suggested.lft warn stdin hgLoadBed mm10 snp137CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp137CodingDbSnp.bed #Read 552120 elements of size 11 from snp137CodingDbSnp.bed ######################################################################### # RETROPOSED GENES ucscRetro track VERSION 2 # (2013-04-03 - 2013-04-17, baertsch,hartera DONE) mkdir -p /hive/hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403 cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403 mkdir -p /hive/data/genomes/mm10/bed/retro/ cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403 cat << '_EOF_' > DEF RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 " RUNDATE="2013-04-03" DB=mm10 SCORETHRESH=510 GENOMENAME='Mus musculus' GBDB=mm DATE=20130403 MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz TMPMRNA=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/mrnaBlastz/$DB TMPEST=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/est/$DB BINDIR=/hive/users/hartera/GencodeWG/retroFinder/trunk/bin EST=all_est SPLICED_EST=intronEst SPLIT_EST=0 SPLIT_SPLICED_EST=1 SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/trunk/src/pipeline GENOME=/hive/data/genomes TWOBIT=$GENOME/$DB/$DB.2bit RETRODIR=$GENOME/$DB/bed/retro BASE=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/retro VERSION=2 OUTDIR=${BASE}/${DB}.${VERSION} RESULT=$OUTDIR/result LOG=$OUTDIR/log OUT=$OUTDIR/out OVERLAPDIR=$OUTDIR/run.o TABLE=ucscRetroInfo$VERSION ORTHOTABLE=ucscRetroOrtho$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB NIB=$LOCAL/nib RMSK=rmsk NET1=netHg19 NET2=netCanFam3 NET3=netRn5 GENE1=knownGene GENE2=refGene GENE3=ensGene CLUSTER=swarm SPECIES="hg19 mm10" ROOTDIR="/cluster/home/$USER/public_html/retro/mm10Apr13" 
WEBROOT=$ROOTDIR/retro.$RUNDATE WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE=affyGnf1m ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio ARRAYABS=hgFixed.gnfMouseAtlas2All ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps # ARRAYLOOKUP=knownToGnfAtlas2 #ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl" ALTSPLICE=sibTxGraph SPLITBYAGE=splitRetrosByAgeMouse PDB=proteins121210 BREAKS=0,8,16,24,32 XLIM=34 YLIM=0.1 YLIM1=4000 YLIM2=160 MAXDIVERGENCE=32 '_EOF_' # << happy emacs chmod +x DEF mkdir mrnaBlastz cd mrnaBlastz cp ../DEF . # Create S1.len: cp /hive/data/genomes/mm10/chrom.sizes S1.len # Edit S1.len and remove chrM and random chroms then copy over to mm10 # genomes directory mkdir -p /hive/data/genomes/mm10/bed/mrnaBlastz cp S1.len /hive/data/genomes/mm10/bed/mrnaBlastz screen # Run steps 1 to 6 of RetroFinder pipeline from scripts in CCDS SVN source tree: retroFinder/trunk/src/pipeline/ucscStep1.sh DEF # check cluster job on swarm retroFinder/trunk/src/pipeline/ucscStep2.sh DEF retroFinder/trunk/src/pipeline/ucscStep3.sh DEF #check cluster job retroFinder/trunk/src/pipeline/ucscStep4.sh DEF #check cluster job # Load the track retroFinder/trunk/src/pipeline/ucscStep5.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403/retro/mm10.2 retroFinder/trunk/src/pipeline/filterMrna.sh DEF retroFinder/trunk/src/pipeline/filterEst.sh DEF retroFinder/trunk/src/pipeline/analyseExpress.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20130403/mrnaBlastz retroFinder/trunk/src/pipeline/ucscStep6.sh DEF #added ucscRetroAli to trackDb.ra # copied # /hive/groups/gencode/pseudogenes/retroFinder/mm10/20130403/retro/mm10.2/trackDb.retro # entry to kent/src/hg/makeDb/trackDb/mouse/mm10/trackDb.ra # and edited it to add version number and date. 
# Scripts copied ucscRetroAli2.psl, ucscRetroInfo2.bed and ucscRetroCds2.tab # to /hive/data/genomes/mm10/bed/retro/ ############################################################################## # LASTZ shrew sorAra2 (DONE - 2013-06-12 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10SorAra2 mkdir /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 cd /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # shrew vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: shrew SorAra2 SEQ2_DIR=/hive/data/genomes/sorAra2/sorAra2.2bit SEQ2_LEN=/hive/data/genomes/sorAra2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=40 BASE=/hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 785m32.163s cat fb.mm10.chainSorAra2Link.txt # 354499462 bases of 2652783500 (13.363%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzSorAra2.2013-06-12 lastz.sorAra2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12 time doRecipBest.pl mm10 sorAra2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 24m38.069s mkdir /hive/data/genomes/sorAra2/bed/blastz.mm10.swap cd /hive/data/genomes/sorAra2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSorAra2.2013-06-12/DEF \ -swap -syntenicNet \ 
-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 39m53.463s cat fb.sorAra2.chainMm10Link.txt # 343760052 bases of 2192103426 (15.682%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/sorAra2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ tenrec echTel2 (DONE - 2013-06-12 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10EchTel2 mkdir /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 cd /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # tenrec vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: tenrec EchTel2 SEQ2_DIR=/hive/data/genomes/echTel2/echTel2.2bit SEQ2_LEN=/hive/data/genomes/echTel2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 1006m3.874s cat fb.mm10.chainEchTel2Link.txt # 384570981 bases of 2652783500 (14.497%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s lastzEchTel2.2013-06-12 lastz.echTel2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12 time doRecipBest.pl mm10 echTel2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 
27m58.816s # and, for the swap mkdir /hive/data/genomes/echTel2/bed/blastz.mm10.swap cd /hive/data/genomes/echTel2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEchTel2.2013-06-12/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 43m0.194s cat fb.echTel2.chainMm10Link.txt # 380872172 bases of 2605196361 (14.620%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/echTel2/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################## # LASTZ alpaca vicPac2 (DONE - 2013-06-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10VicPac2 mkdir /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 cd /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 cat << '_EOF_' > DEF # mouse vs alpaca BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.02/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: alpaca VicPac2 SEQ2_DIR=/hive/data/genomes/vicPac2/vicPac2.2bit SEQ2_LEN=/hive/data/genomes/vicPac2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 2156m48.687s cat fb.mm10.chainVicPac2Link.txt # 797843091 bases of 2652783500 (30.076%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/mm10/bed ln -s 
lastzVicPac2.2013-06-19 lastz.vicPac2 # better to have reciprocal best for this one since it is low coverage: cd /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19 time doRecipBest.pl mm10 vicPac2 -buildDir=`pwd` -workhorse=hgwdev \ > best.log 2>&1 & # real 33m49.271s mkdir /hive/data/genomes/vicPac2/bed/blastz.mm10.swap cd /hive/data/genomes/vicPac2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzVicPac2.2013-06-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 85m53.924s cat fb.vicPac2.chainMm10Link.txt # 783682127 bases of 2078582856 (37.703%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/vicPac2/bed ln -s blastz.mm10.swap lastz.mm10 ######################################################################### # create ucscToINSDC name mapping (DONE - 2013-08-15 - Hiram) # this allows the "ensembl" blue bar button to appear mkdir /hive/data/genomes/mm10/bed/ucscToINSDC cd /hive/data/genomes/mm10/bed/ucscToINSDC cat << '_EOF_' > translateNames.sh #!/bin/sh grep -v "^#" ../../genbank/Primary_Assembly/assembled_chromosomes/chr2acc \ | sed -e 's/^/chr/' zcat ../../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz | grep -v "^#" | cut -f1 | sort -u \ | sed -e 's/^\([A-Za-z0-9]*\).\([0-9]*\)/chrUn_\1\t\1.\2/;' grep -v "^#" \ ../../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf \ | sed -e 's/^\([A-Za-z0-9]*\)\t\([A-Za-z0-9]*\).\([0-9]*\)/chr\1_\2_random\t\2.\3/;' echo -e "chrM\tNC_005089.1" '_EOF_' # << happy emacs chmod +x translateNames.sh ./translateNames.sh | sort > ucscToINSDC.txt join <(sort ../../chrom.sizes) ucscToINSDC.txt \ | awk '{printf "%s\t0\t%d\t%s\n", $1, $2, $3}' > ucscToINSDC.tab # maximum size of UCSC chrom name for SQL index cut -f1 ucscToINSDC.tab | awk '{print length($0)}' | sort -n | tail 
-1 # 20 sed -e 's/21/20/' $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab mm10 ucscToINSDC stdin ucscToINSDC.tab # verify the track link to INSDC functions ############################################################################## # MGI LIFTOVER FROM mm9 ( 2013-11-14 Pauline) ssh kolossus mkdir /cluster/data/mm10/bed/jaxLiftOver cd /cluster/data/mm10/bed/jaxLiftOver liftOver -minBlocks=0.5 /cluster/data/mm9/bed/jax/2011_06/jaxQtl.bed \ /cluster/data/mm9/bed/liftOver/mm9ToMm10.over.chain.gz \ -bedPlus=6 -tab jaxQtlLift.{bed,unmapped} wc -l jaxQtlLift.{bed,unmapped} #Old 1539 jaxQtlLift.bed #Old 12 jaxQtlLift.unmapped # 1883 jaxQtlLift.bed # 14 jaxQtlLift.unmapped # Numbers are of same order of magnitude (yay?) proceeding... # Load lifted track tables and original auxiliary tables: ssh hgwdev cd /cluster/data/mm10/bed/jaxLiftOver # jaxQTLLift #didn't run this sed command (prob already been done to this file?) sed -e 's/jaxQTL/jaxQTLLift/g'\ ~/kent/src/hg/lib/jaxQTL.sql > jaxQTLLift.sql #ran this (used this instead of hgLoadBed at Hiram's suggestion): hgLoadSqlTab mm10 JaxQtl $HOME/kent/src/hg/lib/jaxQtl.sql \ /cluster/data/mm10/bed/jaxLiftOver/jaxQtlLift.bed checkTableCoords mm10 JaxQTLLift #got no output (yay!) #found out hgLoadSqlTab doesn't load a positionally sorted table, sorting bed #file and reloading: sort -k1,1 -k2,2n jaxQtlLift.bed > jaxQtlLiftSorted.bed hgLoadSqlTab mm10 jaxQtl $HOME/kent/src/hg/lib/jaxQtl.sql \ /cluster/data/mm10/bed/jaxLiftOver/jaxQtlLiftSorted.bed ############################################################################## # DBSNP B138 / SNP138 (DONE 1/17/14 angie) # RedMine #12490 screen mkdir -p /hive/data/outside/dbSNP/138/mouse cd /hive/data/outside/dbSNP/138/mouse # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). 
# Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 138 buildAssembly EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log # Some trial and error was required to get the config.ra just right. # First stop: need a refAssemblyLabel: # *** This release contains more than one assembly label. # *** Please examine this list in case we need to exclude any of these: # #GRCm38.p1 #Mm_Celera # *** Add refAssemblyLabel to config.ra. If keeping all labels, it will # *** look like this: # #refAssemblyLabel GRCm38.p1,Mm_Celera # # *** Edit out any of those that are not included in mm10 (e.g. Celera). # *** Then restart this script with -continue=loadDbSnp . cat >> config.ra <<EOF refAssemblyLabel GRCm38.p1 EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # Second stop: need to grab the NCBI Assembly Reports file for GRCm38; the # script will do its best to deduce the needed liftUp entries and contigs # to ignore (because they are for alternate mouse strains, or patch contigs etc). #*** b138_ContigInfo has coords for 119 sequences; these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . #*** 152 lines of b138_ContigInfo.bcp.gz either had no lift-coords #*** or had unrecognized chrom names; see #*** /hive/data/outside/dbSNP/138/mouse/cantLiftUpSeqNames.txt . # #*** You must account for those in config.ra, in the liftUp file #*** and/or ignoreDbSnpContigsFile or the ignoreDbSnpContigs regex. #*** Then run again with -continue=loadDbSnp . # #*** NOTE: If you add the ncbiAssemblyReportFile setting to config.ra and #*** run again with -continue=loadDbSnp, this script may be able to #*** construct those files for you. 
# Look at the doDbSnp.pl -help message for instructions about how to find the # Assembly Reports file for GRCm38 on the NCBI web site. wget ftp://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001635.22.assembly.txt cat >> config.ra <<EOF ncbiAssemblyReportFile GCF_000001635.22.assembly.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # Third stop: review the list of dbSNP contigs that we can't map, and if they're # all contigs not in our assembly, tell config.ra to ignore them. #*** b138_ContigInfo has coords for 119 sequences; these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . # #*** GCF_000001635.22.assembly.txt has mappings for 44 sequences; #*** these have been written to #*** /hive/data/outside/dbSNP/138/mouse/suggested.lft . # #*** 108 lines of b138_ContigInfo.bcp.gz contained contig names that #*** could not be mapped to chrom.size via their GenBank contig mappings; see #*** /hive/data/outside/dbSNP/138/mouse/cantLiftUpSeqNames.txt . # #*** You must account for all 271 contig_acc values in config.ra, #*** in the liftUp file and/or ignoreDbSnpContigsFile. #*** Then run again with -continue=loadDbSnp . cut -f 2 cantLiftUpSeqNames.txt > contigsNotInUCSC.txt cat >> config.ra <<EOF liftUp suggested.lft ignoreDbSnpContigsFile contigsNotInUCSC.txt EOF ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=loadDbSnp >>& do.log & tail -f do.log # The script died with an error implying that a perl command in a pipe got # empty input from sort which was getting input from an hgsql query to join # Batch submitter handles with rs# snp_id's. Looks like the mysql connection # was lost or something. 
Anyway, re-running that part of addToDbSnp.csh # in 2 parts and continuing manually through the end of addToDbSnp.csh: pushd `cat workingDir ` hgsql mm10snp138 -NBe 'select SNPSubSNPLink.snp_id, handle from SubSNP, SNPSubSNPLink, Batch \ where SubSNP.subsnp_id = SNPSubSNPLink.subsnp_id and \ SubSNP.batch_id = Batch.batch_id' \ | sort -k1n,1n -k2,2 -u \ > tmp.txt perl -we 'while (<>) { \ chomp; my ($id, $handle) = split("\t"); \ if (defined $prevId && $prevId != $id) { \ print "$prevId\t$handleCount\t$handleBlob\n"; \ $handleCount = 0; $handleBlob = ""; \ } \ $handleCount++; \ $handleBlob .= "$handle,"; \ $prevId = $id; \ } \ print "$prevId\t$handleCount\t$handleBlob\n";' \ tmp.txt > ucscHandles.txt cat > ucscHandles.sql <<EOF CREATE TABLE ucscHandles ( snp_id int NOT NULL, handleCount int unsigned NOT NULL, handles longblob NOT NULL, INDEX snp_id (snp_id) ); EOF hgLoadSqlTab mm10snp138 ucscHandles{,.sql,.txt} # I added 'if (0) then' around the parts of addToDbSnp.csh that completed successfully; # complete the step by running the modified script: # Pop back out of workingDir popd addToDbSnp.csh >>& do.log & tail -f do.log # Now continue with the next step: ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra -continue=bigJoin >>& do.log & tail -f do.log # *** All done! 
############################################################################## # FILTER SNP138 (DONE 1/17/14 angie) cd /hive/data/outside/dbSNP/138/mouse zcat snp138.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 3066546 #Common: 8082414 #Flagged: 0 #leftover: 60824824 foreach f ({Mult,Common}.bed.gz) mv $f snp138$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp138$subset -sqlTable=snp138.sql snp138$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (138) (DONE 1/17/14 angie) cd /hive/data/outside/dbSNP/138/mouse # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #1584257 ncbiCodingAnnotations.txt # How many & what kinds of function types? 
cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 372821 3 (coding-synon) # 552828 8 (cds-reference -- ignored) # 376 41 (nonsense) # 181984 42 (missense) # 49 43 (stop-loss) # 3382 44 (frameshift) # 472817 45 (cds-indel) # In b138, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... # probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut;' \ ncbiCodingAnnotations.txt \ | liftUp snp138CodingDbSnp.bed suggested.lft warn stdin hgLoadBed mm10 snp138CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp138CodingDbSnp.bed #Read 1025678 elements of size 11 from snp138CodingDbSnp.bed ############################################################################## 2013-12-13: import of UCSC GENCODE group processing of GENCODE VM2 (markd) mkdir -p /hive/data/genomes/mm10/bed/gencodeVM2 cd /hive/data/genomes/mm10/bed/gencodeVM2 # create Makefile from previous one. cp /hive/data/genomes/hg19/bed/gencodeV19/Makefile . # download, build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. # NOT DONE THIS TIME, SINCE THIS is the first mouse. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM2 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp human/mm10/wgEncodeGencodeV18.ra human/mm10/wgEncodeGencodeVM2.ra cp human/mm10/wgEncodeGencodeV18.html human/mm10/wgEncodeGencodeVM2.html # edit these plus human/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM2.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous -0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. 
### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM2", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM2 make joinerCheck # see output in check/joiner.out ############################################################################## # SEGMENTAL DUPLICATIONS (WORKING 4/14/14 Pauline) # File emailed from John Huddleston (jlhudd@uw.edu) in the Eichler Lab. mkdir /hive/data/genomes/mm10/bed/genomicSuperDups cd /hive/data/genomes/mm10/bed/genomicSuperDups wget --timestamping 'http://mouseparalogy.gs.washington.edu/GRCm38/genomicSuperDup.tab' mv genomicSuperDup.tab mm10_WGAC.tab awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm10_WGAC.tab \ | hgLoadBed mm10 genomicSuperDups stdin \ -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql # mm8 version of track had issue where strand values were "+" and "_" -- # checked and found same issue - so ran same fix: hgsql mm10 -e 'update genomicSuperDups set strand = "-" where strand = "_";' #new mm10 version has a lot more stuff than version on mm8: #featureBits mm8 genomicSuperDups #157417547 bases of 2567283971 (6.132%) in intersection #featureBits mm10 genomicSuperDups #214917441 bases of 2652783500 (8.102%) in intersection #select count(*) from genomicSuperDups; #659775 (vs. 
277816 in mm8) # ######################################################################### # hgPal downloads (DONE braney 2009-11-03) # FASTA from 60way for refGene, knownGene, knownCanonical ssh hgwdev screen bash rm -rf /cluster/data/mm10/bed/multiz60way/pal mkdir /cluster/data/mm10/bed/multiz60way/pal cd /cluster/data/mm10/bed/multiz60way/pal for i in `cat ../species.list`; do echo $i; done > order.lst mz=multiz60way gp=refGene db=mm10 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.jobs nice time sh -x $gp.jobs > $gp.jobs.log 2>&1 & sleep 1 tail -f $gp.jobs.log # 1817.21user 233.92system 4:54:04elapsed 11%CPU (0avgtext+0avgdata # 920192maxresident)k # 6024inputs+0outputs (7major+1648126minor)pagefaults 0swaps mz=multiz60way gp=refGene db=mm10 zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc # we're only distributing exons at the moment mz=multiz60way gp=refGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz mz=multiz60way gp=knownGene db=mm10 mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -chrom=$j $db $mz $gp order.lst stdout | \ gzip -c > 
ppredAA/$j.ppredAA.fa.gz" echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # oops... missed the timing mz=multiz60way gp=knownGene db=mm10 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz # now do the canonical set cd /cluster/data/mm10/bed/multiz60way/pal mz=multiz60way gp=knownCanonical db=mm10 for j in `awk '{print $1}' /cluster/data/mm10/chrom.sizes` do echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed done mkdir exonAA exonNuc ppredAA ppredNuc for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'` do echo "date" echo "mafGene -geneBeds=$j.known.bed $db $mz knownGene order.lst stdout | \ gzip -c > ppredAA/$j.ppredAA.fa.gz" echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > ppredNuc/$j.ppredNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | \ gzip -c > exonNuc/$j.exonNuc.fa.gz" echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | \ gzip -c > exonAA/$j.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & sleep 1 tail -f $gp.$mz.job.log # real 
302m20.489s # user 27m31.179s # sys 5m30.071s rm *.known.bed mz=multiz60way gp=knownCanonical db=mm10 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz rm -rf exonAA exonNuc ppredAA ppredNuc mz=multiz60way gp=knownCanonical db=mm10 pd=/usr/local/apache/htdocs/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ############################################################################## # LASTZ Rhesus rheMac2 (DONE - 2014-05-23 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 cd /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 cat << '_EOF_' > DEF # rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.52/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac2 SEQ2_DIR=/scratch/data/rheMac2/rheMac2.2bit SEQ2_LEN=/scratch/data/rheMac2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs # establish a screen to control this job screen -S mm10RheMac2 time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 & # real 257m21.255s cat fb.mm10.chainRheMac2Link.txt # 895296744 bases of 2652783500 (33.749%) in intersection mkdir /hive/data/genomes/rheMac2/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac2/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac2.2014-05-23/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \ 
-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 & # real 74m36.098s cat fb.rheMac2.chainMm10Link.txt # 875700775 bases of 2646704109 (33.086%) in intersection ############################################################################ # FaceBase Microarray track (DONE - 2014-05-21 - Pauline) # establish a screen to control this job with a name to indicate what it is mkdir /hive/data/genomes/mm10/bed/FaceBase24SampleTypesAvg cd /hive/data/genomes/mm10/bed/FaceBase24SampleTypesAvg wget --timestamping http://genomebrowser.facebase.org/myHub/mm10/FaceBase_24Samp_Types_Averaged.bed hgLoadBed mm10 FaceBase24SampleTypesAvg FaceBase_24Samp_Types_Averaged.bed #For microarray tracks also need to add a section to #/cluster/home/pauline/kent/src/hg/makeDb/hgCgiData/Mouse/microarrayGroups.ra ############################################################################## # RepeatMasker Visualization track (DONE - 2014-07-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/rmskJoined cd /hive/data/genomes/mm10/bed/rmskJoined ln -s ../repeatMasker/mm10.sorted.fa.out . ln -s ../repeatMasker/mm10.fa.align.gz . 
# working on fixing this script for the next release of RM
# since mm10 was an older version of RM, this conversion needs the
# bedtools, thus the extra PATH business
    export PATH=/cluster/bin/bedtools:$PATH
    /scratch/data/RepeatMasker140131/util/nextVerRmToUCSCTables.pl \
        -out mm10.sorted.fa.out -align mm10.fa.align.gz

    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \
        -renameSqlTable -verbose=4 -tab \
        -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as mm10 \
        rmskJoinedBaseline mm10.sorted.fa.join.bed \
        > loadJoined.log 2>&1

    hgLoadSqlTab mm10 rmskAlignBaseline \
        /cluster/home/hiram/kent/src/hg/lib/rmskAlign.sql \
        mm10.fa.align.tsv > loadAlign.log 2>&1

    hgLoadOutJoined -verbose=2 mm10 mm10.sorted.fa.out > loadOut.log 2>&1

    featureBits -countGaps mm10 rmskJoinedBaseline
    # 2243474717 bases of 2730871774 (82.152%) in intersection

##############################################################################
# cloneEnds (DONE - 2014-08-11 - Steve)

    mkdir /hive/data/genomes/mm10/bed/cloneEnds
    cd /hive/data/genomes/mm10/bed/cloneEnds

    # fetch the NCBI INSDC name correspondence file:
    rsync -a -P rsync://ftp.ncbi.nlm.nih.gov/genomes/ASSEMBLY_REPORTS/All/GCF_000001635.23.assembly.txt ./

    # fetch the clone reports
    mkdir reports
    rsync -a -P \
rsync://ftp.ncbi.nih.gov/repository/clone/reports/Mus_musculus/*.GCF_000001635.22.103.*.gff \
       ./reports/

    # script to establish refSeq to UCSC chrom names:
    # (reads the assembly report fetched above, writes two tab-separated
    #  columns to stdout: refSeq accession, UCSC chrom name)

    cat << '_EOF_' > refSeqNames.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $report = "GCF_000001635.23.assembly.txt";
open (my $fh, "<", $report) or die "can not read $report: $!";
while (my $line = <$fh>) {
  chomp $line;
  next if ($line =~ m/^#/);
  my @a = split('\t', $line);
  # column use below: [1] sequence role, [2] chromosome name,
  # [4] contig accession, [6] refSeq accession
  my $chrN = $a[2];
  my $refSeq = $a[6];
  my $contig = $a[4];
  my $type = $a[1];
  # short (malformed) rows leave these undefined, skip them
  next if (!defined $type);
  next if (!defined $refSeq);
  next if (!defined $contig);
  my $suffix = "";
  if ($type eq "alt-scaffold") {
    $suffix = "_alt";
  } elsif ($type eq "unlocalized-scaffold") {
    $suffix = "_random";
  } elsif ($type eq "unplaced-scaffold") {
    $chrN = "Un";
  }
  $chrN = "M" if ($chrN eq "MT");
  # an underscore in the first column selects the scaffold naming form
  # chrN_contig[_suffix]; everything else is a plain chrN
  if ($a[0] =~ m/_/) {
    # drop the whole ".version" suffix; the previous single-digit pattern
    # left a stray digit behind for versions of 10 or more
    $contig =~ s/\.[0-9]+$//;
    printf "%s\tchr%s_%s%s\n", $refSeq, $chrN, $contig, $suffix;
  } else {
    printf "%s\tchr%s\n", $refSeq, $chrN;
  }
}
close ($fh);
'_EOF_'
    # << happy emacs

    chmod +x refSeqNames.pl

    ./refSeqNames.pl > refSeq.ucscName.tab

    # establish full library list:
    ls reports/*.GCF_000001635.22.103.*.gff | sed -e 's#reports/##' \
       | cut -d"." -f1 | sort -u > library.list.txt

    # a script to scan the GFF files, with the refSeq.ucscName.tab
    # name correspondence to construct bed files
    # NOTE: the script writes the joined clone items (bed12) to stdout and
    # the individual bed6 items, mixed with "# ..." diagnostics, to stderr;
    # the driver loop below captures stderr as tmp.bed6, so every
    # diagnostic must end in a newline to stay on its own line.

    cat << '_EOF_' > mm10.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $argc = scalar(@ARGV);

if ($argc < 1) {
  print STDERR "usage: ./mm10.pl <report.gff> [moreReports.gff]\n";
  exit 255;
}

my %refSeqToUcsc;   # key is refSeq name, value is UCSC chrom name
open (my $nameFh, "<", "refSeq.ucscName.tab")
    or die "can not read refSeq.ucscName.tab: $!";
while (my $line = <$nameFh>) {
  chomp $line;
  my ($refSeq, $ucsc) = split('\t', $line);
  $refSeqToUcsc{$refSeq} = $ucsc;
}
close ($nameFh);

my %chromSizes;   # key is UCSC chrom name, value is chrom size
open (my $sizeFh, "<", "/hive/data/genomes/mm10/chrom.sizes")
    or die "can not read mm10/chrom.sizes: $!";
while (my $line = <$sizeFh>) {
  chomp $line;
  my ($chr, $size) = split('\t', $line);
  $chromSizes{$chr} = $size;
}
close ($sizeFh);

while (my $file = shift) {
  my %starts;   # key is parent ID, value is "chromStart\tchromEnd" of the start end
  my %ends;     # key is parent ID, value is "chromStart\tchromEnd" of the end end
  my %parents;  # key is parent ID, value is the UCSC chrom name of the parent
  my %endNames; # key is parent ID, value is the Name of the parent clone_insert

  # messages are emitted with print, not printf, so that a '%' in a file,
  # contig or clone name can never be taken as a format directive
  print STDERR "# processing $file\n";

  open (my $gff, "<", $file) or die "can not read $file: $!";
  while (my $line = <$gff>) {
    chomp $line;
    next if ($line=~ m/^#/);
    my @a = split('\t', $line);
    next if (scalar(@a) < 1);
    my $contig = $a[0];
    # presumably the sequence column looks like "ref|NC_000067.6|";
    # reduce it to the bare accession used as key in refSeqToUcsc
    $contig =~ s/ref.//;
    $contig =~ s/\|//;
    my $ucscChr = $refSeqToUcsc{$contig};
    if (!defined($ucscChr)) {
      print STDERR "# ERR: contig not in refSeqToUcsc: '$contig'\n";
      next;
    }
    next if (! exists($chromSizes{$ucscChr}));
    my $chromSize = $chromSizes{$ucscChr};
    my $chromStart = $a[3] - 1;   # GFF start is 1-based, bed start is 0-based
    my $chromEnd = $a[4];
    # clamp coordinates that run off the end of the chromosome
    if ($chromStart > $chromSize) {
      print STDERR "# warning chromStart over size $ucscChr $chromStart $chromEnd\n";
      $chromStart = $chromSize-1;
    }
    if ($chromEnd > $chromSize) {
      my $overRun = $chromEnd - $chromSize;
      print STDERR "# warning chromEnd over size by $overRun -> $ucscChr $chromStart $chromEnd\n";
      $chromEnd = $chromSize;
    }
    my $id="notFound";
    my $name="notFound";
    my $parent="notFound";
    # column 9 carries the tag=value attributes: ID, Parent, Name
    my @b = split(';', $a[8]);
    for (my $i = 0; $i < scalar(@b); ++$i) {
      my ($tag, $value) = split('=', $b[$i]);
      if ($tag eq "ID") {
        $id = $value;
        # an ID without '-' is recorded as a parent (clone_insert) ID
        if ($id !~ m/-/) {
          if (exists($parents{$id})) {
            # the newline is required: without it this warning fuses with
            # the next bed6 row written to the same stream
            print STDERR "# WARN: duplicate parent: $id\n";
          } else {
            $parents{$id} = $ucscChr;
          }
        }
      } elsif ($tag eq "Parent") {
        $parent = $value;
      } elsif ($tag eq "Name") {
        $name = $value;
      }
    }
    my $type="notFound";
    my $insertType = $a[2];
    if ($insertType =~ m/clone_insert_start/) {
      $type = "start";
      if ($parent eq "notFound") {
        print STDERR "# ERR: can not find parent for start $name Ttype $id\n";
      } else {
        if (!exists($parents{$parent})) {
          print STDERR "# ERR: start found $name with no parent $parent declared\n";
        } elsif (exists($starts{$parent})) {
          print STDERR "# ERR: duplicate start for $parent\n";
        } elsif ($ucscChr eq $parents{$parent}) {
          $starts{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
        } else {
          print STDERR "# ERR: start on different chrom $ucscChr than parent $parent $parents{$parent}\n";
        }
      }
    } elsif ($insertType =~ m/clone_insert_end/) {
      $type = "end";
      if ($parent eq "notFound") {
        print STDERR "# ERR: can not find parent for end $name Ttype $id\n";
      } else {
        if (!exists($parents{$parent})) {
          print STDERR "# ERR: end found $name with no parent $parent declared\n";
        } elsif (exists($ends{$parent})) {
          print STDERR "# ERR: duplicate end for $parent\n";
        } elsif ($ucscChr eq $parents{$parent}) {
          $ends{$parent} = sprintf("%s\t%s", $chromStart, $chromEnd);
        } else {
          print STDERR "# ERR: end on different chrom $ucscChr than parent $parent $parents{$parent}\n";
        }
      }
    } elsif ($insertType =~ m/clone_insert/) {
      $type = "insert";
      $endNames{$id} = $name;
    }
    $name =~ s/gi\|//g;
    $id =~ s/gi\|//g;
    # one bed6 row per GFF feature, written to stderr on purpose (see the
    # driver loop, which captures stderr as the items file)
    printf STDERR "%s\t%d\t%d\t%s_%s_%s\t0\t%s\n", $ucscChr, $chromStart, $chromEnd, $name, $type, $id, $a[6];
  }	# while (my $line = <$gff>)
  close ($gff);

  # join each parent's start and end into one two-block bed12 item on stdout
  foreach my $parent (keys %parents) {
    if (! exists($starts{$parent}) ) {
      print STDERR "# ERR: no start for $parent\n";
    } elsif (! exists($ends{$parent}) ) {
      print STDERR "# ERR: no end for $parent\n";
    } else {
      my $strand = "+";
      my $chrStart = 0;
      my $chrEnd = 0;
      my $blockStart = 0;
      my ($sStart, $sEnd) = split('\t', $starts{$parent});
      my ($eStart, $eEnd) = split('\t', $ends{$parent});
      my $startSize = $sEnd - $sStart;
      my $endSize = $eEnd - $eStart;
      if ($eStart < $sStart) {
        # the "end" feature lies left of the "start" feature: minus strand,
        # and the block sizes swap so the first block is the leftmost one
        $chrStart = $eStart;
        $chrEnd = $sEnd;
        $blockStart = $sStart - $chrStart;
        $strand = "-";
        $startSize = $eEnd - $eStart;
        $endSize = $sEnd - $sStart;
      } else {
        $chrStart = $sStart;
        $chrEnd = $eEnd;
        $blockStart = $eStart - $chrStart;
      }
      if ($startSize > $blockStart) {
        # first block would overlap the second; report instead of emitting
        print STDERR "# startSize > blockStart $endNames{$parent}\n";
      } else {
        printf "%s\t%d\t%d\t%s\t0\t%s\t%d\t%d\t0\t2\t%d,%d\t0,%d\n", $parents{$parent}, $chrStart, $chrEnd, $endNames{$parent}, $strand, $chrStart, $chrEnd, $startSize, $endSize, $blockStart;
      }
    }
  }
}
'_EOF_'
    # << happy emacs

    chmod +x mm10.pl

    # process GFF files into bed files into separateLibs/ directory
    # (stdout of the script is the joined bed; stderr is the bed6 items)
for L in `cat library.list.txt`
do
   export db="`pwd -P | awk -F'/' '{print $5}'`"
   export destDir="separateLibs/${L}"
   echo "working: ${L}"
   mkdir -p "${destDir}"
   ./${db}.pl reports/${L}.GCF_000001635.22.103.*.gff \
       2> ${destDir}/tmp.bed6 | sort -k1,1 -k2,2n > ${destDir}/${db}.${L}.bed
   sort -k1,1 -k2,2n ${destDir}/tmp.bed6 > ${destDir}/${db}.${L}.items.bed6
done

    # use only those libraries with more than 20,000 clone ends
    wc -l separateLibs/*/*.bed | sort -n | grep -v total | awk '$1 >
20000' \
    | sed -e 's#.*separateLibs/##; s#/.*##' > libs.over20K.list

# note those libraries with less than 20,000 clone ends
wc -l separateLibs/*/*.bed | grep -v total | awk '$1 < 20000' | sed -e 's#.*separateLibs/##; s#/.*##' > libs.under20K.list

# filter out bad ends, length must be < median size times three
# (items at or above that length go to the badMap file)
# reads the libs.over20K.list written above
cat libs.over20K.list | while read L
do
    if [ ! -s separateLibs/${L}/lengths.txt ]; then
        awk '{print $3-$2}' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/lengths.txt
    fi
    median3X=`ave separateLibs/${L}/lengths.txt | grep median | awk '{printf "%d", $2*3}'`
    awk '($3-$2) < '$median3X'' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/mm10.median3X.bed
    awk '($3-$2) >= '$median3X'' separateLibs/${L}/mm10.${L}.bed > separateLibs/${L}/mm10.badMap.bed
    before=`cat separateLibs/${L}/mm10.${L}.bed | wc -l`
    after=`cat separateLibs/${L}/mm10.median3X.bed | wc -l`
    dropped=`echo $before $after | awk '{print $1-$2}'`
    perCent=`echo $dropped $before | awk '{printf "%.2f", 100*'$dropped/$before'}'`
    echo "$L $before - $after = $dropped -> % $perCent dropped"
done
# B6Ng01 96548 - 95837 = 711 -> % 0.74 dropped
# C3H 42705 - 42378 = 327 -> % 0.77 dropped
# CH29 51200 - 50621 = 579 -> % 1.13 dropped
# DN 101826 - 100472 = 1354 -> % 1.33 dropped
# MHPN 59859 - 58582 = 1277 -> % 2.13 dropped
# MHPP 29074 - 28550 = 524 -> % 1.80 dropped
# MSMg01 81802 - 78772 = 3030 -> % 3.70 dropped
# RP23 83424 - 83062 = 362 -> % 0.43 dropped
# RP24 51112 - 50849 = 263 -> % 0.51 dropped
# WI1 326662 - 324259 = 2403 -> % 0.74 dropped
# bMQ 73519 - 72540 = 979 -> % 1.33 dropped

# loading the median3X files
# NOTE(review): the 2016 addCounts step later in this section queries
# tables named cloneEnd<lib> (no underscore) -- confirm which table
# names were actually loaded here
mkdir -p filteredEnds
for L in `cat libs.over20K.list`
do
    echo $L 1>&2
    hgLoadBed -type=bed12 mm10 cloneEnd_${L} \
        separateLibs/${L}/mm10.median3X.bed \
        > filteredEnds/loadBed.${L}.log 2>&1
done

# construct multiple mapped ends:
cat separateLibs/*/mm10.median3X.bed | cut -f4 | sort | uniq -c | sort -rn > allEnds.names.count.txt
awk '$1 > 1' allEnds.names.count.txt | awk '{print $2}' | sort > 
multiples.names.txt cat separateLibs/*/mm10.median3X.bed | sort -k4 > allEnds.nameSorted.bed join -t' ' -o "2.1,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9,2.10,2.11,2.12" -2 4 multiples.names.txt allEnds.nameSorted.bed | sort -k1,1 -k2,2n > allEnds.multiple.locations.bed hgLoadBed -type=bed12 mm10 cloneEnd_multipleMaps \ allEnds.multiple.locations.bed > load.multipleMaps.log 2>&1 # construct bad mapped ends: mkdir -p filteredDroppedEnds for L in `cat libs.over20K.list` do echo $L 1>&2 cat separateLibs/${L}/mm10.badMap.bed done | sort -k1,1 -k2,2n > filteredDroppedEnds/badEnds.bed hgLoadBed -type=bed12 mm10 cloneEndbadEnds filteredDroppedEnds/badEnds.bed \ > filteredDroppedEnds/loadBed.badEnds.log 2>&1 # construct coverage bigWig files: cat separateLibs/*/mm10.median3X.bed | awk '$6 == "+"' | sort -k1,1 -k2,2n \ | bedItemOverlapCount mm10 stdin > allEnds.forward.bedGraph cat separateLibs/*/mm10.median3X.bed | awk '$6 == "-"' | sort -k1,1 -k2,2n \ | bedItemOverlapCount mm10 stdin > allEnds.reverse.bedGraph bedGraphToBigWig allEnds.forward.bedGraph /hive/data/genomes/mm10/chrom.sizes \ cloneEnd_coverageForward.bw bedGraphToBigWig allEnds.reverse.bedGraph /hive/data/genomes/mm10/chrom.sizes \ cloneEnd_coverageReverse.bw mkdir /gbdb/mm10/bbi/cloneEnd ln -s `pwd`/cloneEnd_coverageForward.bw /gbdb/mm10/bbi/cloneEnd ln -s `pwd`/cloneEnd_coverageReverse.bw /gbdb/mm10/bbi/cloneEnd hgBbiDbLink mm10 cloneEnd_coverageForward \ /gbdb/mm10/bbi/cloneEnd/cloneEnd_coverageForward.bw hgBbiDbLink mm10 cloneEnd_coverageReverse \ /gbdb/mm10/bbi/cloneEnd/cloneEnd_coverageReverse.bw ### Fixup the scores to indicate how many multiple mappings as mentioned ### in the hg19 bacEnds description page: one mapping: score = 1000 ### multiple mappings: score = 1500/count ### the sort | uniq -c | awk does this score calculation with the name ### in column 1 ### The join puts the existing table together with those scores ### DONE - 2016-03-02 - Hiram mkdir /hive/data/genomes/mm10/bed/cloneEnds/addCounts cd 
/hive/data/genomes/mm10/bed/cloneEnds/addCounts

mkdir score withScore noScore

# for each table: count rows per name to get the score, join that score
# onto the existing rows, then empty the table and reload it
for table in cloneEndB6Ng01 cloneEndC3H cloneEndCH29 cloneEndDN \
    cloneEndMHPN cloneEndMHPP cloneEndMSMg01 cloneEndRP23 cloneEndRP24 \
    cloneEndWI1 cloneEndbMQ cloneEndbadEnds cloneEndmultipleMaps
do
    hgsql -N -e "select name from $table;" mm10 | sort | uniq -c |
        awk '{ if (1 == $1) {printf "%s\t1000\n", $2} else {printf "%s\t%d\n", $2, 1500/$1} }' \
        | sort > score/mm10.$table.score.tab
    hgsql -N -e "select * from $table order by name;" mm10 \
        | sort -k5 > noScore/mm10.$table.tab
    # both inputs are tab separated, so join must split on a tab
    join -t$'\t' -1 5 noScore/mm10.$table.tab score/mm10.$table.score.tab \
        | awk '{printf "%d\t%s\t%d\t%d\t%s\t%d\t%s\t%d\t%d\t%d\t%d\t%s\t%s\n", $2,$3,$4,$5,$1,$14,$7,$8,$9,$10,$11,$12,$13}' \
        | sort -k2,2 -k3,3n > withScore/mm10.$table.withScore.tab
    hgsql -e "delete from $table;" mm10
    hgsql -e "load data local infile \"withScore/mm10.$table.withScore.tab\" into table $table;" mm10
done

# row counts after the reload
for table in cloneEndB6Ng01 cloneEndC3H cloneEndCH29 cloneEndDN \
    cloneEndMHPN cloneEndMHPP cloneEndMSMg01 cloneEndRP23 cloneEndRP24 \
    cloneEndWI1 cloneEndbMQ cloneEndbadEnds cloneEndmultipleMaps
do
    hgsql -N -e "select count(*) from $table;" mm10 | cat
done
# 95837
# 42378
# 50621
# 100472
# 58582
# 28550
# 78772
# 83062
# 50849
# 324259
# 72540
# 11809
# 4269

##############################################################################
# 2014-08-17: import of UCSC GENCODE group processing of GENCODE VM3 (markd)
# download files
mkdir -p /hive/data/genomes/mm10/bed/gencodeVM3/data
cd /hive/data/genomes/mm10/bed/gencodeVM3

# download gencode release
cd data
wget -nv -r -np ftp://ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M3
mv ftp.sanger.ac.uk/pub/gencode/Gencode_mouse/release_M3 .
rm -rf ftp.sanger.ac.uk
cd ..

# create Makefile from previous one.
cp /hive/data/genomes/mm10/bed/gencodeVM2/Makefile .
# build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM3 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM2.ra mouse/mm10/wgEncodeGencodeVM3.ra cp mouse/mm10/wgEncodeGencodeVM2.html mouse/mm10/wgEncodeGencodeVM3.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM3.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. ### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM3", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode VM?' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM3 make joinerCheck # output in check/joiner.out ############################################################################## # LASTZ Cow bosTau8 (DONE - 2014-10-15 - Steve) mkdir /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 cd /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 cat << '_EOF_' > DEF # mouse vs cow # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/nib SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: Cow bosTau8 SEQ2_DIR=/hive/data/genomes/bosTau8/bosTau8.2bit SEQ2_LEN=/hive/data/genomes/bosTau8/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15 TMPDIR=/scratch/tmp '_EOF_' # << happy emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 181m30.700s cat fb.mm10.chainBosTau8Link.txt # 698722925 bases of 2652783500 (26.339%) in intersection # Create link cd /hive/data/genomes/mm10/bed ln -s lastzBosTau8.2014-10-15 lastz.bosTau8 # and the swap mkdir /hive/data/genomes/bosTau8/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau8/bed/blastz.mm10.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau8.2014-10-15/DEF \ -swap -syntenicNet \ -noLoadChainSplit \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 58m4.272s cat fb.bosTau8.chainMm10Link.txt # 687270584 bases of 2649307237 (25.942%) in intersection # Create link cd /hive/data/genomes/bosTau8/bed ln -s blastz.mm10.swap lastz.mm10 ############################################################################ # 2014-12-05: import of UCSC 
GENCODE group processing of GENCODE VM4 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM4/data cd /hive/data/genomes/mm10/bed/gencodeVM4 # create Makefile from previous one. # WARNING: next build start with hg/makeDb/outside/gencode/gencodeLoad.mk cp /hive/data/genomes/hg38/bed/gencodeV21/Makefile . # download, build and load tables (time nice make -j 10) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. ## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM4 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM2.ra mouse/mm10/wgEncodeGencodeVM4.ra cp mouse/mm10/wgEncodeGencodeVM2.html mouse/mm10/wgEncodeGencodeVM4.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM4.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. ### IMPORTANT: make sure that hgTracks/gencodeTracks.c registers ### track handler for this version of gencode: registerTrackHandler("wgEncodeGencodeVM4", gencodeGeneMethods); # update all.joiner and validate # look for the last section `begin Gencode VM?' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed cd /hive/data/genomes/mm10/bed/gencodeVM4 make joinerCheck # output in check/joiner.out ############################################################################## ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## # LASTZ mouse/mm10 sheep/oviAri3 - (DONE - 2015-01-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 cd /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 cp -p \ /hive/users/hiram/multiz/100way/mm10.oviAri3/mm10.oviAri3.tuning.top400.txt \ ./mm10.oviAri3.tuning.Q.txt cat << '_EOF_' > DEF # mouse vs sheep # parameters obtained from a tuning run of lastz_D # /hive/users/hiram/multiz/100way/mm10.oviAri3/mm10.oviAri3.tuning.top400.txt BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz BLASTZ_T=2 BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=254 BLASTZ_X=890 BLASTZ_Y=3400 BLASTZ_Q=/hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08/mm10.oviAri3.tuning.Q.txt # A C G T # A 89 -172 -40 -184 # C -172 100 -121 -40 # G -40 -121 100 -172 # T -184 -40 -172 89 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: sheep oviAri3 SEQ2_DIR=/hive/data/genomes/oviAri3/oviAri3.2bit SEQ2_LEN=/hive/data/genomes/oviAri3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 75m27.412s cat fb.mm10.chainOviAri3Link.txt # 432006690 bases of 2652783500 (16.285%) in intersection time 
(doRecipBest.pl -buildDir=`pwd` mm10 oviAri3) > rbest.log 2>&1 &
# real 17m24.577s

# and for the swap:
mkdir /hive/data/genomes/oviAri3/bed/blastz.mm10.swap
cd /hive/data/genomes/oviAri3/bed/blastz.mm10.swap

time (doBlastzChainNet.pl -verbose=2 \
    /hive/data/genomes/mm10/bed/lastzOviAri3.2015-01-08/DEF \
    -swap -chainMinScore=3000 -chainLinearGap=medium \
    -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \
    -syntenicNet) > swap.log 2>&1
# real 31m27.481s

cat fb.oviAri3.chainMm10Link.txt
# 422549165 bases of 2534335866 (16.673%) in intersection

time (doRecipBest.pl -buildDir=`pwd` oviAri3 mm10) > rbest.log 2>&1
# real 16m45.956s

#########################################################################
# RETROFINDER RETROPOSED GENES ucscRetro track VERSION 6
# (2015-01-02 - 2015-01-07, hartera, DONE)
ssh hgwdev
# same directory as the cd below and as RUNDIR in DEF
mkdir -p /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102
cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102

cat << '_EOF_' > DEF
RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
RUNDATE="2015-01-02"
DB=mm10
SCORETHRESH=510
GENOMENAME='Mus musculus'
GBDB=mm
DATE=20150102
VERSION=6
RUNDIR=/hive/groups/gencode/pseudogenes/retroFinder/$DB.$DATE
BINDIR=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/bin
KENTDIR=/cluster/home/hartera/kent
KENTBINDIR=/cluster/bin/x86_64
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz.$VERSION
TMPMRNA=$RUNDIR/mrnaBlastz/$DB
TMPEST=$RUNDIR/est/$DB
USEALTSEQS=0
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=1
LASTZPROG=/cluster/bin/penn/x86_64/lastz
SCRIPT=/hive/users/hartera/GencodeWG/retroFinder/branches/version2/src/pipeline
GENOME=/hive/data/genomes
TWOBIT=$GENOME/$DB/$DB.2bit
RETRODIR=$GENOME/$DB/bed/retro
BASE=$RUNDIR/retro
BASE=/hive/groups/gencode/pseudogenes/retroFinder/mm10.${DATE}/retro
OUTDIR=${BASE}/version${VERSION}/${DB}
RESULT=$OUTDIR/result
RESULTSPLIT=$OUTDIR/resultSplit
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
TABLE=ucscRetroInfo$VERSION
ORTHOTABLE=ucscRetroOrtho$VERSION ALIGN=ucscRetroAli$VERSION LOCAL=/scratch/data/$DB NIB=$LOCAL/nib RMSK=rmsk NET1=netHg38 NET2=netCanFam3 NET3=netRn5 GENE1=knownGene GENE2=refGene GENE3=wgEncodeGencodeCompVM4 CLUSTER=ku SPECIES="hg38 mm10" ROOTDIR="~/public_html/retro/mm10Jul14" WEBROOT=$ROOTDIR/retro.$RUNDATE WEBSERVER=http://hgwdev-hartera.soe.ucsc.edu SHUFFLEDIR=shuffle SHUFFLEROOT=$WEBROOT/$SHUFFLEDIR DUPDIR=dups DUPROOT=$WEBROOT/$DUPDIR AGEDIR=age AGEROOT=$WEBROOT/$AGEDIR EXPDIR=exp GENEPFAM=knownGene PFAM=knownToPfam PFAMIDFIELD=name PFAMDOMAIN=value ARRAY=gnfAtlas2 AFFYPROBE=affyGnf1m ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio ARRAYABS=hgFixed.gnfMouseAtlas2All ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps # ARRAYLOOKUP=knownToGnfAtlas2 #ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl" ALTSPLICE=sibTxGraph SPLITBYAGE=$SCRIPT/splitRetrosByAgeMouse PDB=proteins140122 BREAKS=0,8,16,24,32 XLIM=34 YLIM=0.1 YLIM1=4000 YLIM2=160 MAXDIVERGENCE=32 '_EOF_' # << happy emacs chmod +x DEF mkdir -p /hive/data/genomes/mm10/bed/retro mkdir -p /hive/data/genomes/mm10/bed/mrnaBlastz.6 cd /hive/data/genomes/mm10/bed/mrnaBlastz.6 # Create S1.len file foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y) echo $c hgsql -Ne "select chrom, size from chromInfo where chrom='chr${c}';" mm10 \ >> S1.len end # NOTE: in future, use /hive/data/genomes/mm10/chrom.sizes for S1.len # and just remove randoms and chrM. cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102 mkdir mrnaBlastz cd mrnaBlastz cp ../DEF . cp /hive/data/genomes/mm10/bed/mrnaBlastz.6/S1.len . 
screen # Run steps 1 to 5 of RetroFinder pipeline from scripts in CCDS SVN source tree: retroFinder/branches/version2/src/pipeline/ucscStep1.sh DEF # check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep2.sh DEF retroFinder/branches/version2/src/pipeline/ucscStep3.sh DEF #check cluster jobs on ku retroFinder/branches/version2/src/pipeline/ucscStep4.sh DEF #check cluster jobs on ku # Load the track retroFinder/branches/version2/src/pipeline/ucscStep5.sh DEF cd /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102/retro/version6/mm10 retroFinder/branches/version2/src/pipeline/filterMrna.sh retroFinder/branches/version2/src/pipeline/filterEst.sh # Check cluster jobs on ku retroFinder/branches/version2/src/pipeline/analyseExpress.sh # Check cluster jobs on ku #added ucscRetroAli6 to kent/src/hg/makeDb/mouse/mm10/trackDb.ra # copied # /hive/groups/gencode/pseudogenes/retroFinder/mm10.20150102/retro/version6/mm10/trackDb.retro # entry to kent/src/hg/makeDb/trackDb/mouse/mm10/trackDb.ra # and edited it to remove the full data and add: # dataVersion Jan. 
2015 # Scripts copied ucscRetroAli6.psl, ucscRetroInfo6.bed and ucscRetroCds6.tab # to /hive/data/genomes/mm10/bed/retro/ ############################################################################## # LASTZ mouse/mm10 tarsier/tarSyr2 - (DONE - 2015-03-27 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 cd /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 cat << '_EOF_' > DEF # tarsier vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Tarsier TarSyr2 SEQ2_DIR=/hive/data/genomes/tarSyr2/tarSyr2.2bit SEQ2_LEN=/hive/data/genomes/tarSyr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27 TMPDIR=/dev/shm '_EOF_' # << happy emacs time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 301m17.238s time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -continue=syntenicNet -syntenicNet -workhorse=hgwdev \ -smallClusterHub=ku -bigClusterHub=ku) > synNet.log 2>&1 # real 16m5.061s cat fb.mm10.chainTarSyr2Link.txt # 856877439 bases of 2652783500 (32.301%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 tarSyr2) > rbest.log 2>&1 & # real 27m4.048s # and for the swap: mkdir /hive/data/genomes/tarSyr2/bed/blastz.mm10.swap cd /hive/data/genomes/tarSyr2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTarSyr2.2015-03-27/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 181m7.042s cat fb.tarSyr2.chainMm10Link.txt # 900229088 bases of 3405755564 (26.433%) in intersection time (doRecipBest.pl -buildDir=`pwd` tarSyr2 mm10) > rbest.log 2>&1 # 
real 77m29.742s ######################################################################### # UCSC to RefSeq name correspondence (DONE - 2015-04-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/ucscToRefSeq cd /hive/data/genomes/mm10/bed/ucscToRefSeq rsync -avPL \ rsync://ftp.ncbi.nlm.nih.gov/genomes/genbank/vertebrate_mammalian/Mus_musculus/all_assembly_versions/GCA_000001635.5_GRCm38.p3/GCA_000001635.5_GRCm38.p3_assembly_report.txt ./ # this assembly_report has "UCSC-style-name" in column 10 # but it does not name everything # columns 5 and 7 are the INSDC and RefSeq names grep -v "^#" GCA_000001635.5_GRCm38.p3_assembly_report.txt \ | awk -F'\t' '{printf "%s\t%s\n", $5,$7}' | sort > insdc.refSeq.tab # chrM/MT confusion fixed by sed hgsql -N -e 'select name,chrom,chromStart,chromEnd from ucscToINSDC;' mm10 \ | sed -e 's/NC_005089.1/AY172335.1/;' | sort > insdc.ucsc.tab join insdc.ucsc.tab insdc.refSeq.tab | tr '[ ]' '[\t]' \ | cut -f2- > ucsc.refSeq.tab export chrSize=`cut -f1 ucsc.refSeq.tab | awk '{print length($0)}' | sort -n | tail -1` sed -e "s/21/$chrSize/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | sed -e 's/INSDC/RefSeq/g;' > ucscToRefSeq.sql hgLoadSqlTab mm10 ucscToRefSeq ./ucscToRefSeq.sql ucsc.refSeq.tab checkTableCoords mm10 -table=ucscToRefSeq featureBits -countGaps mm10 ucscToRefSeq # 2730871774 bases of 2730871774 (100.000%) in intersection # fixup 2016-04-11 - Hiram # the chrM name is not correct, it was RefSeq instead of Genbank/INSDC: hgsql -e 'select * from ucscToINSDC where name="NC_005089.1";' mm10 +-------+------------+----------+-------------+ | chrom | chromStart | chromEnd | name | +-------+------------+----------+-------------+ | chrM | 0 | 16299 | NC_005089.1 | +-------+------------+----------+-------------+ hgsql -e 'update ucscToINSDC set name="AY172335.1" where name="NC_005089.1";' mm10 hgsql -e 'select * from ucscToINSDC where name="AY172335.1";' mm10 +-------+------------+----------+------------+ | chrom | chromStart | chromEnd | name | 
+-------+------------+----------+------------+ | chrM | 0 | 16299 | AY172335.1 | +-------+------------+----------+-------------+ ######################################################################### # download and load ncbiGene track ( DONE - 2015-06-09 - Brian) db=mm10 mkdir /cluster/data/genomes/$db/bed/ncbiGene cd /cluster/data/genomes/$db/bed/ncbiGene ftpFile=ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus/GFF/ref_GRCm38.p3_top_level.gff3.gz gff3File=`basename $ftpFile` echo "select * from ucscToRefSeq" | hgsql $db | tail -n +2 | awk '{print 0, $4, $3, $1, $3}' > refSeqToUcsc.lft rm -f $ftpFile wget $ftpFile /cluster/home/braney/bin/x86_64/gff3ToGenePred -useName -warnAndContinue -attrsOut=attrs -bad=bad.gp $gff3File stdout 2> convertErr.txt | liftUp -type=.gp -extGenePred lift.gp refSeqToUcsc.lft warn stdin 2> liftErr.txt wc -l lift.gp # 108567 lift.gp wc -l bad.gp # 189 tawk '{print $1}' attrs | sort | uniq > meta wc -l meta # 110847 meta for i in product Dbxref gene gbkey do echo $i tawk -v attr=$i '$2==attr {print $1,$3}' attrs | sort | uniq | join -t $'\t' /dev/stdin meta > out mv out meta done wc -l meta # 109420 meta egrep "^N(M|R|P)" lift.gp > curated.gp egrep "^X(M|R)" lift.gp > predicted.gp wc -l curated.gp predicted.gp #33545 curated.gp #70587 predicted.gp #104132 total cat curated.gp predicted.gp | awk '{print $1}' | sort -u > tmp1 cat meta | awk '{print $1}' | sort -u > tmp2 join -v 1 tmp1 tmp2 | wc -l # 0 grep dropping convertErr.txt | wc -l # 189 awk '/isn/ {print $1}' liftErr.txt | sort -u # NT_166322.1 # NT_187001.1 hgLoadGenePred -genePredExt $db ncbiRefCurated curated.gp hgLoadGenePred -genePredExt $db ncbiRefPredicted predicted.gp hgLoadSqlTab $db ncbiRefLink $kent/src/hg/lib/ncbiRefLink.sql meta hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("mm10", "ncbiRefSeq", "braney", "105", now(), \ "http://www.ncbi.nlm.nih.gov/genome/annotation_euk/Mus_musculus/105/", \ 
"ftp://ftp.ncbi.nlm.nih.gov/genomes/M_musculus", \ "9 February 2015" );' hgFixed # ############################################################################# # hgPal downloads (DONE braney 2015-06-02) # CDS FASTA from 60-way for knownGene ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal cd /hive/data/genomes/mm10/bed/multiz60way/pal cat ../species.list | tr '[ ]' '[\n]' > order.lst export mz=multiz60way export gp=knownGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.lst stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.lst stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time nice sh -x $gp.jobs > $gp.jobs.log 2>&1 & # real 80m36.763s mz=multiz60way gp=knownGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m16.821s zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s `pwd`/md5sum.txt $pd/ # ############################################################################# # hgPal downloads (DONE jcasper 2016-06-22) # CDS FASTA from 60-way for knownGene - rebuilt for mm10 ucsc genes v16 ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc16 cd /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc16 cat ../species.list | tr '[ ]' '[\n]' > order.lst export mz=multiz60way export gp=knownGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 
../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.lst stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.lst stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time nice sh -x $gp.jobs > $gp.jobs.log 2>&1 # real 87m59.962s mz=multiz60way gp=knownGene db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m48.725s zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz rm -rf exonAA exonNuc # we're only distributing exons at the moment mz=multiz60way gp=knownGene db=mm10 pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd rm -f $pd/$gp.exonAA.fa.gz $pd/$gp.exonNuc.fa.gz $pd/md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz (cd $pd && md5sum *.fa.gz) > md5sum.txt ln -s `pwd`/md5sum.txt $pd/ ########################################################################### # GENEID GENE PREDICTIONS (DONE - 2015-06-26 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/geneid cd /hive/data/genomes/mm10/bed/geneid wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/00README wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/mm10.geneid.prot wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/geneid_v1.4/mm10.geneid.gtf ldHgGene -gtf -genePredExt mm10 geneid mm10.geneid.gtf # Read 36771 transcripts in 287332 lines in 1 files # 36771 groups 66 seqs 1 sources 3 feature types # 36771 gene predictions featureBits -enrichment mm10 refGene:CDS geneid # refGene:CDS 1.292%, geneid 1.584%, both 1.028%, cover 79.51%, enrich 50.19x featureBits -enrichment mm9 refGene:CDS geneid # refGene:CDS 1.305%, geneid 1.590%, both 1.040%, cover 79.65%, enrich 50.11x 
featureBits -countGaps mm10 geneid # 42028722 bases of 2730871774 (1.539%) in intersection featureBits -countGaps mm9 geneid # 41651898 bases of 2725765481 (1.528%) in intersection ########################################################################## # SGP GENES (DONE - 2015-07-30 - Hiram) mkdir /hive/data/genomes/mm10/bed/sgpGene cd /hive/data/genomes/mm10/bed/sgpGene wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/00README wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/mm10.sgp2.gtf wget --timestamping \ http://genome.crg.es/genepredictions/M.musculus/mm10/SGP2/hg38/mm10.sgp2.gff3 ldHgGene -gtf -genePredExt mm10 sgpGene mm10.sgp2.gtf # Read 35235 transcripts in 287314 lines in 1 files # 35235 groups 60 seqs 1 sources 3 feature types # 35235 gene predictions featureBits -enrichment mm10 refGene:CDS sgpGene # refGene:CDS 1.292%, sgpGene 1.430%, both 1.101%, cover 85.21%, enrich 59.59x featureBits -enrichment mm9 refGene:CDS sgpGene # refGene:CDS 1.305%, sgpGene 1.439%, both 1.113%, cover 85.23%, enrich 59.23x ######################################################################### # 2015-06-29-13: import of UCSC GENCODE group processing of GENCODE VM5 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM5 cd /hive/data/genomes/mm10/bed/gencodeVM5 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set # release and transcript support versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if the changes in counts make # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. 
## Important to make sure filter attrs.transcriptType matches current set ## figured out with select distinct transcriptType from wgEncodeGencodeAttrsVM5 order by transcriptType; cd kent/src/hg/makeDb/trackDb cp human/mm10/wgEncodeGencodeV18.ra human/mm10/wgEncodeGencodeVM5.ra cp human/mm10/wgEncodeGencodeV18.html human/mm10/wgEncodeGencodeVM5.html # edit these plus human/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM5.ra in reverse order with previous # tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM5 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # lastz zebrafish danRer10 (DONE - 2015-09-11 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10DanRer10 mkdir /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 cd /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 cat << '_EOF_' > DEF # Mouse vs. 
zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: zebrafish danRer10 SEQ2_DIR=/hive/data/genomes/danRer10/danRer10.2bit SEQ2_LEN=/hive/data/genomes/danRer10/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11 TMPDIR=/dev/shm '_EOF_' # << happy emacs # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 198m3.073s cat fb.mm10.chainDanRer10Link.txt # 73464192 bases of 2652783500 (2.769%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 danRer10) > rbest.log 2>&1 & # real 7m8.599s # and for the swap mkdir /hive/data/genomes/danRer10/bed/blastz.mm10.swap cd /hive/data/genomes/danRer10/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer10.2015-09-11/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 16m8.387s cat fb.danRer10.chainMm10Link.txt # 71611488 bases of 1369683683 (5.228%) in intersection time (doRecipBest.pl -buildDir=`pwd` danRer10 mm10) > rbest.log 2>&1 # real 7m34.259s ######################################################################### 2015-10-02: import of UCSC GENCODE group processing of GENCODE VM7 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM7 cd /hive/data/genomes/mm10/bed/gencodeVM7 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set # release and transcript support versions # download, 
build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM4.ra mouse/mm10/wgEncodeGencodeVM7.ra cp mouse/mm10/wgEncodeGencodeVM4.html mouse/mm10/wgEncodeGencodeVM7.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM7.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM7 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # DBSNP 142 / SNP142 (DONE 2015-11-20 braney) # RedMine #15934 screen -S mm10dbSnp mkdir -p /hive/data/outside/dbSNP/142/mouse_mm10 cd /hive/data/outside/dbSNP/142/mouse_mm10 # Look at the directory listing of ftp://ftp.ncbi.nih.gov/snp/database/organism_data/ # to find the subdir name to use as orgDir below (mouse_10090 in this case). 
# Then click into that directory and look for file names like # b(1[0-9][0-9])_ # -- use the first num for build setting in config.ra # The buildAssembly setting in config.ra is empty because dbSNP stopped including # that in file names. cat > config.ra <<EOF db mm10 orgDir mouse_10090 build 142 buildAssembly refAssemblyLabel GRCm38.p2 ncbiAssemblyReportFile GCF_000001635.22.assembly.txt ignoreDbSnpContigsFile dbSnpContigsNotInUcsc.txt liftUp suggested.lft EOF #actually ran the script a few times to get the above config.ra with values suggested ~/kent/src/hg/utils/automation/doDbSnp.pl config.ra >& do.log & tail -f do.log tail -f do.log # *** All done! ############################################################################## # FILTER SNP142 (DONE 2015-11-21 braney) cd /hive/data/outside/dbSNP/142/mouse_mm10 zcat snp142.bed.gz \ | ~/kent/src/hg/utils/automation/categorizeSnps.pl #Mult: 3276456 #Common: 8213470 #Flagged: 0 #leftover: 70731318 foreach f ({Mult,Common}.bed.gz) mv $f snp142$f end # Load tables foreach subset (Mult Common) hgLoadBed -tab -onServer -tmpDir=/data/tmp -allowStartEqualEnd -renameSqlTable \ mm10 snp142$subset -sqlTable=snp142.sql snp142$subset.bed.gz end ############################################################################## # DBSNP CODING ANNOTATIONS (142) (DONE 2015-11-21 braney) cd /hive/data/outside/dbSNP/142/mouse_mm10 # ncbiFuncAnnotations.txt has NCBI coords: 0-based, fully closed. # For anything except an insertion (0 bases between flanks), # we need to add 1 to the end coord. For an insertion, we need # to add 1 to the start coord. Make a hash of the insertion IDs, # then look up each ID in ncbiFuncAnnotations.txt to tell which # transform to apply. # Note: sort -u with the keys below is too restrictive -- we need full line uniq. 
zcat ncbiFuncAnnotations.txt.gz \ | perl -we 'open($IDS, "zcat ncbiFuncInsertions.ctg.bed.gz |") || die "ids: $!"; \ while (<$IDS>) { chomp; $ids{$_} = 1; } \ close($IDS); \ %coding = (2=>1, 3=>1, 4=>1, 8=>1, 9=>1, 41=>1, 42=>1, 43=>1, 44=>1, 45=>1); \ while (<>) { \ chomp; @w = split("\t"); # id, ctg, start, end, ... \ next unless $coding{$w[5]}; \ $bed4 = join("\t", $w[1], $w[2], $w[3], $w[0]); \ if (exists $ids{$bed4} && $w[3] == $w[2]+1) { \ $w[2]++; # 2-base insertions: increment start coord \ } else { \ $w[3]++; # increment end coord to get half-open \ } \ print join("\t", @w) . "\n"; \ }' \ | sort -k1n,1n -k2,2 -k3n,3n -k5,5 -k6n,6n \ | uniq \ > ncbiCodingAnnotations.txt wc -l ncbiCodingAnnotations.txt #3854299 ncbiCodingAnnotations.txt # How many & what kinds of function types? cut -f 6 ncbiCodingAnnotations.txt \ | sort -n | uniq -c # 1258578 3 (coding-synon) # 1882006 8 (cds-reference -- ignored) # 4717 41 (nonsense) # 624020 42 (missense) # 745 43 (stop-loss) # 14806 44 (frameshift) # 69427 45 (cds-indel) # In b142, the functional annotations include non-coding (frame = NULL), # which we'll exclude here because this is supposed to be just coding stuff... # probably need to update how we show dbSNP's func annos anyway, e.g. # it is a shame that we toss out codon number and transcript offset. # Gather up multiple annotation lines into one line per {snp, gene, frame}: perl -e 'while (<>) { chomp; \ my ($rsId, $ctg, $s, $e, $txId, $fxn, $frm, $nt, $aa, $codon) = split("\t"); \ next if ($fxn == 8 && ($frm eq "NULL" && $aa eq "NULL" && $codon eq "NULL")); \ if (defined $lastRs && \ ($lastRs != $rsId || $lastCtg ne $ctg || $lastS != $s || \ $lastTx ne $txId || $lastFrm ne $frm)) { \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . 
\ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut; \ $refRow = undef; @rows = (); ($count, $fxns, $nts, $codons, $aas) = (); \ } \ ($lastRs, $lastCtg, $lastS, $lastE, $lastTx, $lastFrm) = \ ($rsId, $ctg, $s, $e, $txId, $frm); \ $count++; \ if ($fxn == 8) { \ $refRow = [$fxn, $nt, $aa, $codon]; \ } else { \ $fxns .= "$fxn,"; $nts .= "$nt,"; $aas .= "$aa,"; $codons .= "$codon,"; \ } \ } \ if (defined $refRow) { \ $fxns = "$refRow->[0],$fxns"; $nts = "$refRow->[1],$nts"; \ $aas = "$refRow->[2],$aas"; $codons = "$refRow->[3],$codons"; \ } \ $lineOut = "$lastCtg\t$lastS\t$lastE\trs$lastRs\t$lastTx\t$lastFrm\t" . \ "$count\t$fxns\t$nts\t$codons\t$aas\n"; \ $lineOut =~ s@NULL@n/a@g; \ print $lineOut;' \ ncbiCodingAnnotations.txt \ | liftUp snp142CodingDbSnp.bed suggested.lft warn stdin hgLoadBed mm10 snp142CodingDbSnp -sqlTable=$HOME/kent/src/hg/lib/snp125Coding.sql \ -renameSqlTable -tab -notItemRgb -allowStartEqualEnd \ snp142CodingDbSnp.bed #Read 1951211 elements of size 11 from snp142CodingDbSnp.bed ############################################################################## # SNPMASKED SEQUENCE FOR SNP142 (DONE 2015-11-21 braney) mkdir /hive/data/genomes/mm10/snp142Mask cd /hive/data/genomes/mm10/snp142Mask # Identify rsIds with various problems -- we will exclude those. 
zcat /hive/data/outside/dbSNP/142/mouse_mm10/snp142.bed.gz \ | awk '$18 ~ /MultipleAlignments|ObservedTooLong|ObservedWrongFormat|ObservedMismatch|MixedObserved/ {print $4;}' \ | sort -u \ > snp142ExcludeRsIds.txt zcat /hive/data/outside/dbSNP/142/mouse_mm10/snp142.bed.gz \ | grep -vFwf snp142ExcludeRsIds.txt \ > snp142Cleaned.bed wc -l snp142Cleaned.bed #76837455 snp142Cleaned.bed # Substitutions: mkdir substitutions snpMaskSingle snp142Cleaned.bed /hive/data/genomes/mm10/mm10.2bit stdout diffObserved.txt \ | faSplit byname stdin substitutions/ #Masked 66976283 snps in 66976283 out of 2729124706 genomic bases # /hive/data/genomes/mm10/mm10.2bit has 2730871774 total bases, #but the total number of bases in sequences for which we masked snps is 2729124706 (difference is 1747068) # Check that 1747068 is the total #bases in sequences with nothing in snp142Cleaned: grep -Fw single snp142Cleaned.bed | cut -f 1 | uniq > /data/tmp/1 grep -vwf /data/tmp/1 ../chrom.sizes \ | awk 'BEGIN {TOTAL = 0;} {TOTAL += $2;} END {printf "%d\n", TOTAL;}' #1726860 calc 1747068-1726860 #20208 # warnings about differing observed strings at same base position: wc -l diffObserved.txt #2 diffObserved.txt # peanuts! good. # Make sure that sizes are identical, first diffs are normal -> IUPAC, # and first diffs' case is preserved: mkdir tmpFa cd tmpFa twoBitToFa /hive/data/genomes/mm10/mm10.2bit stdout | faSplit byname stdin tmpFa cd .. foreach f (substitutions/chr*.fa.gz) faCmp $f tmpFa/`basename $f subst.fa.gz`fa |& grep -v "that differ" end #chr1 in substitutions/chr1.fa differs from chr1 at ../1/chr1.fa at base 10107 (y != c) #chr10 in substitutions/chr10.fa differs from chr10 at ../10/chr10.fa at base 60493 (R != A) #... #(output OK -- ambiguous bases replacing [agct] at SNP positions) foreach f (substitutions/chr*.fa) echo $f:t:r mv $f $f:r.subst.fa end # Fire off a bunch of gzip jobs in parallel: ls -1 substitutions/*.fa | split -l 5 foreach f (x??) 
gzip `cat $f` & end # Wait for backgrounded gzip jobs to complete rm x?? # Insertions & deletions not done. To date we have only offered substs for download. # If there is user demand, use template from snp131 above. # Clean up and prepare for download: gzip snp142Cleaned.bed & foreach d (substitutions) pushd $d md5sum *.gz > md5sum.txt cp /hive/data/genomes/hg38/snp142Mask/$d/README.txt . popd end # Edit the README.txt. # Create download links on hgwdev. mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/snp142Mask ln -s /hive/data/genomes/mm10/snp142Mask/substitutions/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/snp142Mask/ ############################################################################## # LASTZ Rhesus rheMac8 (DONE - 2016-02-10 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRheMac8.2016-02-10 cd /hive/data/genomes/mm10/bed/lastzRheMac8.2016-02-10 printf '# rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac8 SEQ2_DIR=/hive/data/genomes/rheMac8/rheMac8.2bit SEQ2_LEN=/hive/data/genomes/rheMac8/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzRheMac8.2016-02-10 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10RheMac8 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 239m18.376s cat fb.mm10.chainRheMac8Link.txt # 918841829 bases of 2652783500 (34.637%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 rheMac8) > rbest.log 2>&1 & # real 421m31.807s mkdir /hive/data/genomes/rheMac8/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac8/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzRheMac8.2016-02-10/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 110m33.219s cat fb.rheMac8.chainMm10Link.txt # 917131079 bases of 3142093174 (29.189%) in intersection time (doRecipBest.pl -buildDir=`pwd` rheMac8 mm10) > rbest.log 2>&1 # real 409m8.252s ############################################################################## # Patents (26 Feb 2016, Max) # convert SAM to BED cd /hive/data/genomes/hg19/bed/patents/data/ samtools view -S -t ensGenomeMm10/Mus_musculus.GRCm38.75.dna.toplevel.fa.fai Mus_musculus.GRCm38.75.s90c50.sam -h > mm10.sam # convert to bed function sam2psl_pierre() { java -Dfile.encoding=UTF8 -Xmx500m -cp "/cluster/bin/jvarkit/htsjdk-1.133/dist/commons-jexl-2.1.1.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/commons-logging-1.1.1.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/htsjdk-1.133.jar:/cluster/bin/jvarkit/htsjdk-1.133/dist/snappy-java-1.0.3-rc3.jar:/cluster/bin/jvarkit/dist-1.133/sam2psl.jar" com.github.lindenb.jvarkit.tools.misc.SamToPsl $*; } sam2psl_pierre mm10.sam 2> /dev/null > mm10.psl pslToBed mm10.psl mm10.bed # strip the BAM flag field from the BED name # careful: this line includes tab characters sed -ri 's/_(16|0) / /g' mm10.bed # now join meta with bed file cd ../mm10 sort by name # The -S10G parameter is only supported in newer sort versions # # if it complains, just remove it. It will just take longer. 
time sort -k4,4 -S10G --parallel=20 mm10.bed > mm10.s4.bed join -t $'\t' -1 4 -2 1 ../data/mm10.s4.bed ../data/seqAndPatentSummary.tab -o '1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 1.12 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 2.10 2.11 2.12' | patSeqFilterBulkAndAnnotate ../data/htPatents.txt patBulk.bed patNonBulk.bed -c ../data/seqCounts.tab bedSort patNonBulk.bed patNonBulk.bed bedSort patBulk.bed patBulk.bed bedToBigBed patNonBulk.bed /cluster/data/genomes/mm10/chrom.sizes patNonBulk.bb -tab -as=../patSummary.as -type=bed12+ bedToBigBed patBulk.bed /cluster/data/genomes/mm10/chrom.sizes patBulk.bb -tab -as=../patSummary.as -type=bed12+ hgBbiDbLink hg19 patBulk /gbdb/hg19/bbi/patBulk.bb hgBbiDbLink hg19 patNonBulk /gbdb/hg19/bbi/patNonBulk.bb ######################################################################### 2016-03-14: import of UCSC GENCODE group processing of GENCODE VM8 (markd) # not to be pushed to RR # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM8 cd /hive/data/genomes/mm10/bed/gencodeVM8 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. 
cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM7.ra mouse/mm10/wgEncodeGencodeVM8.ra cp mouse/mm10/wgEncodeGencodeVM7.html mouse/mm10/wgEncodeGencodeVM8.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM8.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM8 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### 2016-03-14: import of UCSC GENCODE group processing of GENCODE VM9 (markd) # download files mkdir -p /hive/data/genomes/mm10/bed/gencodeVM9 cd /hive/data/genomes/mm10/bed/gencodeVM9 # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. 
cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM8.ra mouse/mm10/wgEncodeGencodeVM9.ra cp mouse/mm10/wgEncodeGencodeVM8.html mouse/mm10/wgEncodeGencodeVM9.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM9.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM9 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################## # LASTZ Rat rn6 (DONE - 2016-04-09 - Jonathan) mkdir /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 cd /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 printf '# rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat Rn6 SEQ2_DIR=/hive/data/genomes/rn6/rn6.2bit SEQ2_LEN=/hive/data/genomes/rn6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzRn6.2016-04-07 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10Rn6 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=medium) > do.log 2>&1 # real 501m43.495s cat fb.mm10.chainRn6Link.txt # 1880453869 bases of 2652783500 (70.886%) in intersection time (doRecipBest.pl 
-buildDir=`pwd` mm10 rn6) > rbest.log 2>&1 & # real 766m50.090s mkdir /hive/data/genomes/rn6/bed/blastz.mm10.swap cd /hive/data/genomes/rn6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRn6.2016-04-07/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=medium) > swap.log 2>&1 # real 234m59.393s cat fb.rn6.chainMm10Link.txt # 1938597957 bases of 2729860805 (71.015%) in intersection time (doRecipBest.pl -buildDir=`pwd` rn6 mm10) > rbest.log 2>&1 # real 882m38.624s ######################################################################### ## 4-Way Multiz for UCSC Genes construction (TBD - 2016-04-06 - Jonathan) # mm10, hg38, canFam3, rn6 mkdir /hive/data/genomes/mm10/bed/multiz4way cd /hive/data/genomes/mm10/bed/multiz4way # extract a tree for the 4 we need /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,mm10,canFam3,rn6 $HOME/kent/src/hg/utils/phyloTrees/191way.nh > 4way.nh # this looks like: ((hg38:0.145908,(mm10:0.084509,rn6:0.091589):0.271974):0.020593,canFam3:0.165928); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a gif image for htdocs/images/phylo/mm10_4way.gif /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt # Use this output to create the table below grep -i mm10 4way.distances.txt | sort -k3,3n # # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # # featureBits chainLink measures # chainMm10Link chain linearGap # distance on mm10 on other minScore # 1 0.176098 - rat rn6 (% 70.886) (% 71.015) 5000 medium # 2 0.502391 - human hg38 (% 35.372) (% 31.653) 3000 medium # 3 0.543004 - dog canFam3 (% 29.144) (% 31.624) 3000 medium # using the syntenic nets cd /cluster/data/mm10/bed/multiz4way mkdir mafLinks cd mafLinks mkdir rn6 canFam3 hg38 for D in hg38 canFam3 rn6 do cd $D ln -s ../../../lastz.${D}/mafSynNet/*.maf.gz ./ cd .. 
done # determine what is the newest version of multiz and use that cd /hive/data/genomes/mm10/bed/multiz4way mkdir penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn # the autoMultiz cluster run ssh ku cd /hive/data/genomes/mm10/bed/multiz4way # create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ 4way.nh > tmp.nh echo `cat tmp.nh` | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.lst mkdir run maf cd run # NOTE: you need to set the db and multiz dirname properly in this # script cat > autoMultiz << '_EOF_' #!/bin/csh -ef set db = mm10 set c = $1 set maf = $2 set binDir = /hive/data/genomes/mm10/bed/multiz4way/penn set tmp = /dev/shm/$db/multiz.$c set pairs = /hive/data/genomes/mm10/bed/multiz4way/mafLinks rm -fr $tmp mkdir -p $tmp cp ../{tree.nh,species.lst} $tmp pushd $tmp foreach s (`cat species.lst`) set in = $pairs/$s/$c.maf set out = $db.$s.sing.maf if ($s == $db) then continue endif if (-e $in.gz) then zcat $in.gz > $out else if (-e $in) then cp $in $out else echo "##maf version=1 scoring=autoMZ" > $out endif end set path = ($binDir $path); rehash $binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf popd cp $tmp/$c.maf $maf rm -fr $tmp '_EOF_' # << happy emacs chmod +x autoMultiz cat << '_EOF_' > template #LOOP ./autoMultiz $(root1) {check out line+ /hive/data/genomes/mm10/bed/multiz4way/maf/$(root1).maf} #ENDLOOP '_EOF_' # << happy emacs cut -f1 /cluster/data/mm10/chrom.sizes > chrom.lst gensub2 chrom.lst single template jobList para create jobList # 66 jobs para try ... check ... push ... etc ... 
# Completed: 66 of 66 jobs # CPU time in finished jobs: 34495s 574.91m 9.58h 0.40d 0.001 y # IO & Wait Time: 826s 13.77m 0.23h 0.01d 0.000 y # Average job time: 535s 8.92m 0.15h 0.01d # Longest finished job: 2765s 46.08m 0.77h 0.03d # Submission to last job: 2776s 46.27m 0.77h 0.03d # combine results into a single file for loading and gbdb reference cd /hive/data/genomes/mm10/bed/multiz4way grep "^#" maf/chr1_GL456210_random.maf | grep -v "eof maf" > multiz4way.maf grep -h -v "^#" maf/*.maf >> multiz4way.maf grep "^#" maf/chr1_GL456210_random.maf | grep "eof maf" >> multiz4way.maf # makes a 6.5 Gb file: # -rw-rw-r-- 1 6928752890 Apr 12 10:18 multiz4way.maf # Load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/multiz4way mkdir /gbdb/mm10/multiz4way ln -s /hive/data/genomes/mm10/bed/multiz4way/multiz4way.maf \ /gbdb/mm10/multiz4way # the hgLoadMaf generates huge tmp files, locate them in /dev/shm cd /dev/shm time nice -n +19 hgLoadMaf mm10 multiz4way # Loaded 5300158 mafs in 1 files from /gbdb/mm10/multiz4way # real 1m41.656s cd /hive/data/genomes/mm10/bed/multiz4way time (cat /gbdb/mm10/multiz4way/*.maf \ | hgLoadMafSummary -verbose=2 -minSize=10000 \ -mergeGap=500 -maxSize=50000 mm10 multiz4waySummary stdin) # Created 1310955 summary blocks from 9774995 components and 5300158 mafs # real 2m27.913s mv /dev/shm/multiz4way.tab . # -rw-rw-r-- 1 277435502 Apr 12 12:11 multiz4way.tab # -rw-rw-r-- 1 59271980 Apr 12 12:16 multiz4waySummary.tab wc -l multiz4way*.tab # 5300158 multiz4way.tab # 1310955 multiz4waySummary.tab # 6611113 total ######################################################################### # LASTZ mouse/mm10 vs. chicken/galGal5 - (DONE - 2016-04-20 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 cd /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 printf "# Mouse vs. 
chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal5 SEQ2_DIR=/hive/data/genomes/galGal5/galGal5.2bit SEQ2_LEN=/hive/data/genomes/galGal5/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 112m25.946s cat fb.mm10.chainGalGal5Link.txt # 102343350 bases of 2652783500 (3.858%) in intersection time (doRecipBest.pl -buildDir=`pwd` mm10 galGal5) > rbest.log 2>&1 & # real 170m24.948s # and for the swap: mkdir /hive/data/genomes/galGal5/bed/blastz.mm10.swap cd /hive/data/genomes/galGal5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal5.2016-04-20/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 12m17.175s cat fb.galGal5.chainMm10Link.txt # 95753452 bases of 1218501075 (7.858%) in intersection time (doRecipBest.pl -buildDir=`pwd` galGal5 mm10) > rbest.log 2>&1 # real 138m37.610s ######################################################################### # LASTZ mouse/mm10 vs. 
Malayan flying lemur/galVar1 - (DONE - 2016-04-26 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 cd /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 printf "# mouse vs Malayan flying lemur BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_O=400 BLASTZ_E=30 BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Malayan flying lemur galVar1 SEQ2_DIR=/hive/data/genomes/galVar1/galVar1.2bit SEQ2_LEN=/hive/data/genomes/galVar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=400 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26 TMPDIR=/dev/shm " > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 340m23.106s cat fb.mm10.chainGalVar1Link.txt # 944876157 bases of 2652783500 (35.618%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 galVar1) \ > rbest.log 2>&1 & # real 694m27.183s # and for the swap: mkdir /hive/data/genomes/galVar1/bed/blastz.mm10.swap cd /hive/data/genomes/galVar1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalVar1.2016-04-26/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 173m45.678s cat fb.galVar1.chainMm10Link.txt # 1008272821 bases of 2802917674 (35.972%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` galVar1 mm10) \ > rbest.log 2>&1 # real 856m16.458s ######################################################################### # lastz Chinese softshell turtle pelSin1 (DONE - 2016-05-10 - Hiram) # establish a screen to 
control this job with a name to indicate what it is screen -S mm10PelSin1 mkdir /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 cd /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 printf '# Mouse vs. Chinese softshell turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Chinese softshell turtle pelSin1 SEQ2_DIR=/hive/data/genomes/pelSin1/pelSin1.2bit SEQ2_LEN=/hive/data/genomes/pelSin1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=100 BASE=/hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 156m43.981s cat fb.mm10.chainPelSin1Link.txt # 113023930 bases of 2652783500 (4.261%) in intersection # forgot to include syntenicNet: time (doBlastzChainNet.pl -verbose=2 \ -continue=syntenicNet -syntenicNet `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > synNet.log 2>&1 & # real 2m9.196s time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 pelSin1) \ > rbest.log 2>&1 & # real 221m37.947s # and for the swap mkdir /hive/data/genomes/pelSin1/bed/blastz.mm10.swap cd /hive/data/genomes/pelSin1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPelSin1.2016-05-10/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \ > swap.log 2>&1 # real 16m3.703s cat fb.pelSin1.chainMm10Link.txt # 102485355 
bases of 2106639384 (4.865%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` pelSin1 mm10) \ > rbest.log 2>&1 # real 198m33.448s ######################################################################### # LASTZ mouse/mm10 Bonobo/panPan2 - (DONE - 2016-05-24 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 cd /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 printf '# mouse vs bonobo BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: bonobo panPan2 SEQ2_DIR=/hive/data/genomes/panPan2/panPan2.2bit SEQ2_LEN=/hive/data/genomes/panPan2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 360m9.534s cat fb.mm10.chainPanPan2Link.txt # 928638440 bases of 2652783500 (35.006%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 panPan2) \ > rbest.log 2>&1 & # real 765m26.648s # and for the swap: mkdir /hive/data/genomes/panPan2/bed/blastz.mm10.swap cd /hive/data/genomes/panPan2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanPan2.2016-05-24/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 106m54.032s cat fb.panPan2.chainMm10Link.txt # 911279510 bases of 2725937399 (33.430%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` panPan2 mm10) \ > rbest.log 2>&1 # real 620m0.039s
######################################################################### 2016-07-22: import of UCSC GENCODE group processing of GENCODE VM10 (markd) # will not be pushed to the RR. # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM10 pushd /hive/data/genomes/mm10/bed/gencodeVM10 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. Results are in gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM9.ra mouse/mm10/wgEncodeGencodeVM10.ra cp mouse/mm10/wgEncodeGencodeVM9.html mouse/mm10/wgEncodeGencodeVM10.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM10.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. [ONLY if it's going to be pushed] # update all.joiner and validate # look for the last section `begin Gencode V??' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM10 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ######################################################################### # LASTZ mouse/mm10 Chimp/panTro5 - (DONE - 2016-08-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 cd /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 printf '# mouse vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: chimp panTro5 SEQ2_DIR=/hive/data/genomes/panTro5/panTro5.2bit SEQ2_LEN=/hive/data/genomes/panTro5/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 273m27.335s cat fb.mm10.chainPanTro5Link.txt # 935711523 bases of 2652783500 (35.273%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 panTro5) \ > rbest.log 2>&1 & # real 624m28.225s # and for the swap: mkdir /hive/data/genomes/panTro5/bed/blastz.mm10.swap cd /hive/data/genomes/panTro5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanTro5.2016-08-03/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 98m32.623s cat fb.panTro5.chainMm10Link.txt # 965636631 bases of 3132620660 (30.825%) in intersection time (doRecipBest.pl 
-workhorse=hgwdev -buildDir=`pwd` panTro5 mm10) \ > rbest.log 2>&1 # real 560m21.432s ######################################################################### # Crispr track. See ../crisprTrack/README.txt (2016-09-15 max) # Command: doCrispr.sh mm10 ensGene ############################################################################## ######################################################################### 2016-10-27: import of UCSC GENCODE group processing of GENCODE VM11 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM11 cd /hive/data/genomes/mm10/bed/gencodeVM11 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk cmpRelease >gencode-cmp.tsv ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM8.ra mouse/mm10/wgEncodeGencodeVM11.ra cp mouse/mm10/wgEncodeGencodeVM8.html mouse/mm10/wgEncodeGencodeVM11.html # edit these plus mouse/mm10/trackDb.wgEncode.ra # - set priorities in wgEncodeGencodeVM11.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # update all.joiner and validate # look for the last section `begin Gencode V??' 
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM11 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################## 2016-12-08: import of UCSC GENCODE group processing of GENCODE VM12 (markd) Not being pushed to RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM12 cd /hive/data/genomes/mm10/bed/gencodeVM12 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv- check to see if sizes make sense ## Copy and update trackDb files from previous release. ## Change version and use lower priority so it sorts to top of ## super track page. Follow instructions in ra file to ensure ## filters are correct. cd kent/src/hg/makeDb/trackDb cp mouse/mm10/wgEncodeGencodeVM11.ra mouse/mm10/wgEncodeGencodeVM12.ra cp mouse/mm10/wgEncodeGencodeVM11.html mouse/mm10/wgEncodeGencodeVM12.html # edit these plus mouse/mm10/trackDb.ra # - set priorities in wgEncodeGencodeVM12.ra tracks so newest shows up first # priority - set to previous version priority minus 0.001 # searchPriority - set each to previous minus 0.001 # - make current track default to pack and hide previous [ONLY if it's going to be pushed] # superTrack wgEncodeGencodeSuper pack # - Update wgEncodeGencodeSuper.html to describe new release and to # pick up other updates. # DID NOT UPDATE all.joiner SINCE NOT BEING PUSHED PUBLIC # update all.joiner and validate # look for the last section `begin Gencode V??'
in all.joiner # and copy and update version # repeat this until happy, editing minCheck as needed # output in check/joiner.out cd /hive/data/genomes/mm10/bed/gencodeVM12 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck ############################################################################################ # Mouse strains VCF (DONE - 2016-11-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/strainsVCF cd /hive/data/genomes/mm10/bed/strainsVCF # download files: wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz.tbi wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.snps_all.dbSNP142.vcf.gz.tbi wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.indels.dbSNP142.normed.vcf.gz wget --timestamping \ ftp://ftp-mouse.sanger.ac.uk/REL-1505-SNPs_Indels/mgp.v5.merged.snps_all.dbSNP142.vcf.gz # change to UCSC chrom names: zcat mgp.v5.merged.snps_all.dbSNP142.vcf.gz \ | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \ > ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf # need to fixup the chrom names in the header, extract the header: grep "^#" ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf > original.header.txt # copy that and edit it to fixup the names: cp original.header.txt ucscNames.header.txt # extract the lines not in the header grep -v "^#" ucscNames.mgp.v5.merged.snps_all.dbSNP142.vcf > ucscNames.notHeader.txt # put it back together: cat ucscNames.header.txt ucscNames.notHeader.txt > ucsc.mgpV5MergedSNPsAlldbSNP142.vcf # tabix gzip (about 2 hours) export name="ucsc.mgpV5MergedSNPsAlldbSNP142.vcf" /cluster/bin/tabix-0.2.6/bgzip $name /cluster/bin/tabix-0.2.6/tabix -p vcf $name.gz # symlink to gbdb mkdir /gbdb/mm10/mouseStrains ln -s `pwd`/ucsc.mgpV5MergedSNPsAlldbSNP142.vcf.gz \ /gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz ln -s `pwd`/ucsc.mgpV5MergedSNPsAlldbSNP142.vcf.gz.tbi \
/gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz.tbi hgBbiDbLink mm10 strainSNPs /gbdb/mm10/mouseStrains/mgpV5MergedSNPsAlldbSNP142.vcf.gz # trackDb entry in trackDb/mouse/mm10/trackDb.ra: track strainSNPs shortLabel Mouse SNPs longLabel Annotated SNPs from mouse strain comparison analysis group varRep type vcfTabix visibility hide hapClusterHeight 78 ############################################################################# # lastz turkey melGal5 (DONE - 2017-01-19 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10MelGal5 mkdir /hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 cd /hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 printf '# Mouse vs. turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turkey melGal5 SEQ2_DIR=/hive/data/genomes/melGal5/melGal5.2bit SEQ2_LEN=/hive/data/genomes/melGal5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 160m46.030s cat fb.mm10.chainMelGal5Link.txt # 94675126 bases of 2652783500 (3.569%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 melGal5) \ > rbest.log 2>&1 & # real 379m35.317s # and for the swap mkdir /hive/data/genomes/melGal5/bed/blastz.mm10.swap cd /hive/data/genomes/melGal5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ 
/hive/data/genomes/mm10/bed/lastzMelGal5.2017-01-19/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 31m37.466s cat fb.melGal5.chainMm10Link.txt # 81470789 bases of 1093044709 (7.454%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` melGal5 mm10) \ > rbest.log 2>&1 # real 356m16.099s ############################################################################# # LASTZ mouse/mm10 Pig-tailed macaque/macNem1 - (DONE - 2017-02-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 cd /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 printf '# mouse vs Pig-tailed macaque BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Pig-tailed macaque macNem1 SEQ2_DIR=/hive/data/genomes/macNem1/macNem1.2bit SEQ2_LEN=/hive/data/genomes/macNem1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 370m19.213s cat fb.mm10.chainMacNem1Link.txt # 918083212 bases of 2652783500 (34.608%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 macNem1) \ > rbest.log 2>&1 & # real 344m11.369s # and for the swap: mkdir /hive/data/genomes/macNem1/bed/blastz.mm10.swap cd /hive/data/genomes/macNem1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMacNem1.2017-02-28/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 65m14.074s cat fb.macNem1.chainMm10Link.txt # 905682728 bases of 2838503083 (31.907%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` macNem1 mm10) \ > rbest.log 2>&1 # real 321m2.285s ############################################################################# # LASTZ mouse/mm10 Angolan colobus/colAng1 - (DONE - 2017-02-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 cd /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 printf '# mouse vs Angolan colobus BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Angolan colobus colAng1 SEQ2_DIR=/hive/data/genomes/colAng1/colAng1.2bit SEQ2_LEN=/hive/data/genomes/colAng1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=30 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 376m8.949s cat fb.mm10.chainColAng1Link.txt # 902325064 bases of 2652783500 (34.014%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 colAng1) \ > rbest.log 2>&1 & # real 343m38.692s # and for the swap: mkdir /hive/data/genomes/colAng1/bed/blastz.mm10.swap cd /hive/data/genomes/colAng1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzColAng1.2017-02-28/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 62m44.125s cat fb.colAng1.chainMm10Link.txt # 885418780 
bases of 2679973137 (33.038%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` colAng1 mm10) \ > rbest.log 2>&1 # real 296m19.689s ############################################################################# # LASTZ mouse/mm10 Gray mouse lemur/micMur3 - (DONE - 2017-03-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 cd /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 printf '# mouse vs Gray mouse lemur BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=4 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Gray mouse lemur micMur3 SEQ2_DIR=/hive/data/genomes/micMur3/micMur3.2bit SEQ2_LEN=/hive/data/genomes/micMur3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 2192m13.661s cat fb.mm10.chainMicMur3Link.txt # 907817373 bases of 2652783500 (34.221%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 micMur3) \ > rbest.log 2>&1 & # real 522m5.587s # and for the swap: mkdir /hive/data/genomes/micMur3/bed/blastz.mm10.swap cd /hive/data/genomes/micMur3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzMicMur3.2017-03-03/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 71m4.702s cat fb.micMur3.chainMm10Link.txt # 905011854 bases of 2386321975 (37.925%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` micMur3 mm10) \ > rbest.log 
2>&1 # real 508m58.716s ############################################################################# # LASTZ mouse/mm10 Chinese tree shrew/tupChi1 - (DONE - 2017-03-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 cd /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 printf '# mouse vs Chinese tree shrew BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=4 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: Chinese tree shrew tupChi1 SEQ2_DIR=/hive/data/genomes/tupChi1/tupChi1.2bit SEQ2_LEN=/hive/data/genomes/tupChi1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 224m24.608s cat fb.mm10.chainTupChi1Link.txt # 683463709 bases of 2652783500 (25.764%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 tupChi1) \ > rbest.log 2>&1 & # real 385m2.239s # and for the swap: mkdir /hive/data/genomes/tupChi1/bed/blastz.mm10.swap cd /hive/data/genomes/tupChi1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzTupChi1.2017-03-09/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 71m4.702s cat fb.tupChi1.chainMm10Link.txt # 708757944 bases of 2706389135 (26.188%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` tupChi1 mm10) \ > rbest.log 2>&1 # real 508m10.564s ############################################################################# # LASTZ mouse/mm10
Chinese pangolin/manPen1 - (DONE - 2017-03-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 cd /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 printf '# Mouse vs. Chinese pangolin BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Chinese pangolin manPen1 SEQ2_DIR=/hive/data/genomes/manPen1/manPen1.2bit SEQ2_LEN=/hive/data/genomes/manPen1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=180 BASE=/hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > do.log 2>&1 # real 404m9.925s cat fb.mm10.chainManPen1Link.txt # 724400544 bases of 2652783500 (27.307%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 manPen1) \ > rbest.log 2>&1 & # real 499m21.668s # and for the swap: mkdir /hive/data/genomes/manPen1/bed/blastz.mm10.swap cd /hive/data/genomes/manPen1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -swap \ /hive/data/genomes/mm10/bed/lastzManPen1.2017-03-15/DEF \ -syntenicNet -fileServer=hgwdev \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku) > swap.log 2>&1 # real 71m4.702s cat fb.manPen1.chainMm10Link.txt # 710179682 bases of 1999066070 (35.526%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` manPen1 mm10) \ > rbest.log 2>&1 # real 495m7.361s ############################################################################# # LASTZ mouse/mm10 vs. Golden eagle/aquChr2 - (DONE - 2017-03-16 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 cd /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 printf "# Mouse vs.
Golden eagle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Golden eagle aquChr2 SEQ2_DIR=/hive/data/genomes/aquChr2/aquChr2.2bit SEQ2_LEN=/hive/data/genomes/aquChr2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 217m29.467s cat fb.mm10.chainAquChr2Link.txt # 105013175 bases of 2652783500 (3.959%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 aquChr2) \ > rbest.log 2>&1 & # real 196m24.435s # and for the swap: mkdir /hive/data/genomes/aquChr2/bed/blastz.mm10.swap cd /hive/data/genomes/aquChr2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAquChr2.2017-03-16/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 9m16.569s cat fb.aquChr2.chainMm10Link.txt # 89023131 bases of 1180019022 (7.544%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` aquChr2 mm10) \ > rbest.log 2>&1 # real 132m43.886s ######################################################################### # LASTZ bison bisBis1 (DONE - 2017-03-17 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 cd /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 printf '# Mouse vs. 
Bison BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # maximum M allowed with lastz is only 254 BLASTZ_M=254 # TARGET: Mouse mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=50 # QUERY: bison bisBis1 SEQ2_DIR=/hive/data/genomes/bisBis1/bisBis1.2bit SEQ2_LEN=/hive/data/genomes/bisBis1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=900 BASE=/hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 576m23.128s cat fb.mm10.chainBisBis1Link.txt # 688337604 bases of 2652783500 (25.948%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 bisBis1) \ > rbest.log 2>&1 & # real 430m48.078s # and the swap mkdir /hive/data/genomes/bisBis1/bed/blastz.mm10.swap cd /hive/data/genomes/bisBis1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBisBis1.2017-03-17/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 169m28.369s cat fb.bisBis1.chainMm10Link.txt # 682104798 bases of 2757854331 (24.733%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` bisBis1 mm10) \ > rbest.log 2>&1 # real 445m5.636s ############################################################################ # lastz frog xenTro9 (DONE - 2017-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenTro9 mkdir /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 cd /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 printf '# Mouse vs. 
frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: frog xenTro9 SEQ2_DIR=/hive/data/genomes/xenTro9/xenTro9.2bit SEQ2_LEN=/hive/data/genomes/xenTro9/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) \ > do.log 2>&1 & # real 806m23.459s cat fb.mm10.chainXenTro9Link.txt # 87053836 bases of 2652783500 (3.282%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 xenTro9) \ > rbest.log 2>&1 & # real 617m41.376s # and for the swap mkdir /hive/data/genomes/xenTro9/bed/blastz.mm10.swap cd /hive/data/genomes/xenTro9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenTro9.2017-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -swap -chainMinScore=5000 -chainLinearGap=loose) \ > swap.log 2>&1 & # real 25m54.516s cat fb.xenTro9.chainMm10Link.txt # 90150612 bases of 1369865365 (6.581%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenTro9 mm10) \ > rbest.log 2>&1 & # real 597m52.740s ######################################################################### # lastz frog xenLae2 (DONE - 2017-03-29 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10XenLae2 mkdir /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 cd /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 printf '# Mouse vs. 
frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # QUERY: frog xenLae2 SEQ2_DIR=/hive/data/genomes/xenLae2/xenLae2.2bit SEQ2_LEN=/hive/data/genomes/xenLae2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) \ > do.log 2>&1 & # real 1044m10.115s cat fb.mm10.chainXenLae2Link.txt # 82272699 bases of 2652783500 (3.101%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 xenLae2) \ > rbest.log 2>&1 & # real 656m46.337s # and for the swap mkdir /hive/data/genomes/xenLae2/bed/blastz.mm10.swap cd /hive/data/genomes/xenLae2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 26m14.884s time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzXenLae2.2017-03-29/DEF \ -continue=syntenicNet -workhorse=hgwdev -smallClusterHub=ku \ -bigClusterHub=ku -syntenicNet -swap -chainMinScore=5000 \ -chainLinearGap=loose) > syntenicNet.log 2>&1 & # real 1m52.642s cat fb.xenLae2.chainMm10Link.txt # 116001603 bases of 2408724787 (4.816%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenLae2 mm10) \ > rbest.log 2>&1 & # real 746m4.542s ######################################################################### # lastz turtle chrPic2 (DONE - 2017-04-05 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S 
mm10ChrPic2 mkdir /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 cd /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 printf '# Mouse vs. turtle BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: turtle chrPic2 SEQ2_DIR=/hive/data/genomes/chrPic2/chrPic2.2bit SEQ2_LEN=/hive/data/genomes/chrPic2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=300 BASE=/hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 865m16.816s # ku difficulties due to /dev/shm/ being full, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=cat -chainMinScore=5000 -chainLinearGap=loose) > cat.log 2>&1 & # real 13m13.959s # one big chain causing trouble, continuing: time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -continue=chainMerge -chainMinScore=5000 -chainLinearGap=loose) > chainMerge.log 2>&1 & # real 11m47.232s cat fb.mm10.chainChrPic2Link.txt # 112560591 bases of 2652783500 (4.243%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 chrPic2) \ > rbest.log 2>&1 & # real 114m27.445s # and for the swap mkdir /hive/data/genomes/chrPic2/bed/blastz.mm10.swap cd /hive/data/genomes/chrPic2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzChrPic2.2017-04-05/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 12m2.676s cat 
fb.chrPic2.chainMm10Link.txt # 106063993 bases of 2173204089 (4.881%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` chrPic2 mm10) \ > rbest.log 2>&1 & # real 110m9.546s ######################################################################### 2017-04-16: import of UCSC GENCODE group processing of GENCODE VM13 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM13 pushd /hive/data/genomes/mm10/bed/gencodeVM13 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M13 88 'March 2017' # edit mouse/mm10/trackDb.ra to add new .ra file include make DBS=mm10 # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. 
[ONLY if it's going to be pushed] # edit all.joiner to add ~/tmp/gencodeVM13.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM13 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all ############################################################################## # LASTZ Chinese hamster ovary cell line CHO-K1 criGriChoV1 # (DONE - 2017-04-13 - Hiram) # establish a screen to control this job screen -S mm10criGriChoV1 mkdir /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 cd /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 printf '# Chinese hamster ovary cell line vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chinese hamster ovary cell line CHO-K1 criGriChoV1 SEQ2_DIR=/hive/data/genomes/criGriChoV1/criGriChoV1.2bit SEQ2_LEN=/hive/data/genomes/criGriChoV1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=250 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -noDbNameCheck -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 575m28.254s cat fb.mm10.chainCriGriChoV1Link.txt # 1553371182 bases of 2652783500 (58.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 criGriChoV1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 732m16.081s mkdir /hive/data/genomes/criGriChoV1/bed/blastz.mm10.swap cd /hive/data/genomes/criGriChoV1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCriGriChoV1.2017-04-13/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 157m21.977s cat 
fb.criGriChoV1.chainMm10Link.txt # 1513594461 bases of 2318132242 (65.294%) in intersection time (doRecipBest.pl -workhorse=hgwdev criGriChoV1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 769m8.998s ############################################################################## ## 4-Way Multiz (DONE - 2017-04-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way # from the 213-way in the source tree, select out the 5 used here: /cluster/bin/phast/tree_doctor \ --prune-all-but hg38,galVar1,mm10,tupChi1 \ /cluster/home/hiram/kent/src/hg/utils/phyloTrees/213way.nh \ > mm10.4way.nh cat mm10.4way.nh # ((hg38:0.143908,(tupChi1:0.120000,galVar1:0.080000):0.054937):0.002000, mm10:0.356483); # using TreeGraph2 on Mac desktop to rearrange tree to get mm10 at top: # (mm10:0.356483,(hg38:0.143908,(tupChi1:0.12,galVar1:0.08):0.054937):0.002); # what that looks like: ~/kent/src/hg/utils/phyloTrees/asciiTree.pl mm10.4way.nh | sed -e 's/^/# /;' # (mm10:0.356483, # (hg38:0.143908, # (tupChi1:0.12, # galVar1:0.08):0.054937):0.002); # extract species list from that .nh file sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.4way.nh | xargs echo | sed 's/ //g; s/,/ /g' \ | sed 's/[()]//g; s/,/ /g' | tr '[ ]' '[\n]' > species.list.txt # construct db to name translation list: cat species.list.txt | while read DB do hgsql -N -e "select name,organism from dbDb where name=\"${DB}\";" hgcentraltest done | sed -e "s/\t/->/; s/ /_/g;" | sed -e 's/$/;/' | sed -e 's/\./_/g' \ | sed -e 's/-nosed/_nosed/; s/-eating/_eating/;' > db.to.name.txt # construct a common name .nh file: /cluster/bin/phast/tree_doctor --rename \ "`cat db.to.name.txt`" mm10.4way.nh | sed -e 's/00*)/)/g; s/00*,/,/g' \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.commonNames.nh cat mm10.4way.commonNames.nh | sed -e 's/^/# /;' # (Mouse:0.356483, # (Human:0.143908, # 
(Chinese_tree_shrew:0.12, # Malayan_flying_lemur:0.08):0.054937):0.002); # Use this specification in the phyloGif tool: # http://genome.ucsc.edu/cgi-bin/phyloGif # to obtain a png image for src/hg/htdocs/images/phylo/hg38_4way.png ~/kent/src/hg/utils/phyloTrees/asciiTree.pl mm10.4way.nh > t.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh t.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.scientificNames.nh rm -f t.nh cat mm10.4way.scientificNames.nh | sed -e 's/^/# /;' # (Mus_musculus:0.356483, # (Homo_sapiens:0.143908, # (Tupaia_chinensis:0.12, # Galeopterus_variegatus:0.08):0.054937):0.002); /cluster/bin/phast/all_dists mm10.4way.nh | grep mm10 \ | sed -e "s/mm10.//" | sort -k2n > 4way.distances.txt # Use this output to create the table below cat 4way.distances.txt | sed -e 's/^/# /;' # galVar1 0.493420 # hg38 0.502391 # tupChi1 0.533420 printf '#!/usr/bin/env perl use strict; use warnings; open (FH, "<4way.distances.txt") or die "can not read 4way.distances.txt"; my $count = 0; while (my $line = <FH>) { chomp $line; my ($D, $dist) = split('"'"'\\s+'"'"', $line); my $chain = "chain" . ucfirst($D); my $B="/hive/data/genomes/mm10/bed/lastz.$D/fb.mm10." . $chain . 
"Link.txt"; my $chainLinkMeasure = `awk '"'"'{print \\$5}'"'"' ${B} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $chainLinkMeasure; $chainLinkMeasure = 0.0 if (length($chainLinkMeasure) < 1); $chainLinkMeasure =~ s/\\%%//; my $swapFile="/hive/data/genomes/${D}/bed/lastz.mm10/fb.${D}.chainMm10Link.txt"; my $swapMeasure = "N/A"; if ( -s $swapFile ) { $swapMeasure = `awk '"'"'{print \\$5}'"'"' ${swapFile} 2> /dev/null | sed -e "s/(//; s/)//"`; chomp $swapMeasure; $swapMeasure = 0.0 if (length($swapMeasure) < 1); $swapMeasure =~ s/\\%%//; } my $orgName= `hgsql -N -e "select organism from dbDb where name='"'"'$D'"'"';" hgcentraltest`; chomp $orgName; if (length($orgName) < 1) { $orgName="N/A"; } ++$count; printf "# %%02d %%.4f (%%%% %%06.3f) (%%%% %%06.3f) - %%s %%s\\n", $count, $dist, $chainLinkMeasure, $swapMeasure, $orgName, $D; } close (FH); ' > sizeStats.pl chmod +x ./sizeStats.pl ./sizeStats.pl # If you can fill in all the numbers in this table, you are ready for # the multiple alignment procedure # featureBits chainLink measures # chainLink # N distance on hg38 on other other species # 01 0.4934 (% 35.618) (% 35.972) - Malayan flying lemur galVar1 # 02 0.5024 (% 35.372) (% 31.653) - Human hg38 # 03 0.5334 (% 25.764) (% 26.188) - Chinese tree shrew tupChi1 # None of this concern for distances matters in building the first step, the # maf files. The distances will be better calibrated later. 
# create species list and stripped down tree for autoMZ sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ mm10.4way.nh | xargs echo | sed 's/ //g; s/,/ /g' > tree.nh sed 's/[()]//g; s/,/ /g' tree.nh > species.list # mm10 hg38 tupChi1 galVar1 # survey N50 for each for db in `cat species.list` do n50.pl /hive/data/genomes/$db/chrom.sizes done # reading: /hive/data/genomes/mm10/chrom.sizes # contig count: 455, total size: 3209286105, one half size: 1604643052 # reading: /hive/data/genomes/mm10/chrom.sizes # contig count: 66, total size: 2730871774, one half size: 1365435887 # cumulative N50 count contig contig size 1312176979 8 chr7 145441459 1365435887 one half size 1442871972 9 chr10 130694993 # reading: /hive/data/genomes/hg38/chrom.sizes # contig count: 455, total size: 3209286105, one half size: 1604643052 # cumulative N50 count contig contig size 1547391171 8 chrX 156040895 1604643052 one half size 1692529807 9 chr8 145138636 # reading: /hive/data/genomes/tupChi1/chrom.sizes # contig count: 50750, total size: 2846580235, one half size: 1423290117 # cumulative N50 count contig contig size 1419920836 231 KB321095 3691413 1423290117 one half size 1423590960 232 KB321106 3670124 # reading: /hive/data/genomes/galVar1/chrom.sizes # contig count: 179514, total size: 3187660572, one half size: 1593830286 # cumulative N50 count contig contig size 1593691350 3422 NW_007730159v1 245222 1593830286 one half size 1593936539 3423 NW_007729331v1 245189 # bash shell syntax here ... 
cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way export H=/hive/data/genomes/mm10/bed mkdir mafLinks # good assemblies can use syntenic net: # hg38 for G in hg38 do mkdir mafLinks/$G echo 'ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G' ln -s ${H}/lastz.$G/mafSynNet/*.maf.gz ./mafLinks/$G done # other assemblies using recip best net: # galVar1 tupChi1 for G in galVar1 tupChi1 do mkdir mafLinks/$G echo ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G ln -s ${H}/lastz.$G/mafRBestNet/*.maf.gz ./mafLinks/$G done # verify the symLinks are good: ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | head # 52322575 Apr 10 2015 mafLinks/hg38/chr1.maf.gz # 35696060 Apr 10 2015 mafLinks/hg38/chr10.maf.gz # 36383118 Apr 10 2015 mafLinks/hg38/chr11.maf.gz ls -ogrtL mafLinks/*/* | sed -e 's/^/# /; s/-rw-rw-r-- 1//;' | tail # 3104 Mar 10 00:08 mafLinks/tupChi1/chrUn_GL456379.maf.gz # 143 Mar 10 00:08 mafLinks/tupChi1/chrUn_GL456381.maf.gz # 1221 Mar 10 00:08 mafLinks/tupChi1/chrUn_GL456382.maf.gz XXX - do not need to split - Thu Apr 20 15:02:02 PDT 2017 mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/splitRun cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/splitRun # construct a list of all possible maf file names.
# List every distinct maf file name found under ../mafLinks (basename only,
# .gz suffix stripped).  gensub2 below turns each entry into one cluster job.
find ../mafLinks | grep maf.gz | sed -e 's#../mafLinks/##;' \
    | xargs -L 1 basename | sed -e 's/.gz//;' | sort -u > maf.list
wc -l maf.list
#   52 maf.list

mkdir maf run
cd run
# Local copies of the multiz programs; autoMultiz.csh puts $run/penn first
# on its path so autoMZ finds multiz and maf_project here.
mkdir penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/multiz penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/maf_project penn
cp -p /cluster/bin/penn/multiz.2009-01-21_patched/autoMZ penn

# Write the per-file job script:  autoMultiz.csh <maf name> <result file>
#   - set the db and pairs directories here
#   - works in /dev/shm/$db/multiz.<maf name>, removed again at the end
#   - for each non-reference species the pairwise maf is unpacked (or
#     linked); a missing or empty input is replaced by a bare maf header
#   - autoMZ output is copied to <result file>
# The script is emitted by printf only; no here-doc is opened, so nothing
# after this command can be swallowed as here-doc text.  The csh line
# continuation is written as \\ so that printf emits a single backslash.
printf '#!/bin/csh -ef
set db = mm10
set c = $1
set result = $2
set run = `/bin/pwd`
set tmp = /dev/shm/$db/multiz.$c
set pairs = /hive/data/genomes/mm10/bed/tupChi1Multiz4way/mafLinks
/bin/rm -fr $tmp
/bin/mkdir -p $tmp
/bin/cp -p ../../tree.nh ../../species.list $tmp
pushd $tmp > /dev/null
foreach s (`/bin/sed -e "s/$db //" species.list`)
    set in = $pairs/$s/$c
    set out = $db.$s.sing.maf
    if (-e $in.gz) then
        /bin/zcat $in.gz > $out
        if (! -s $out) then
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    else if (-e $in) then
        /bin/ln -s $in $out
    else
        echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($run/penn $path); rehash
$run/penn/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c \\
        > /dev/null
popd > /dev/null
/bin/rm -f $result
/bin/cp -p $tmp/$c $result
/bin/rm -fr $tmp
' > autoMultiz.csh
chmod +x autoMultiz.csh

# gensub2 template: one autoMultiz.csh job per maf.list entry; parasol
# checks that the named output file exists with at least one line.
printf '#LOOP
./autoMultiz.csh $(file1) {check out line+ /hive/data/genomes/mm10/bed/tupChi1Multiz4way/splitRun/maf/$(root1).maf}
#ENDLOOP
' > template

ln -s ../maf.list maf.list

ssh ku
cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/splitRun/run
gensub2 maf.list single template jobList
para create jobList
para try ... check ... push ... etc...
# Completed: 52 of 52 jobs # CPU time in finished jobs: 44671s 744.52m 12.41h 0.52d 0.001 y # IO & Wait Time: 1129s 18.81m 0.31h 0.01d 0.000 y # Average job time: 881s 14.68m 0.24h 0.01d # Longest finished job: 3537s 58.95m 0.98h 0.04d # Submission to last job: 5634s 93.90m 1.56h 0.07d # combine into one file (the 1>&2 redirect sends the echo to stderr) cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way head -1 splitRun/maf/chr2.maf > tupChi1Multiz4way.maf time for F in splitRun/maf/*.maf do echo "${F}" 1>&2 egrep -v "^#" ${F} done >> tupChi1Multiz4way.maf # real 0m16.400s tail -1 splitRun/maf/chr2.maf >> tupChi1Multiz4way.maf # -rw-rw-r-- 1 5228617390 Apr 20 17:41 tupChi1Multiz4way.maf # Load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way mkdir /gbdb/mm10/tupChi1Multiz4way ln -s `pwd`/tupChi1Multiz4way.maf /gbdb/mm10/tupChi1Multiz4way cd /dev/shm time hgLoadMaf mm10 tupChi1Multiz4way # Loaded 5635229 mafs in 1 files from /gbdb/mm10/tupChi1Multiz4way # real 1m26.208s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 tupChi1Multiz4waySummary \ /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # Created 743966 summary blocks from 10080651 components and 5635229 mafs from /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # real 1m45.053s # -rw-rw-r-- 1 294659136 Apr 20 21:40 tupChi1Multiz4way.tab # -rw-rw-r-- 1 34525860 Apr 20 22:09 tupChi1Multiz4waySummary.tab wc -l tupChi1Multiz4way*.tab # 5635229 tupChi1Multiz4way.tab # 743966 tupChi1Multiz4waySummary.tab rm tupChi1Multiz4way*.tab ############################################################################## # GAP ANNOTATE MULTIZ4WAY MAF AND LOAD TABLES (DONE - 2017-04-20 - Hiram) # mafAddIRows has to be run on single chromosome maf files, it does not # function correctly when more than one reference sequence # are in a single file.
Need to split the maf file into individual # maf files mkdir -p /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno/mafSplit cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno/mafSplit time mafSplit -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../tupChi1Multiz4way.maf # real 1m25.202s find . -type f | wc -l # 52 # check for N.bed files everywhere: cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno for DB in `cat ../species.list` do if [ ! -s /hive/data/genomes/${DB}/${DB}.N.bed ]; then echo "MISS: ${DB}" # cd /hive/data/genomes/${DB} # twoBitInfo -nBed ${DB}.2bit ${DB}.N.bed else echo " OK: ${DB}" fi done cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno for DB in `cat ../species.list` do echo "${DB} " ln -s /hive/data/genomes/${DB}/${DB}.N.bed ${DB}.bed echo ${DB}.bed >> nBeds ln -s /hive/data/genomes/${DB}/chrom.sizes ${DB}.len echo ${DB}.len >> sizes done # make sure they all are successful symLinks: ls -ogrtL screen -S gapAnno # use a screen to control this longish job ssh ku cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno mkdir result find ./mafSplit -type d | sed -e 's#./mafSplit/##' | while read D do echo mkdir -p result/${D} mkdir -p result/${D} done printf '#LOOP mafAddIRows -nBeds=nBeds mafSplit/$(path1) /hive/data/genomes/mm10/mm10.2bit {check out exists+ result/$(path1)} #ENDLOOP ' > template # << happy emacs find ./mafSplit -type f | sed -e 's#^./mafSplit/##' > maf.list gensub2 maf.list single template jobList # there isn't the usual job limit problem here, only 52 jobs para create jobList para try ... check ... push ...
# Completed: 52 of 52 jobs # CPU time in finished jobs: 749s 12.48m 0.21h 0.01d 0.000 y # IO & Wait Time: 119s 1.99m 0.03h 0.00d 0.000 y # Average job time: 17s 0.28m 0.00h 0.00d # Longest finished job: 65s 1.08m 0.02h 0.00d # Submission to last job: 110s 1.83m 0.03h 0.00d # verify all result files have some content, look for 0 size files: find ./result -type f -size 0 # should see none # or in this manner: find ./result -type f | xargs ls -og | sort -k3nr | tail # combine into one file (the 1>&2 redirect sends the echo to stderr) head -q -n 1 result/4/1/chrUn_GL456381.maf > mm10.4way.maf time find ./result -type f | while read F do echo "${F}" 1>&2 grep -h -v "^#" ${F} done >> mm10.4way.maf # real 0m33.237s # these maf files do not have the end marker, this does nothing: # tail -q -n 1 result/4/0/NW_007804317v1.maf >> mm10.4way.maf # How about an official end marker: echo "##eof maf" >> mm10.4way.maf ls -og # -rw-rw-r-- 1 7580362629 Apr 20 22:27 mm10.4way.maf du -hsc mm10.4way.maf # 7.1G mm10.4way.maf # construct symlinks to get the individual maf files into gbdb: rm /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # remove previous results ln -s `pwd`/mm10.4way.maf /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # Load into database cd /dev/shm time hgLoadMaf -pathPrefix=/gbdb/mm10/tupChi1Multiz4way mm10 tupChi1Multiz4way # Loaded 6931895 mafs in 1 files from /gbdb/mm10/tupChi1Multiz4way # real 1m59.548s time hgLoadMafSummary -verbose=2 -minSize=30000 \ -mergeGap=1500 -maxSize=200000 mm10 tupChi1Multiz4waySummary \ /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # Created 743966 summary blocks from 10080651 components and 6931895 mafs from /gbdb/mm10/tupChi1Multiz4way/tupChi1Multiz4way.maf # real 2m14.237s # -rw-rw-r-- 1 362918923 Apr 20 22:30 tupChi1Multiz4way.tab # -rw-rw-r-- 1 36013792 Apr 20 22:33 tupChi1Multiz4waySummary.tab rm tupChi1Multiz4way*.tab ###################################################################### # MULTIZ4WAY MAF FRAMES (DONE
- 2017-04-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/frames cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/frames # survey all the genomes to find out what kinds of gene tracks they have printf '#!/bin/csh -fe foreach db (`cat ../species.list`) printf "# ${db}: " set tables = `hgsql $db -N -e "show tables" | egrep "Gene|ncbiRefSeq"` foreach table ($tables) if ($table == "ensGene" || $table == "refGene" || \ $table == "ncbiRefSeq" || $table == "mgcGenes" || \ $table == "knownGene" || $table == "xenoRefGene" ) then set count = `hgsql $db -N -e "select count(*) from $table"` echo -n "${table}: ${count}, " endif end set orgName = `hgsql hgcentraltest -N -e \ "select scientificName from dbDb where name='"'"'$db'"'"'"` set orgId = `hgsql $db -N -e \ "select id from organism where name='"'"'$orgName'"'"'"` if ($orgId == "") then echo "Mrnas: 0" else set count = `hgsql $db -N -e "select count(*) from gbCdnaInfo where organism=$orgId"` echo "Mrnas: ${count}" endif end ' > showGenes.csh chmod +x ./showGenes.csh time ./showGenes.csh # mm10: ensGene: 103734, knownGene: 63759, mgcGenes: 26777, ncbiRefSeq: 107894, refGene: 36869, xenoRefGene: 179145, Mrnas: 5367574 # hg38: ensGene: 208239, knownGene: 197782, mgcGenes: 35305, ncbiRefSeq: 159322, refGene: 69527, xenoRefGene: 184852, Mrnas: 11481766 # tupChi1: refGene: 206, xenoRefGene: 343637, Mrnas: 50709 # galVar1: ncbiRefSeq: 41547, xenoRefGene: 499145, Mrnas: 0 # real 0m41.291s # from that summary, use these gene sets: # knownGene - hg38 mm10 # ncbiRefSeq - galVar1 # xenoRefGene - tupChi1 mkdir genes # 1. knownGene: hg38 mm10 for DB in hg38 mm10 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > genes/${DB}.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # hg38: checked: 21375 failed: 0 # mm10: checked: 21100 failed: 0 # 2. 
xenoRefGene: tupChi1 for DB in tupChi1 do hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from xenoRefGene" ${DB} \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz printf "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # tupChi1: checked: 30481 failed: 0 # 3. ncbiRefSeq for galVar1 for DB in galVar1 do hgsql -N -e "select * from ncbiRefSeq" ${DB} | cut -f2- \ | genePredSingleCover stdin stdout | gzip -2c \ > /dev/shm/${DB}.tmp.gz mv /dev/shm/${DB}.tmp.gz genes/$DB.gp.gz echo -n "# ${DB}: " genePredCheck -db=${DB} genes/${DB}.gp.gz done # galVar1: checked: 23389 failed: 0 # verify counts for genes are reasonable: for T in genes/*.gz do echo -n "# $T: " zcat $T | cut -f1 | sort | uniq -c | wc -l done # genes/galVar1.gp.gz: 23054 # genes/hg38.gp.gz: 21375 # genes/mm10.gp.gz: 21100 # genes/tupChi1.gp.gz: 25028 time (cat ../anno/mm10.4way.maf \ | genePredToMafFrames mm10 stdin stdout \ `cat ../species.list.txt | xargs echo \ | sed -e "s#\([a-zA-Z0-9]*\)#\1 genes/\1.gp.gz#g;"` \ | gzip > tupChi1Multiz4wayFrames.bed.gz) # real 1m35.311s # verify there are frames on everything, should be 4 species: zcat tupChi1Multiz4wayFrames.bed.gz | awk '{print $4}' | sort | uniq -c \ | sed -e 's/^/# /;' # 233262 galVar1 # 231021 hg38 # 190782 mm10 # 245209 tupChi1 # load the resulting file ssh hgwdev cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/frames time hgLoadMafFrames mm10 tupChi1Multiz4wayFrames tupChi1Multiz4wayFrames.bed.gz # real 0m9.566s time featureBits -countGaps mm10 tupChi1Multiz4wayFrames # 38594412 bases of 2730871774 (1.413%) in intersection # real 0m5.681s # enable the trackDb entries: # frames tupChi1Multiz4wayFrames # irows on # appears to work OK ######################################################################### # Phylogenetic tree from 4-way (DONE - 2017-04-20 - Hiram) mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/4d cd
/hive/data/genomes/mm10/bed/tupChi1Multiz4way/4d # using knownGene for mm10, only transcribed genes and nothing # from the randoms and other misc. hgsql -Ne "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene where cdsEnd > cdsStart;" mm10 \ | egrep -E -v "chrM|chrUn|random|_alt" > knownGene.gp wc -l *.gp # 93916 knownGene.gp # verify it is only on the chroms: cut -f2 knownGene.gp | sort | uniq -c | sort -rn | sed -e 's/^/ # /;' # 3949 chr2 # 3861 chr7 # 3496 chr11 # 2789 chr5 # 2782 chr4 # 2698 chr1 # 2585 chr9 # 2395 chr6 # 2304 chr3 # 2238 chr17 # 2206 chr8 # 2166 chr10 # 1930 chrX # 1773 chr14 # 1717 chr15 # 1654 chr13 # 1509 chr12 # 1496 chr19 # 1489 chr16 # 1125 chr18 # 193 chrY genePredSingleCover knownGene.gp stdout | sort > knownGeneNR.gp wc -l knownGeneNR.gp # 21054 knownGeneNR.gp genePredCheck -db=mm10 knownGeneNR.gp # checked: 21054 failed: 0 # the annotated maf is: og ../anno/mm10.4way.maf # -rw-rw-r-- 1 7580362629 Apr 20 22:27 ../anno/mm10.4way.maf mkdir annoSplit cd annoSplit time mafSplit -verbose=2 -outDirDepth=2 -byTarget -useFullSequenceName \ /dev/null . ../../anno/mm10.4way.maf # real 2m13.529s find . 
-type f | wc -l # 52 ssh ku mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/4d/run cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/4d/run mkdir ../mfa # newer versions of msa_view have a slightly different operation # the sed of the gp file inserts the reference species in the chr name printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set GP = knownGeneNR.gp set r = "/hive/data/genomes/mm10/bed/tupChi1Multiz4way" set c = $1:r set infile = $r/4d/annoSplit/$2 set outDir = $r/4d/mfa/$3:h set outfile = $r/4d/mfa/$3 /bin/mkdir -p $outDir cd /dev/shm /bin/awk -v C=$c '"'"'$2 == C {print}'"'"' $r/4d/$GP | sed -e "s/\\t$c\\t/\\tmm10.$c\\t/" > $c.gp set NL=`wc -l $c.gp| gawk '"'"'{print $1}'"'"'` echo $NL if ("$NL" != "0") then $PHASTBIN/msa_view --4d --features $c.gp -i MAF $infile -o SS > $c.ss $PHASTBIN/msa_view -i SS --tuple-size 1 $c.ss > $outfile else echo "" > $outfile endif /bin/rm -f /dev/shm/$c.gp /dev/shm/$c.ss ' > 4d.csh chmod +x 4d.csh find ../annoSplit -type f | sed -e "s#../annoSplit/##" > maf.list wc -l maf.list # 52 maf.list printf '#LOOP 4d.csh $(file1) $(path1) {check out line+ ../mfa/$(dir1)/$(dir2)$(root1).mfa} #ENDLOOP ' > template gensub2 maf.list single template jobList # do not have the usual problem with fast jobs here, only 52 of them total para create jobList para try ... 
check para time # Completed: 52 of 52 jobs # CPU time in finished jobs: 615s 10.26m 0.17h 0.01d 0.000 y # IO & Wait Time: 122s 2.03m 0.03h 0.00d 0.000 y # Average job time: 14s 0.24m 0.00h 0.00d # Longest finished job: 50s 0.83m 0.01h 0.00d # Submission to last job: 86s 1.43m 0.02h 0.00d # Not all results have contents, or finish successfully, that is OK # it is because not all contigs have genes, only gene sequences are measured # combine mfa files ssh hgwdev cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/4d # remove the broken empty files, size 0 and size 1: find ./mfa -type f -size 0 | xargs rm -f # sometimes this doesn't work, don't know why find ./mfa -type f -size 1 | xargs rm -f # when it doesn't, use this empty list procedure find ./mfa -type f | xargs ls -og | awk '$3 < 2' | awk '{print $NF}' \ > empty.list cat empty.list | xargs rm -f # see what is left: ls -ogrt mfa/*/*/*.mfa | sort -k3nr | wc # 21 147 1081 # want comma-less species.list time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_view \ --aggregate "`cat ../species.list`" mfa/*/*/*.mfa | sed s/"> "/">"/ \ > 4d.all.mfa # real 0m1.256s # check they are all in there: grep "^>" 4d.all.mfa | wc -l # 4 grep "^>" 4d.all.mfa | sed -e 's/^/# /;' # >mm10 # >hg38 # >tupChi1 # >galVar1 sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \ ../mm10.4way.nh | xargs echo | sed -e 's/ //g' > tree_commas.nh # tree_commas.nh looks like: # (mm10,(hg38,(tupChi1,galVar1))) # use phyloFit to create tree model (output is phyloFit.mod) time /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/phyloFit \ --EM --precision MED --msa-format FASTA --subst-mod REV \ --tree tree_commas.nh 4d.all.mfa # real 0m0.727s mv phyloFit.mod all.mod grep TREE all.mod # TREE: # (mm10:0.170506,(hg38:0.114771, # (tupChi1:0.187178,galVar1:0.105148):0.011794):0.170506); # compare these calculated lengths to the tree extracted from 213way: grep TREE all.mod | sed -e 's/TREE: //' \ | /cluster/bin/phast/all_dists
/dev/stdin | grep mm10 \ | sed -e "s/mm10.//;" | sort > new.dists /cluster/bin/phast/all_dists ../mm10.4way.nh | grep mm10 \ | sed -e "s/mm10.//;" | sort > old.dists # printing out the 'new', the 'old' the 'difference' and percent difference join new.dists old.dists | awk '{ printf "#\t%s\t%8.6f\t%8.6f\t%8.6f\t%8.6f\n", $1, $2, $3, $2-$3, 100*($2-$3)/$3 }' \ | sort -k3n # hg38 0.455783 0.502391 -0.046608 -9.277236 # galVar1 0.457954 0.493420 -0.035466 -7.187791 # tupChi1 0.539984 0.533420 0.006564 1.230550 ######################################################################### # phastCons 4-way (DONE - 2017-04-20 - Hiram) # split 4way mafs into 10M chunks and generate sufficient statistics # files for # phastCons ssh ku mkdir -p /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/SS cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/SS mkdir result done printf '#!/bin/csh -ef set d = $1 set c = $2 set doneDir = done/$d set MAF = /hive/data/genomes/mm10/bed/tupChi1Multiz4way/anno/result/$d/$c.maf set WINDOWS = /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/SS/result/$d/$c set WC = `cat $MAF | wc -l` set NL = `grep "^#" $MAF | wc -l` if ( -s $3 ) then exit 0 endif if ( -s $3.running ) then exit 0 endif /bin/mkdir -p $doneDir /bin/date >> $3.running /bin/rm -fr $WINDOWS /bin/mkdir -p $WINDOWS pushd $WINDOWS > /dev/null if ( $WC != $NL ) then /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/msa_split \\ $MAF -i MAF -o SS -r $WINDOWS/$c -w 10000000,0 -I 1000 -B 5000 endif popd > /dev/null /bin/date >> $3 /bin/rm -f $3.running ' > mkSS.csh chmod +x mkSS.csh printf '#LOOP mkSS.csh $(dir1) $(root1) {check out line+ done/$(dir1)/$(root1)} #ENDLOOP ' > template find ../../anno/result -type f | sed -e "s#../../anno/result/##" > maf.list wc -l maf.list # 52 maf.list ssh ku cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/SS gensub2 maf.list single template jobList # no need to worry about fast jobs, only 52 jobs here para create jobList para try ...
check ... etc para push # Completed: 52 of 52 jobs # CPU time in finished jobs: 1064s 17.74m 0.30h 0.01d 0.000 y # IO & Wait Time: 180s 3.00m 0.05h 0.00d 0.000 y # Average job time: 24s 0.40m 0.01h 0.00d # Longest finished job: 89s 1.48m 0.02h 0.00d # Submission to last job: 127s 2.12m 0.04h 0.00d find ./result -type f | wc -l # 290 # Run phastCons # This job is I/O intensive in its output files, beware where this # takes place or do not run too many at once. ssh ku mkdir -p /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/run.cons cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/run.cons # This is setup for multiple runs based on subsets, but only running # the 'all' subset here. # It triggers off of the current working directory # $cwd:t which is the "grp" in this script. Running: # all and vertebrates printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set c = $1 set d = $2 set f = $3 set len = $4 set cov = $5 set rho = $6 set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons set tmp = $cons/tmp/${d}_${c} mkdir -p $tmp set ssSrc = $cons/SS/result set useGrp = "$grp.mod" if (-s $cons/$grp/$grp.non-inf) then ln -s $cons/$grp/$grp.mod $tmp ln -s $cons/$grp/$grp.non-inf $tmp ln -s $ssSrc/$d/$f $tmp else ln -s $ssSrc/$d/$f $tmp ln -s $cons/$grp/$grp.mod $tmp endif pushd $tmp > /dev/null if (-s $grp.non-inf) then $PHASTBIN/phastCons $f $useGrp \ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --not-informative `cat $grp.non-inf` \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp else $PHASTBIN/phastCons $f $useGrp \\ --rho $rho --expected-length $len --target-coverage $cov --quiet \\ --seqname $c --idpref $c --most-conserved $c.bed --score > $c.pp endif popd > /dev/null mkdir -p pp/$d bed/$d sleep 4 touch pp/$d bed/$d rm -f pp/$d/$c.pp rm -f bed/$d/$c.bed mv $tmp/$c.pp pp/$d mv $tmp/$c.bed bed/$d rm -fr $tmp rmdir --ignore-fail-on-non-empty $cons/tmp/$d:h ' 
> doPhast.csh chmod +x doPhast.csh # this template will serve for all runs # root1 == chrom name, file1 == ss file name without .ss suffix printf '#LOOP ../run.cons/doPhast.csh $(root1) $(dir1) $(file1) 45 0.3 0.3 {check out line+ pp/$(dir1)/$(root1).pp} #ENDLOOP ' > template find ../SS/result -type f | sed -e "s#../SS/result/##" > ss.list wc -l ss.list # 290 ss.list # Create parasol batch and run it # run for all species cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons mkdir -p all cd all # Using the .mod tree cp -p ../../4d/all.mod ./all.mod gensub2 ../run.cons/ss.list single ../run.cons/template jobList para -maxJob=100 create jobList para try ... check ... para push # Completed: 290 of 290 jobs # CPU time in finished jobs: 5576s 92.93m 1.55h 0.06d 0.000 y # IO & Wait Time: 1995s 33.25m 0.55h 0.02d 0.000 y # Average job time: 26s 0.44m 0.01h 0.00d # Longest finished job: 33s 0.55m 0.01h 0.00d # Submission to last job: 66s 1.10m 0.02h 0.00d # create Most Conserved track cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/all time cut -f1 ../../../../chrom.sizes | while read C do ls -d bed/?/?/${C} 2> /dev/null | while read D do echo ${D}/${C}*.bed 1>&2 cat ${D}/${C}*.bed done | sort -k1,1 -k2,2n \ | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", "'${C}'", $2, $3, $5, $5;}' done > tmpMostConserved.bed # real 0m12.570s time /cluster/bin/scripts/lodToBedScore tmpMostConserved.bed \ > mostConserved.bed # real 0m7.235s # -rw-rw-r-- 1 28670932 Apr 21 00:01 tmpMostConserved.bed # -rw-rw-r-- 1 29438194 Apr 21 00:02 mostConserved.bed # load into database ssh hgwdev cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/all time hgLoadBed mm10 tupChi1PhastConsElements4way mostConserved.bed # Read 841312 elements of size 5 from mostConserved.bed # real 0m7.635s # on human we often try for 5% overall cov, and 70% CDS cov # most bets are off here for that goal, these alignments are too few # and too far between # --rho 0.3 --expected-length 45 --target-coverage 0.3 time 
featureBits mm10 -enrichment knownGene:cds tupChi1PhastConsElements4way # knownGene:cds 1.333%, tupChi1PhastConsElements4way 4.368%, both 0.924%, # cover 69.30%, enrich 15.86x # real 0m8.883s # Create merged posterior probability file and wiggle track data files cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/all mkdir downloads # the third sed fixes the chrom names, removing the partition extensions time (find ./pp -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | sed -e 's/\.[0-9][0-9]*-[0-9][0-9]* start/ start/' \ | gzip -c > downloads/phastCons4way.wigFix.gz) # real 13m32.808s # -rw-rw-r-- 1 1452731444 Apr 21 00:18 phastCons4way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phastCons4way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/mm10/chrom.sizes \ phastCons4way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=19728: VmPeak: 12564976 kB # real 17m36.198s bigWigInfo phastCons4way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 2,285,833,964 # primaryIndexSize: 63,248,068 # zoomLevels: 10 # chromCount: 37 # basesCovered: 1,155,614,560 # mean: 0.166872 # min: 0.000000 # max: 1.000000 # std: 0.286694 # encode those files into wiggle data time (zcat downloads/phastCons4way.wigFix.gz \ | wigEncode stdin phastCons4way.wig phastCons4way.wib) # Converted stdin, upper limit 1.00, lower limit 0.00 # real 6m26.433s du -hsc *.wi? # 1.1G phastCons4way.wib # 184M phastCons4way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phastCons4way.wib /gbdb/mm10/tupChi1Multiz4way/phastCons4way.wib time hgLoadWiggle -pathPrefix=/gbdb/mm10/tupChi1Multiz4way \ mm10 tupChi1PhastCons4way phastCons4way.wig # real 0m22.540s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly time wigTableStats.sh mm10 tupChi1PhastCons4way # db.table min max mean count sumData # mm10.tupChi1PhastCons4way 0 1 0.166872 1155614560 1.9284e+08 # stdDev viewLimits # 0.286694 viewLimits=0:1 # real 0m9.615s # Create histogram to get an overview of all the data time hgWiggle -doHistogram -db=mm10 \ -hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \ tupChi1PhastCons4way > histogram.data 2>&1 # real 1m9.916s # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse mm10 Histogram tupChi1PhastCons4way track" set xlabel " phastCons4way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set yrange [0:0.02] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \\ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & ######################################################################### # phyloP for 4-way (DONE - 2017-04-20 - Hiram) # run phyloP with score=LRT ssh ku mkdir /cluster/data/mm10/bed/tupChi1Multiz4way/consPhyloP cd /cluster/data/mm10/bed/tupChi1Multiz4way/consPhyloP mkdir run.phyloP cd run.phyloP # Adjust model file base composition background and rate matrix to be # representative of the chromosomes in play grep BACKGROUND ../../4d/all.mod | awk '{printf "%0.3f\n", $3 + $4}' # 0.571 /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin/modFreqs \ ../../4d/all.mod 0.571 > all.mod # verify, the BACKGROUND should now be paired up: grep BACK all.mod 
# BACKGROUND: 0.219000 0.281000 0.281000 0.219000 printf '#!/bin/csh -fe set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2010-12-30/bin set f = $1 set d = $f:h set file1 = $f:t set out = $2 set cName = $f:t:r set grp = $cwd:t set cons = /hive/data/genomes/mm10/bed/tupChi1Multiz4way/consPhyloP set tmp = $cons/tmp/$grp/$f /bin/rm -fr $tmp /bin/mkdir -p $tmp set ssSrc = "/hive/data/genomes/mm10/bed/tupChi1Multiz4way/cons/SS/result/$f" set useGrp = "$grp.mod" /bin/ln -s $cons/run.phyloP/$grp.mod $tmp pushd $tmp > /dev/null $PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \\ -i SS $useGrp $ssSrc.ss > $file1.wigFix popd > /dev/null /bin/mkdir -p $out:h sleep 4 /bin/touch $out:h /bin/mv $tmp/$file1.wigFix $out /bin/rm -fr $tmp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp/$d:h /bin/rmdir --ignore-fail-on-non-empty $cons/tmp/$grp /bin/rmdir --ignore-fail-on-non-empty $cons/tmp ' > doPhyloP.csh chmod +x doPhyloP.csh # Create list of chunks find ../../cons/SS/result -type f | grep ".ss$" \ | sed -e "s/.ss$//; s#^../../cons/SS/result/##" > ss.list # make sure the list looks good wc -l ss.list # 290 ss.list # Create template file # file1 == $chr/$chunk/file name without .ss suffix printf '#LOOP ../run.phyloP/doPhyloP.csh $(path1) {check out line+ wigFix/$(dir1)/$(file1).wigFix} #ENDLOOP ' > template ###################### Running all species ####################### # setup run for all species mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/consPhyloP/all cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/consPhyloP/all rm -fr wigFix mkdir wigFix gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList # beware overwhelming the cluster with these fast running high I/O jobs para create jobList para try ... check ... push ... etc ... 
para -maxJob=53 push para time > run.time # Completed: 290 of 290 jobs # CPU time in finished jobs: 1042s 17.37m 0.29h 0.01d 0.000 y # IO & Wait Time: 2008s 33.47m 0.56h 0.02d 0.000 y # Average job time: 11s 0.18m 0.00h 0.00d # Longest finished job: 22s 0.37m 0.01h 0.00d # Submission to last job: 84s 1.40m 0.02h 0.00d mkdir downloads time (find ./wigFix -type f | sed -e "s#^./##; s#\.# d #g; s#-# m #;" \ | sort -k1,1 -k3,3n | sed -e "s# d #.#g; s# m #-#g;" | xargs cat \ | gzip -c > downloads/phyloP4way.wigFix.gz) # real 12m14.234s # -rw-rw-r-- 1 1357982519 Apr 21 12:39 phyloP4way.wigFix.gz # check integrity of data with wigToBigWig time (zcat downloads/phyloP4way.wigFix.gz \ | wigToBigWig -verbose=2 stdin /hive/data/genomes/mm10/chrom.sizes \ phyloP4way.bw) > bigWig.log 2>&1 egrep "real|VmPeak" bigWig.log # pid=77432: VmPeak: 12564972 kB # real 17m47.787s bigWigInfo phyloP4way.bw | sed -e 's/^/# /;' # version: 4 # isCompressed: yes # isSwapped: 0 # primaryDataSize: 1,672,367,975 # primaryIndexSize: 63,248,068 # zoomLevels: 10 # chromCount: 37 # basesCovered: 1,155,614,560 # mean: 0.108291 # min: -2.306000 # max: 0.719000 # std: 0.585706 # encode those files into wiggle data time (zcat downloads/phyloP4way.wigFix.gz \ | wigEncode stdin phyloP4way.wig phyloP4way.wib) # Converted stdin, upper limit 0.72, lower limit -2.31 # real 6m41.352s du -hsc *.wi? # 1.1G phyloP4way.wib # 188M phyloP4way.wig # Load gbdb and database with wiggle. 
ln -s `pwd`/phyloP4way.wib /gbdb/mm10/tupChi1Multiz4way/phyloP4way.wib time hgLoadWiggle -pathPrefix=/gbdb/mm10/tupChi1Multiz4way mm10 \ tupChi1PhyloP4way phyloP4way.wig # real 0m22.598s # use to set trackDb.ra entries for wiggle min and max # and verify table is loaded correctly wigTableStats.sh mm10 tupChi1PhyloP4way # db.table min max mean count sumData # mm10.tupChi1PhyloP4way -2.306 0.719 0.108291 1155614560 1.25143e+08 # stdDev viewLimits # 0.585706 viewLimits=-2.306:0.719 # that range is: 0.719+2.306 = 3.025 for hBinSize=0.003025 # Create histogram to get an overview of all the data time hgWiggle -doHistogram \ -hBinSize=0.003025 -hBinCount=1000 -hMinVal=-2.306 -verbose=2 \ -db=mm10 tupChi1PhyloP4way > histogram.data 2>&1 # real 1m4.763s # find the Y range for the 2:5 graph grep -v chrom histogram.data | grep "^[0-9]" | ave -col=5 stdin \ | sed -e 's/^/# /;' # Q1 0.000068 # median 0.000261 # Q3 0.001051 # average 0.001280 # min 0.000000 # max 0.075274 # count 781 # total 1.000007 # standard deviation 0.003947 # find the X range for the 2:5 graph grep "^[0-9]" histogram.data | ave -col=2 stdin \ | sed -e 's/^/# /;' # Q1 -1.558820 # median -0.965925 # Q3 -0.366975 # average -0.917927 # min -2.306000 # max 0.719000 # count 781 # total -716.901065 # standard deviation 0.798757 # create plot of histogram: printf 'set terminal png small x000000 xffffff xc000ff x66ff66 xffff00 x00ffff font \ "/usr/share/fonts/default/Type1/n022004l.pfb" set size 1.4, 0.8 set key left box set grid noxtics set grid ytics set title " Mouse mm10 Histogram tupChi1PhyloP4way track" set xlabel " phyloP4way score" set ylabel " Relative Frequency" set y2label " Cumulative Relative Frequency (CRF)" set y2range [0:1] set y2tics set xtics set xrange [-2.6:0.85] set yrange [0:0.033] plot "histogram.data" using 2:5 title " RelFreq" with impulses, \ "histogram.data" using 2:7 axes x1y2 title " CRF" with lines ' | gnuplot > histo.png display histo.png & # appears to have an odd hole in the data 
just past X=0 ? ############################################################################# # hgPal downloads (DONE - 2017-04-21 - Hiram) # FASTA from 4-way for knownGene, refGene and knownCanonical ssh hgwdev screen -S mm10HgPal mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/pal cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/pal cat ../species.list | tr '[ ]' '[\n]' > order.list # this for loop takes about 2.5 hours on this large count contig assembly export mz=tupChi1Multiz4way export gp=knownGene export db=mm10 export I=0 export D=0 mkdir exonAA exonNuc printf '#!/bin/sh\n' > $gp.jobs time for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` D=`echo $D | awk '{print $1+1}'` dNum=`echo $D | awk '{printf "%03d", int($1/1000)}'` mkdir -p exonNuc/${dNum} > /dev/null mkdir -p exonAA/${dNum} > /dev/null echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/${dNum}/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/${dNum}/$C.exonAA.fa.gz &" if [ $I -gt 16 ]; then echo "date" echo "wait" I=0 fi done >> $gp.jobs # real 0m0.772s echo "date" >> $gp.jobs echo "wait" >> $gp.jobs chmod +x knownGene.jobs time (./$gp.jobs) > $gp.jobs.log 2>&1 & # real 11m18.851s export mz=multiz4way export gp=knownGene time find ./exonAA -type f | grep exonAA.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonAA.fa.gz # real 0m8.492s time find ./exonNuc -type f | grep exonNuc.fa.gz | xargs zcat \ | gzip -c > $gp.$mz.exonNuc.fa.gz # real 0m39.199s # -rw-rw-r-- 1 33908467 Apr 21 18:49 knownGene.multiz4way.exonAA.fa.gz # -rw-rw-r-- 1 55392688 Apr 21 18:49 knownGene.multiz4way.exonNuc.fa.gz export mz=multiz4way export gp=knownGene export db=mm10 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd md5sum *.fa.gz > md5sum.txt ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ln -s 
`pwd`/md5sum.txt $pd/ rm -rf exonAA exonNuc ############################################################################# # construct download files for 4-way (DONE - 2017-04-21 - Hiram) mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz4way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons4way mkdir /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP4way mkdir /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads mkdir multiz4way phastCons4way phyloP4way cd multiz4way time cp -p ../../anno/mm10.4way.maf . # real 0m15.285s # -rw-rw-r-- 1 7580362629 Apr 20 22:27 mm10.4way.maf du -hsc * # 7.1G mm10.4way.maf time gzip *.maf # real 27m2.122s # -rw-rw-r-- 1 2040574809 Apr 20 22:27 mm10.4way.maf.gz du -hsc *.maf.gz # 2.0G mm10.4way.maf.gz ########################################################################### ## create upstream knownGene maf files cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/tupChi1Multiz4way # bash script #!/bin/sh export geneTbl="knownGene" for S in 1000 2000 5000 do echo "making upstream${S}.maf" featureBits mm10 ${geneTbl}:upstream:${S} -fa=/dev/null -bed=stdout \ | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \ | /cluster/bin/$MACHTYPE/mafFrags mm10 tupChi1Multiz4way \ stdin stdout \ -orgs=/hive/data/genomes/mm10/bed/tupChi1Multiz4way/species.list \ | gzip -c > upstream${S}.${geneTbl}.maf.gz echo "done upstream${S}.${geneTbl}.maf.gz" done # real 12m55.050s md5sum *.maf.gz *.nh upstream*.gz README.txt >> md5sum.txt # some other symlinks were already made above # obtain the README.txt from tupChi1/multiz4way and update for this # situation ln -s `pwd`/upstream*.gz `pwd`/README.txt \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/tupChi1Multiz4way grep TREE ../../4d/all.mod | awk '{print $NF}' \ | ~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.nh ~/kent/src/hg/utils/phyloTrees/commonNames.sh mm10.4way.nh \ | 
~/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.commonNames.nh ~/kent/src/hg/utils/phyloTrees/scientificNames.sh mm10.4way.nh \ | $HOME/kent/src/hg/utils/phyloTrees/asciiTree.pl /dev/stdin \ > mm10.4way.scientificNames.nh time md5sum *.nh *.maf.gz > md5sum.txt # real 0m35.144s ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/multiz4way du -hsc *.maf.gz ../../anno/mm10.4way.maf # 3.0G mm10.4way.maf.gz # 13G ../../anno/mm10.4way.maf # obtain the README.txt from tupChi1/multiz4way and update for this # situation ##################################################################### cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/phastCons4way ln -s ../../cons/all/downloads/phastCons4way.wigFix.gz \ ./mm10.phastCons4way.wigFix.gz ln -s ../../cons/all/phastCons4way.bw ./mm10.phastCons4way.bw ln -s ../../cons/all/all.mod ./mm10.phastCons4way.mod time md5sum *.gz *.mod *.bw > md5sum.txt # real 0m20.354s # obtain the README.txt from tupChi1/phastCons4way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phastCons4way ##################################################################### cd /hive/data/genomes/mm10/bed/tupChi1Multiz4way/downloads/phyloP4way ln -s ../../consPhyloP/all/downloads/phyloP4way.wigFix.gz \ ./mm10.phyloP4way.wigFix.gz ln -s ../../consPhyloP/run.phyloP/all.mod mm10.phyloP4way.mod ln -s ../../consPhyloP/all/phyloP4way.bw mm10.phyloP4way.bw time md5sum *.mod *.bw *.gz > md5sum.txt # real 0m12.264s # obtain the README.txt from tupChi1/phyloP4way and update for this # situation ln -s `pwd`/* \ /usr/local/apache/htdocs-hgdownload/goldenPath/mm10/phyloP4way ############################################################################# # wiki page for 4-way (DONE - 2017-04-21 - Hiram) mkdir /hive/users/hiram/bigWays/mm10.4way cd /hive/users/hiram/bigWays echo "mm10" > mm10.4way/ordered.list awk '{print $1}' 
/hive/data/genomes/mm10/bed/tupChi1Multiz4way/4way.distances.txt \ >> mm10.4way/ordered.list # sizeStats.sh catches up the cached measurements required for data # in the tables. They are usually already mostly done, only new # assemblies will have updates. ./sizeStats.sh mm10.4way/ordered.list # dbDb.sh constructs mm10.4way/Mm10_4-way_conservation_alignment.html # may need to add new assembly references to srcReference.list and # urlReference.list ./dbDb.sh mm10 4way # sizeStats.pl constructs mm10.4way/Mm10_4-way_Genome_size_statistics.html # this requires entries in coverage.list for new sequences ./sizeStats.pl mm10 4way # defCheck.pl constructs Mm10_4-way_conservation_lastz_parameters.html ./defCheck.pl mm10 4way # this constructs the html pages in mm10.4way/: # -rw-rw-r-- 1 2800 Apr 21 21:22 Mm10_4-way_conservation_alignment.html # -rw-rw-r-- 1 4199 Apr 21 21:22 Mm10_4-way_Genome_size_statistics.html # -rw-rw-r-- 1 2995 Apr 21 21:22 Mm10_4-way_conservation_lastz_parameters.html # add those pages to the genomewiki. Their page names are the # names of the .html files without the .html: # Mm10_4-way_conservation_alignment # Mm10_4-way_Genome_size_statistics # Mm10_4-way_conservation_lastz_parameters # when you view the first one you enter, it will have links to the # missing two. ############################################################################## # LASTZ Chinese hamster criGri1 (DONE - 2017-05-12 - Hiram) # establish a screen to control this job screen -S mm10criGri1 mkdir /hive/data/genomes/mm10/bed/lastzCriGri1.2017-05-12 cd /hive/data/genomes/mm10/bed/lastzCriGri1.2017-05-12 printf '# mouse vs. 
Chinese hamster BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chinese hamster criGri1 SEQ2_DIR=/hive/data/genomes/criGri1/criGri1.2bit SEQ2_LEN=/hive/data/genomes/criGri1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCriGri1.2017-05-12 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 289m42.628s cat fb.mm10.chainCriGri1Link.txt # 1577848220 bases of 2652783500 (59.479%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 criGri1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 797m59.816s mkdir /hive/data/genomes/criGri1/bed/blastz.mm10.swap cd /hive/data/genomes/criGri1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCriGri1.2017-05-12/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 172m50.552s cat fb.criGri1.chainMm10Link.txt # 1589449878 bases of 2301325917 (69.067%) in intersection time (doRecipBest.pl -workhorse=hgwdev criGri1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 846m34.982s ############################################################################## # ncbiRefSeq composite gene track (DONE - 2017-05-26 - Hiram) mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p5 cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p5 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -bigClusterHub=ku -dbHost=hgwdev \ -stop=download -fileServer=hgwdev -smallClusterHub=ku -workhorse=hgwdev \ refseq vertebrate_mammalian Mus_musculus \ GCF_000001635.25_GRCm38.p5 mm10 
~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -debug -bigClusterHub=ku -dbHost=hgwdev \ -continue=process -stop=process -fileServer=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev refseq vertebrate_mammalian Mus_musculus \ GCF_000001635.25_GRCm38.p5 mm10 ~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ -debug -bigClusterHub=ku -dbHost=hgwdev \ -continue=load -stop=load -fileServer=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev refseq vertebrate_mammalian Mus_musculus \ GCF_000001635.25_GRCm38.p5 mm10 # There are some ncRNAs missing faSize -detailed mm10.rna.fa \ | pslCheck -querySizes=stdin -targetSizes=../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl # checked: 85224 failed: 18 errors: 18 # and joinerCheck is not completely clean: joinerCheck -identifier=ncbiRefSeq -keys -database=mm10 all.joiner Checking keys on database mm10 mm10.ncbiRefSeqLink.id - hits 107479 of 107479 (100.000%) ok mm10.ncbiRefSeqCurated.name - hits 32217 of 32217 (100.000%) ok mm10.ncbiRefSeqPredicted.name - hits 52989 of 52989 (100.000%) ok mm10.ncbiRefSeqPsl.qName - hits 85206 of 85224 (99.979%) Error: 18 of 85224 elements (0.021%) of mm10.ncbiRefSeqPsl.qName are not in key ncbiRefSeq.name line 6045 of all.joiner Example miss: NR_033199.1 mm10.ncbiRefSeqCds.id - hits 76076 of 76076 (100.000%) ok mm10.seqNcbiRefSeq.acc - hits 85205 of 85205 (100.000%) ok # The reason for these difficulties is because some of the original # GFF items were dropped due to unprocessedRoots. The fix is to eliminate # the rest of these unprocessedRoots from PSL loaded file. 
# discovered that it didn't help to add them in, (procedure included below) # then featureBits went bad: joinerCheck -identifier=ncbiRefSeq -keys -database=mm10 all.joiner Checking keys on database mm10 mm10.ncbiRefSeqLink.id - hits 107479 of 107479 (100.000%) ok mm10.ncbiRefSeqCurated.name - hits 32217 of 32217 (100.000%) ok mm10.ncbiRefSeqPredicted.name - hits 52989 of 52989 (100.000%) ok mm10.ncbiRefSeqPsl.qName - hits 85206 of 85224 (99.979%) Error: 18 of 85224 elements (0.021%) of mm10.ncbiRefSeqPsl.qName are not in key ncbiRefSeq.name line 6045 of all.joiner Example miss: NR_033199.1 mm10.ncbiRefSeqCds.id - hits 76076 of 76076 (100.000%) ok mm10.seqNcbiRefSeq.acc - hits 85205 of 85222 (99.980%) Error: 17 of 85222 elements (0.020%) of mm10.seqNcbiRefSeq.acc are not in key ncbiRefSeq.name line 6047 of all.joiner Example miss: NR_015480.1 # eliminate items from PSL file, # compare name lists: hgsql -N -e 'select qName from ncbiRefSeqPsl;' mm10 \ | sort -u > ncbiRefSeqPsl.qName hgsql -N -e 'select name from ncbiRefSeq;' mm10 \ | sort -u > ncbiRefSeq.name wc -l ncbiRefSeqPsl.qName ncbiRefSeq.name # 85220 ncbiRefSeqPsl.qName # 107479 ncbiRefSeq.name comm -12 ncbiRefSeqPsl.qName ncbiRefSeq.name | wc -l # 85203 # need to eliminate 17 items from the PSL track: comm -23 ncbiRefSeqPsl.qName ncbiRefSeq.name | wc -l # 17 comm -23 ncbiRefSeqPsl.qName ncbiRefSeq.name | while read N do hgsql -e "select * from ncbiRefSeqPsl where qName=\"$N\";" mm10 done | wc -l # 35 comm -23 ncbiRefSeqPsl.qName ncbiRefSeq.name | while read N do hgsql -e "delete from ncbiRefSeqPsl where qName=\"$N\";" mm10 done hgsql -N -e 'select qName from ncbiRefSeqPsl;' mm10 \ | sort -u > ncbiRefSeqPsl.clean.qName wc -l ncbiRefSeqPsl.clean.qName ncbiRefSeq.name comm -12 ncbiRefSeqPsl.clean.qName ncbiRefSeq.name | wc -l # 85203 ncbiRefSeqPsl.clean.qName # 107479 ncbiRefSeq.name comm -12 ncbiRefSeqPsl.clean.qName ncbiRefSeq.name | wc -l # 85203 # joinerCheck is now clean joinerCheck -identifier=ncbiRefSeq 
-keys -database=mm10 all.joiner Checking keys on database mm10 mm10.ncbiRefSeqLink.id - hits 107479 of 107479 (100.000%) ok mm10.ncbiRefSeqCurated.name - hits 32217 of 32217 (100.000%) ok mm10.ncbiRefSeqPredicted.name - hits 52989 of 52989 (100.000%) ok mm10.ncbiRefSeqPsl.qName - hits 85206 of 85206 (100.000%) ok mm10.ncbiRefSeqCds.id - hits 76076 of 76076 (100.000%) ok mm10.seqNcbiRefSeq.acc - hits 85205 of 85205 (100.000%) ok # and pslCheck is now clean: faSize -detailed /gbdb/mm10/ncbiRefSeq/seqNcbiRefSeq.rna.fa \ | pslCheck -querySizes=stdin -targetSizes=../../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl # checked: 85206 failed: 0 errors: 0 hgsql -N -e 'select acc,size from seqNcbiRefSeq;' mm10 \ | pslCheck -querySizes=stdin -targetSizes=../../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl # checked: 85206 failed: 0 errors: 0 ### update hgFixed.trackVersion hgsql -e 'update trackVersion set version="2016-12-16" where ix=1706;' hgFixed ### XXX obsolete procedure that does not fix the problem mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p5/missingRna cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p5/missingRna # determine missing sequences ids faSize -detailed ../mm10.rna.fa \ | pslCheck -querySizes=stdin -targetSizes=../../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl > pslCheck.ncbiRefSeq.rna.fa.txt 2>&1 egrep -v "does not exist|errors:" pslCheck.ncbiRefSeq.rna.fa.txt \ | awk '{printf "%s\t%s\n", $5,$4}' | sort -u > idWithRange.seqListFile.tab # fetch RNA sequences from entrez: mkdir ncbiRna cut -f2 idWithRange.seqListFile.tab | sed -e 's#:[0-9]\+-[0-9]\+##;' \ | while read id do wget -O /dev/stdout \ "http://www.ncbi.nlm.nih.gov/sviewer/viewer.fcgi?db=nuccore&dopt=fasta&sendto=on&id=$id" \ | sed -e 's/ Mus musculus .*//;' | sed -e '/^$/d' > ncbiRna/$id.fa done fi cat ../mm10.rna.fa ncbiRna > mm10.seqNcbiRefSeq.rna.fa rm -f /gbdb/mm10/ncbiRefSeq/seqNcbiRefSeq.rna.fa ln -s `pwd`/mm10.seqNcbiRefSeq.rna.fa \ /gbdb/mm10/ncbiRefSeq/seqNcbiRefSeq.rna.fa hgLoadSeq -drop 
-seqTbl=seqNcbiRefSeq -extFileTbl=extNcbiRefSeq mm10 \ /gbdb/mm10/ncbiRefSeq/seqNcbiRefSeq.rna.fa # now have clean pslCheck, verify both with the file and the seq table: faSize -detailed /gbdb/mm10/ncbiRefSeq/seqNcbiRefSeq.rna.fa \ | pslCheck -querySizes=stdin -targetSizes=../../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl # checked: 85224 failed: 0 errors: 0 hgsql -N -e 'select acc,size from seqNcbiRefSeq;' mm10 \ | pslCheck -querySizes=stdin -targetSizes=../../../chrom.sizes \ -db=mm10 ncbiRefSeqPsl # checked: 85224 failed: 0 errors: 0 ############################################################################## 2017-05-31: import of UCSC GENCODE group processing of GENCODE VM14 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM14 pushd /hive/data/genomes/mm10/bed/gencodeVM14 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M14 89 'May 2017' # edit mouse/mm10/trackDb.wgEncode.ra to add new .ra file include make DBS=mm10 # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. [ONLY if it's going to be pushed] # edit all.joiner to add ~/tmp/gencodeVM14.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM14 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all ############################################################################## # LASTZ zebrafish danRer11 (DONE - 2017-06-12 - Chris) # establish a screen to control this job screen -S mm10danRer11 mkdir /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 cd /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 printf '# mouse vs. 
zebrafish BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: zebrafish danRer11 SEQ2_DIR=/hive/data/genomes/danRer11/danRer11.2bit SEQ2_LEN=/hive/data/genomes/danRer11/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 289m42.628s cat fb.mm10.chainDanRer11Link.txt # 36448414 bases of 2652783500 (1.374%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 danRer11 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & mkdir /hive/data/genomes/danRer11/bed/blastz.mm10.swap cd /hive/data/genomes/danRer11/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDanRer11.2017-06-12/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 172m50.552s cat fb.danRer11.chainMm10Link.txt # 45558857 bases of 1674677181 (2.720%) in intersection 1589449878 bases of 2301325917 (69.067%) in intersection time (doRecipBest.pl -workhorse=hgwdev danRer11 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 846m34.982s ############################################################################## # LASTZ Killer whale orcOrc1 (DONE - 2017-06-15 - Hiram) # establish a screen to control this job screen -S mm10orcOrc1 mkdir /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 cd /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 printf '# killer whale vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit 
SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Killer whale orcOrc1 SEQ2_DIR=/hive/data/genomes/orcOrc1/orcOrc1.2bit SEQ2_LEN=/hive/data/genomes/orcOrc1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 192m26.791s cat fb.mm10.chainOrcOrc1Link.txt # 832909116 bases of 2652783500 (31.398%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 orcOrc1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 276m44.875s mkdir /hive/data/genomes/orcOrc1/bed/blastz.mm10.swap cd /hive/data/genomes/orcOrc1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOrcOrc1.2017-06-15/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 72m53.064s cat fb.orcOrc1.chainMm10Link.txt # 809350350 bases of 2249582125 (35.978%) in intersection time (doRecipBest.pl -workhorse=hgwdev orcOrc1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 # real 214m50.810s ############################################################################## # LASTZ Baboon papAnu3 (DONE - 2017-06-21 - Hiram) # establish a screen to control this job screen -S mm10papAnu3 mkdir /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 cd /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 printf '# mouse vs. 
baboon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: baboon papAnu3 SEQ2_DIR=/hive/data/genomes/papAnu3/papAnu3.2bit SEQ2_LEN=/hive/data/genomes/papAnu3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=180 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 474m39.013s cat fb.mm10.chainPapAnu3Link.txt # 910628118 bases of 2652783500 (34.327%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 papAnu3 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 644m20.659s mkdir /hive/data/genomes/papAnu3/bed/blastz.mm10.swap cd /hive/data/genomes/papAnu3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapAnu3.2017-06-21/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 66m35.501s cat fb.papAnu3.chainMm10Link.txt # 897929517 bases of 2893270787 (31.035%) in intersection time (doRecipBest.pl -workhorse=hgwdev papAnu3 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 578m46.893s ############################################################################## # LASTZ pig susScr11 (DONE - 2017-07-31 - Hiram) # establish a screen to control this job screen -S mm10susScr11 mkdir /hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 cd /hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 printf '# mouse vs. 
pig BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: baboon susScr11 SEQ2_DIR=/hive/data/genomes/susScr11/susScr11.2bit SEQ2_LEN=/hive/data/genomes/susScr11/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=1 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 567m0.166s cat fb.mm10.chainSusScr11Link.txt # 731012356 bases of 2652783500 (27.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 susScr11 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 455m39.565s mkdir /hive/data/genomes/susScr11/bed/blastz.mm10.swap cd /hive/data/genomes/susScr11/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzSusScr11.2017-07-31/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 61m6.153s cat fb.susScr11.chainMm10Link.txt # 715277290 bases of 2472073034 (28.934%) in intersection time (doRecipBest.pl -workhorse=hgwdev susScr11 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 358m15.340s ############################################################################## # lastz nile tilapia oreNil3 (DONE - 2017-07-31 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10OreNil3 mkdir /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 cd /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 printf '# Mouse vs. 
nile tilapia BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # QUERY: nile tilapia oreNil3 SEQ2_DIR=/hive/data/genomes/oreNil3/oreNil3.2bit SEQ2_LEN=/hive/data/genomes/oreNil3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31 TMPDIR=/scratch/tmp ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 307m32.926s cat fb.mm10.chainOreNil3Link.txt # 54152663 bases of 2652783500 (2.041%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 oreNil3 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 243m27.139s # and for the swap mkdir /hive/data/genomes/oreNil3/bed/blastz.mm10.swap cd /hive/data/genomes/oreNil3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ /hive/data/genomes/mm10/bed/lastzOreNil3.2017-07-31/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 8m5.590s cat fb.oreNil3.chainMm10Link.txt # 55291586 bases of 1009856516 (5.475%) in intersection time (doRecipBest.pl -workhorse=hgwdev oreNil3 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 230m56.580s ######################################################################### # crispr 10K track (DONE - Hiram - 2017-07-28) # this script was developed during this procedure, thus, the step-wise # procedures: mkdir /hive/data/genomes/mm10/bed/crispr.10K cd 
/hive/data/genomes/mm10/bed/crispr.10K time (~/kent/src/hg/utils/automation/doCrispr.pl \ -stop=guides -buildDir=`pwd` mm10 ensGene) > guides.log 2>&1 # real 78m39.898s # Completed: 99 of 99 jobs # CPU time in finished jobs: 12182s 203.04m 3.38h 0.14d 0.000 y # IO & Wait Time: 1076s 17.93m 0.30h 0.01d 0.000 y # Average job time: 134s 2.23m 0.04h 0.00d # Longest finished job: 181s 3.02m 0.05h 0.00d # Submission to last job: 4567s 76.12m 1.27h 0.05d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=specScores \ -stop=specScores -buildDir=`pwd` mm10 ensGene # Completed: 945820 of 1558824 jobs # CPU time in finished jobs: 352722192s 5878703.20m 97978.39h 4082.43d 11.185 y # IO & Wait Time: 1367298315s 22788305.25m 379805.09h 15825.21d 43.357 y # Average job time: 1819s 30.31m 0.51h 0.02d # Longest finished job: 8656s 144.27m 2.40h 0.10d # Submission to last job: 2172942s 36215.70m 603.60h 25.15d # after ku reboot, finishing: # Completed: 613973 of 613973 jobs # CPU time in finished jobs: 155165030s 2586083.83m 43101.40h 1795.89d 4.920 y # IO & Wait Time: 584008656s 9733477.60m 162224.63h 6759.36d 18.519 y # Average job time: 1204s 20.07m 0.33h 0.01d # Longest finished job: 8978s 149.63m 2.49h 0.10d # Submission to last job: 1137188s 18953.13m 315.89h 13.16d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=effScores \ -stop=effScores -buildDir=`pwd` mm10 ensGene # Completed: 13518 of 13518 jobs # CPU time in finished jobs: 6244711s 104078.52m 1734.64h 72.28d 0.198 y # IO & Wait Time: 32457s 540.95m 9.02h 0.38d 0.001 y # Average job time: 464s 7.74m 0.13h 0.01d # Longest finished job: 2373s 39.55m 0.66h 0.03d # Submission to last job: 15145s 252.42m 4.21h 0.18d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=offTargets \ -stop=offTargets -buildDir=`pwd` mm10 ensGene # Completed: 77942 of 77942 jobs # CPU time in finished jobs: 1397706s 23295.10m 388.25h 16.18d 0.044 y # IO & Wait Time: 313616s 5226.94m 87.12h 3.63d 0.010 y # Average job time: 22s 0.37m 0.01h 
0.00d # Longest finished job: 35s 0.58m 0.01h 0.00d # Submission to last job: 9239s 153.98m 2.57h 0.11d ~/kent/src/hg/utils/automation/doCrispr.pl -continue=load \ -stop=load -buildDir=`pwd` mm10 ensGene # real 235m41.378s ########################################################################## # FIXUP broken files (working - Max and Hiram - 2018-04,05) # Max generated a new specScores.tab, add in the chrM specScores # and make a unique set in a new specScores.tab file cd /hive/data/genomes/mm10/bed/crispr.10K/uniqSpecScores printf "targetSeq\tmitSpecScore\tofftargetCount\ttargetGenomeGeneLocus\n" \ > max.withChrM.specScores.tab grep -h -v targetSeq ../specScores.max.tab ../addChrM/specScores.tab \ | $HOME/bin/x86_64/gnusort -S100G --parallel=32 -u \ >> max.withChrM.specScores.tab # real 1m39.468s # this new file is much larger than before: # -rw-rw-r-- 1 3616703851 Jul 31 2017 withChrM.specScores.tab # -rw-rw-r-- 1 5580638498 May 15 14:55 max.withChrM.specScores.tab # Now generate a new crispr.bed and crispr.bb file mkdir /hive/data/genomes/mm10/bed/crispr.10K/maxBed cd /hive/data/genomes/mm10/bed/crispr.10K/maxBed # setup new inputs: ln -s ../addChrM/withChrM.allGuides.bed withChrM.allGuides.bed ln -s ../uniqSpecScores/max.withChrM.specScores.tab max.withChrM.specScores.tab ln -s ../addChrM/withChrM.effScores.tab withChrM.effScores.tab ln -s ../addChrM/withChrM.offtargets.offsets.tab withChrM.offtargets.offsets.tab ln -s ../addChrM/offTargets ./offTargets time (/cluster/software/bin/python \ /hive/data/outside/crisprTrack/scripts/createBigBed.py mm10 \ withChrM.allGuides.bed max.withChrM.specScores.tab \ withChrM.effScores.tab withChrM.offtargets.offsets.tab) > newBed.log 2>&1 # real 232m5.379s # -rw-rw-r-- 1 27947769791 May 15 17:55 crispr.bed # -rw-rw-r-- 1 6911180170 May 15 18:42 crispr.bb ############################################################################## # LASTZ Gorilla gorGor5 (DONE - 2017-08-04 - Hiram) # establish a screen to control this job 
screen -S mm10gorGor5 mkdir /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 cd /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 printf '# mouse vs. gorilla BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: gorilla gorGor5 SEQ2_DIR=/hive/data/genomes/gorGor5/gorGor5.2bit SEQ2_LEN=/hive/data/genomes/gorGor5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=130 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 170m18.102s cat fb.mm10.chainGorGor5Link.txt # 934147601 bases of 2652783500 (35.214%) in intersection time (doRecipBest.pl -workhorse=hgwdev mm10 gorGor5 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 327m34.879s mkdir /hive/data/genomes/gorGor5/bed/blastz.mm10.swap cd /hive/data/genomes/gorGor5/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGorGor5.2017-08-04/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 72m34.088s cat fb.gorGor5.chainMm10Link.txt # 990002546 bases of 3080431298 (32.138%) in intersection time (doRecipBest.pl -workhorse=hgwdev gorGor5 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 297m3.002s ############################################################################## # refSeqFuncElems NCBI refSeq functional elements, REDONE 2017-11-29 Angie # previously done 2017-08-08 by Chris E mkdir /hive/data/genomes/mm10/bed/refSeqFuncElems.2017-11-29 cd /hive/data/genomes/mm10/bed/refSeqFuncElems.2017-11-29 # NOTE FOR NEXT TIME: instead of using interim GFF, in the future these 
annotations might be # folded into the same main release GFF3 from which the ncbiRefSeq* tables are extracted by # doNcbiRefSeq.pl. wget ftp://ftp.ncbi.nlm.nih.gov/genomes/Mus_musculus/GFF_interim/interim_GRCm38.p6_top_level_2017-09-26.gff3.gz # Get mapping of RefSeq NC_* chromosome accs (and NT_*, NW_*) to mm10 chrom names hgsql mm10 -NBe 'select alias, chrom from chromAlias where source = "refseq" order by alias' \ > refSeqToChrom.tab cut -f 2 refSeqToChrom.tab | sed -e 's/^/^/' > chrom.tab # Use Terence Murphy's list of feature types (and the multi-type attribute regulatory_class) # to identify Functional Elements and swap in mm10 chrom names. # Use subColumn -miss so it doesn't quit when it sees a patch contig that doesn't map to an # mm10 chrom. Use grep -f chrom.tab to filter out patch contig annotations. zcat interim_GRCm38.p6_top_level_2017-09-26.gff3.gz \ | grep -P "(\t(CAAT_signal|GC_rich_promoter_region|TATA_box|enhancer|insulator|locus_control_region|mobile_genetic_element|origin_of_replication|promoter|protein_binding_site|recombination_feature|regulatory_region|repeat_region|sequence_feature|sequence_secondary_structure|silencer|stem_loop)\t|regulatory_class=)" \ | subColumn -miss=/dev/null 1 stdin refSeqToChrom.tab stdout \ | grep -f chrom.tab > funcElems.gff wc -l funcElems.gff #1968 funcElems.gff # Transform GFF to BED+ ~/kent/src/hg/utils/automation/parseRefSeqFuncElems funcElems.gff /dev/stdout \ | sort -k1,1 -k2n,2n > refSeqFuncElems.bed wc -l refSeqFuncElems.bed #1968 refSeqFuncElems.bed # Make bigBed and link from /gbdb bedToBigBed -tab -type=bed9+7 -as=$HOME/kent/src/hg/lib/refSeqFuncElems.as \ refSeqFuncElems.bed /hive/data/genomes/mm10/chrom.sizes refSeqFuncElems.bb rm -f /gbdb/mm10/ncbiRefSeq/refSeqFuncElems.bb ln -s `pwd`/refSeqFuncElems.bb /gbdb/mm10/ncbiRefSeq/ ############################################################################## 2017-09-15: import of UCSC GENCODE group processing of GENCODE VM15 (markd) # not to push to the RR 
# edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM15 pushd /hive/data/genomes/mm10/bed/gencodeVM15 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M15 90 'Aug 2017' # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. [ONLY if it's going to be pushed] # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM15.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM15 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all ############################################################################## # ncbiRefSeq composite (DONE - 2017-11-16 - Angie) # Previously done 2017-09-28; redone 11-16 to include mito "rna" from chrM genomic seq mkdir /hive/data/genomes/mm10/bed/ncbiRefSeq.p5.2017-11-16 cd /hive/data/genomes/mm10/bed/ncbiRefSeq.p5.2017-11-16 time (~/kent/src/hg/utils/automation/doNcbiRefSeq.pl -buildDir=`pwd` \ refseq vertebrate_mammalian Mus_musculus \ GCF_000001635.25_GRCm38.p5 mm10) > do.log 2>&1 & tail -f do.log # *** All done ! 
Elapsed time: 17m36s # real 17m35.651s cat fb.ncbiRefSeq.mm10.txt # 105516336 bases of 2652783500 (3.978%) in intersection ############################################################################## # LASTZ Drill manLeu1 (DONE - 2017-09-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzManLeu1.2017-09-25 cd /hive/data/genomes/mm10/bed/lastzManLeu1.2017-09-25 printf '# drill vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Drill ManLeu1 SEQ2_DIR=/hive/data/genomes/manLeu1/manLeu1.2bit SEQ2_LEN=/hive/data/genomes/manLeu1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzManLeu1.2017-09-25 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10ManLeu1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 233m12.288s cat fb.mm10.chainManLeu1Link.txt # 905203366 bases of 2652783500 (34.123%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 manLeu1) \ > rbest.log 2>&1 & # real 362m58.840s mkdir /hive/data/genomes/manLeu1/bed/blastz.mm10.swap cd /hive/data/genomes/manLeu1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzManLeu1.2017-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 64m55.226s cat fb.manLeu1.chainMm10Link.txt # 895668222 bases of 2721424086 (32.912%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` manLeu1 mm10) \ > rbest.log 2>&1 # real 338m57.422s ############################################################################## # LASTZ Ma's night monkey aotNan1 (DONE - 2017-09-25 - Hiram) mkdir 
/hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 cd /hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 printf '# Ma_s night monkey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Ma_s night monkey AotNan1 SEQ2_DIR=/hive/data/genomes/aotNan1/aotNan1.2bit SEQ2_LEN=/hive/data/genomes/aotNan1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10AotNan1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 400m13.309s cat fb.mm10.chainAotNan1Link.txt # 889500682 bases of 2652783500 (33.531%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 aotNan1) \ > rbest.log 2>&1 & # real 352m12.077s mkdir /hive/data/genomes/aotNan1/bed/blastz.mm10.swap cd /hive/data/genomes/aotNan1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAotNan1.2017-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 68m48.755s cat fb.aotNan1.chainMm10Link.txt # 893851318 bases of 2714439490 (32.929%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` aotNan1 mm10) \ > rbest.log 2>&1 # real 383m10.761s ############################################################################## # LASTZ Hawaiian monk seal neoSch1 (DONE - 2017-09-25 - Hiram) # establish a screen to control this job screen -S mm10neoSch1 mkdir /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 cd /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 printf '# mouse vs. 
Hawaiian monk seal BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LAP=10000 SEQ1_LIMIT=1 # QUERY: Hawaiian monk seal neoSch1 SEQ2_DIR=/hive/data/genomes/neoSch1/neoSch1.2bit SEQ2_LEN=/hive/data/genomes/neoSch1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 324m0.457s cat fb.mm10.chainNeoSch1Link.txt # 827926012 bases of 2652783500 (31.210%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 neoSch1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 307m18.396s cat fb.mm10.chainRBestNeoSch1Link.txt # 788489846 bases of 2652783500 (29.723%) in intersection mkdir /hive/data/genomes/neoSch1/bed/blastz.mm10.swap cd /hive/data/genomes/neoSch1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzNeoSch1.2017-09-25/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 59m27.809s cat fb.neoSch1.chainMm10Link.txt # 804021579 bases of 2400839308 (33.489%) in intersection cat fb.neoSch1.chainSynMm10Link.txt # 776155245 bases of 2400839308 (32.328%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev neoSch1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 242m31.157s cat fb.neoSch1.chainRBestMm10Link.txt # 787537751 bases of 2400839308 (32.803%) in intersection ############################################################################## # LASTZ Sooty mangabey cerAty1 (DONE - 2017-09-27 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 cd 
/hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 printf '# Sooty mangabey vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Sooty mangabey CerAty1 SEQ2_DIR=/hive/data/genomes/cerAty1/cerAty1.2bit SEQ2_LEN=/hive/data/genomes/cerAty1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=30 BASE=/hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CerAty1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 371m15.075s cat fb.mm10.chainCerAty1Link.txt # 917680202 bases of 2652783500 (34.593%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cerAty1) \ > rbest.log 2>&1 & # real 345m49.786s mkdir /hive/data/genomes/cerAty1/bed/blastz.mm10.swap cd /hive/data/genomes/cerAty1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCerAty1.2017-09-27/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 68m6.225s cat fb.cerAty1.chainMm10Link.txt # 903892923 bases of 2787289397 (32.429%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cerAty1 mm10) \ > rbest.log 2>&1 # real 305m14.804s ############################################################################## # LASTZ Coquerel's sifaka to mouse/Mm10 (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 cd /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 printf '# Coquerel_s sifaka vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit 
SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: proCoq1 - Coquerel_s sifaka - Propithecus coquereli SEQ2_DIR=/hive/data/genomes/proCoq1/proCoq1.2bit SEQ2_LEN=/hive/data/genomes/proCoq1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10ProCoq1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 294m43.931s cat fb.mm10.chainProCoq1Link.txt # 882327683 bases of 2652783500 (33.260%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 proCoq1) \ > rbest.log 2>&1 & # real 411m5.774s mkdir /hive/data/genomes/proCoq1/bed/blastz.mm10.swap cd /hive/data/genomes/proCoq1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzProCoq1.2017-09-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 62m48.333s cat fb.proCoq1.chainMm10Link.txt # 863635783 bases of 2083764538 (41.446%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` proCoq1 mm10) \ > rbest.log 2>&1 # real 357m54.198s ############################################################################## # LASTZ White-faced sapajou to mouse/Mm10 (DONE - 2017-09-28 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 cd /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 printf '# White-faced sapajou vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cebCap1 - White-faced sapajou - Cebus capucinus imitator SEQ2_DIR=/hive/data/genomes/cebCap1/cebCap1.2bit 
SEQ2_LEN=/hive/data/genomes/cebCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=20 BASE=/hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CebCap1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 293m40.906s cat fb.mm10.chainCebCap1Link.txt # 882776669 bases of 2652783500 (33.277%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cebCap1) \ > rbest.log 2>&1 & # real 334m0.458s mkdir /hive/data/genomes/cebCap1/bed/blastz.mm10.swap cd /hive/data/genomes/cebCap1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCebCap1.2017-09-28/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 63m12.596s cat fb.cebCap1.chainMm10Link.txt # 871126707 bases of 2610518382 (33.370%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cebCap1 mm10) \ > rbest.log 2>&1 # real 299m3.923s ############################################################################## # LASTZ White-faced sapajou/cebCap1 vs. mouse/Mm10 (DONE - 2017-10-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 cd /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 printf '# White-faced sapajou vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cebCap1 - White-faced sapajou - Cebus capucinus imitator SEQ2_DIR=/hive/data/genomes/cebCap1/cebCap1.2bit SEQ2_LEN=/hive/data/genomes/cebCap1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=18 BASE=/hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10CebCap1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 206m12.413s cat fb.mm10.chainCebCap1Link.txt # 882776669 bases of 2652783500 (33.277%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 cebCap1) \ > rbest.log 2>&1 & # real 331m49.541s mkdir /hive/data/genomes/cebCap1/bed/blastz.mm10.swap cd /hive/data/genomes/cebCap1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCebCap1.2017-10-03/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 63m12.596s cat fb.cebCap1.chainMm10Link.txt # 871126707 bases of 2610518382 (33.370%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` cebCap1 mm10) \ > rbest.log 2>&1 # real 299m3.923s ############################################################################## # LASTZ Sclater's lemur mouse/Mm10 (DONE - 2017-10-04 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 cd /hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 printf '# Sclater_s lemur vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: eulFla1 - Sclater_s lemur - Eulemur flavifrons SEQ2_DIR=/hive/data/genomes/eulFla1/eulFla1.2bit SEQ2_LEN=/hive/data/genomes/eulFla1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=18 BASE=/hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10EulFla1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 144m17.701s cat fb.mm10.chainEulFla1Link.txt # 916687191 bases of 2652783500 (34.556%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 eulFla1) \ > rbest.log 2>&1 & # real 330m53.327s mkdir /hive/data/genomes/eulFla1/bed/blastz.mm10.swap cd /hive/data/genomes/eulFla1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEulFla1.2017-10-04/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 65m26.113s cat fb.eulFla1.chainMm10Link.txt # 887070088 bases of 2094103399 (42.360%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` eulFla1 mm10) \ > rbest.log 2>&1 # real 270m35.579s ############################################################################## # LASTZ Black lemur mouse/Mm10 (DONE - 2017-10-05 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 cd /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 printf '# Black lemur vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: eulMac1 - Black lemur - Eulemur macaco SEQ2_DIR=/hive/data/genomes/eulMac1/eulMac1.2bit SEQ2_LEN=/hive/data/genomes/eulMac1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10EulMac1 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 167m31.736s cat fb.mm10.chainEulMac1Link.txt # 925968814 bases of 2652783500 (34.906%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` mm10 eulMac1) \ > rbest.log 2>&1 & # real 334m49.287s mkdir /hive/data/genomes/eulMac1/bed/blastz.mm10.swap cd /hive/data/genomes/eulMac1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEulMac1.2017-10-05/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 64m52.738s cat fb.eulMac1.chainMm10Link.txt # 895308387 bases of 2101039320 (42.613%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` eulMac1 mm10) \ > rbest.log 2>&1 # real 267m17.552s ############################################################################## 2017-12-17: import of UCSC GENCODE group processing of GENCODE VM16 (markd) # being pushed to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM16 pushd /hive/data/genomes/mm10/bed/gencodeVM16 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check 
to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M16 91 'Dec 2017' # Update mouse/mm10/wgEncodeGencodeSuper.html and update 'Release Notes' # to describe new release. [ONLY if it's going to be pushed] # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM16.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM16 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. ############################################################################## # LASTZ Damara mole rat vs. mouse/Mm10 (DONE - 2018-01-01 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 cd /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 printf '# Damara mole rat vs. 
mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Damara mole rat SEQ2_DIR=/hive/data/genomes/fukDam1/fukDam1.2bit SEQ2_LEN=/hive/data/genomes/fukDam1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 403m29.477s cat fb.mm10.chainFukDam1Link.txt # 803448015 bases of 2652783500 (30.287%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 fukDam1) \ > rbest.log 2>&1 & # real 391m52.435s cat fb.mm10.chainRBestFukDam1Link.txt # 760138280 bases of 2652783500 (28.654%) in intersection mkdir /hive/data/genomes/fukDam1/bed/blastz.mm10.swap cd /hive/data/genomes/fukDam1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzFukDam1.2018-01-01/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 79m46.564s cat fb.fukDam1.chainMm10Link.txt # 803988546 bases of 2285984782 (35.170%) in intersection cat fb.fukDam1.chainSynMm10Link.txt # 741604346 bases of 2285984782 (32.441%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` fukDam1 mm10) \ > rbest.log 2>&1 # real 417m52.847s cat fb.fukDam1.chainRBestMm10Link.txt # 760190877 bases of 2285984782 (33.254%) in intersection ############################################################################## # LASTZ Kangaroo rat vs. 
mouse/Mm10 (DONE - 2018-01-01 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 cd /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 printf '# Kangaroo rat vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Kangaroo rat SEQ2_DIR=/hive/data/genomes/dipOrd2/dipOrd2.2bit SEQ2_LEN=/hive/data/genomes/dipOrd2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 351m30.983s cat fb.mm10.chainDipOrd2Link.txt # 645178768 bases of 2652783500 (24.321%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 dipOrd2) \ > rbest.log 2>&1 & # real 439m56.601s cat fb.mm10.chainRBestDipOrd2Link.txt # 605074450 bases of 2652783500 (22.809%) in intersection mkdir /hive/data/genomes/dipOrd2/bed/blastz.mm10.swap cd /hive/data/genomes/dipOrd2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzDipOrd2.2018-01-01/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 79m46.564s cat fb.dipOrd2.chainMm10Link.txt # 631879699 bases of 2065314047 (30.595%) in intersection cat fb.dipOrd2.chainSynMm10Link.txt # 581661824 bases of 2065314047 (28.163%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` dipOrd2 mm10) \ > rbest.log 2>&1 # real 412m53.879s cat fb.dipOrd2.chainRBestMm10Link.txt # 605056621 bases of 2065314047 (29.296%) in intersection ############################################################################## 
# LASTZ Chinese hamster ovary cell line CHO-K1 criGriChoV2 # (DONE - 2018-01-05 - Hiram) # establish a screen to control this job screen -S mm10criGriChoV2 mkdir /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 cd /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 printf '# Chinese hamster ovary cell line vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: Chinese hamster ovary cell line CHO-K1 criGriChoV2 SEQ2_DIR=/hive/data/genomes/criGriChoV2/criGriChoV2.2bit SEQ2_LEN=/hive/data/genomes/criGriChoV2/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -noDbNameCheck -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 575m28.254s cat fb.mm10.chainCriGriChoV2Link.txt # 1583859515 bases of 2652783500 (59.706%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 criGriChoV2 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 1098m32.629s cat fb.mm10.chainRBestCriGriChoV2Link.txt # 1451345011 bases of 2652783500 (54.710%) in intersection mkdir /hive/data/genomes/criGriChoV2/bed/blastz.mm10.swap cd /hive/data/genomes/criGriChoV2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCriGriChoV2.2018-01-05/DEF \ -noDbNameCheck -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 196m59.409s cat fb.criGriChoV2.chainMm10Link.txt # 1605002950 bases of 2323924942 (69.064%) in intersection cat fb.criGriChoV2.chainSynMm10Link.txt # 1443603212 bases of 2323924942 (62.119%) in intersection time (doRecipBest.pl -load 
-workhorse=hgwdev criGriChoV2 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 1187m10.728s cat fb.criGriChoV2.chainRBestMm10Link.txt # 1452526554 bases of 2323924942 (62.503%) in intersection ############################################################################## # LASTZ Baboon papAnu4 (DONE - 2018-01-08 - Hiram) # establish a screen to control this job screen -S mm10papAnu4 mkdir /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 cd /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 printf '# mouse vs. baboon BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=40 # QUERY: baboon papAnu4 SEQ2_DIR=/hive/data/genomes/papAnu4/papAnu4.2bit SEQ2_LEN=/hive/data/genomes/papAnu4/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=180 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 783m49.438s cat fb.mm10.chainPapAnu4Link.txt # 919405716 bases of 2652783500 (34.658%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 papAnu4 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 582m15.183s cat fb.mm10.chainRBestPapAnu4Link.txt # 875366631 bases of 2652783500 (32.998%) in intersection mkdir /hive/data/genomes/papAnu4/bed/blastz.mm10.swap cd /hive/data/genomes/papAnu4/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPapAnu4.2018-01-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 80m51.648s cat fb.papAnu4.chainMm10Link.txt # 907806517 bases of 2937004939 (30.909%) in intersection cat 
fb.papAnu4.chainSynMm10Link.txt # 866781916 bases of 2937004939 (29.512%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev papAnu4 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 521m7.590s cat fb.papAnu4.chainRBestMm10Link.txt # 874097827 bases of 2937004939 (29.762%) in intersection ############################################################################## # LASTZ guinea pig cavApe1 (DONE - 2018-01-08 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10CavApe1 mkdir /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 cd /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 printf '# guinea pig vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: guinea pig CavApe1 SEQ2_DIR=/hive/data/genomes/cavApe1/cavApe1.2bit SEQ2_LEN=/hive/data/genomes/cavApe1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=10 BASE=/hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 514m28.099s cat fb.mm10.chainCavApe1Link.txt # 424603451 bases of 2652783500 (16.006%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 cavApe1 \ -buildDir=`pwd`) > rbest.log 2>&1 & # real 481m13.804s cat fb.mm10.chainRBestCavApe1Link.txt # 394844156 bases of 2652783500 (14.884%) in intersection # and for the swap mkdir /hive/data/genomes/cavApe1/bed/blastz.mm10.swap cd /hive/data/genomes/cavApe1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCavApe1.2018-01-08/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # 
real 38m53.866s cat fb.cavApe1.chainMm10Link.txt # 420563721 bases of 1749140834 (24.044%) in intersection cat fb.cavApe1.chainSynMm10Link.txt # 364825271 bases of 1749140834 (20.857%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev cavApe1 mm10 \ -buildDir=`pwd`) > rbest.log 2>&1 & # real 438m45.544s cat fb.cavApe1.chainRBestMm10Link.txt # 395976886 bases of 1749140834 (22.638%) in intersection ############################################################################## # lastz Medium Ground Finch ficAlb1 (DONE - 2018-01-09 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10 mkdir /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 cd /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 printf '# Mouse vs. Collared flycatcher BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Collard flycatcher/FicAlb1 SEQ2_DIR=/hive/data/genomes/ficAlb1/ficAlb1.2bit SEQ2_LEN=/hive/data/genomes/ficAlb1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 167m34.472s cat fb.mm10.chainFicAlb1Link.txt # 98177848 bases of 2652783500 (3.701%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev mm10 ficAlb1 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 246m1.019s cat fb.mm10.chainRBestFicAlb1Link.txt # 76370866 bases of 2652783500 (2.879%) in intersection # and for the swap mkdir /hive/data/genomes/ficAlb1/bed/blastz.mm10.swap cd /hive/data/genomes/ficAlb1/bed/blastz.mm10.swap time (doBlastzChainNet.pl 
-verbose=2 \ /hive/data/genomes/mm10/bed/lastzFicAlb1.2018-01-09/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 8m5.637s cat fb.ficAlb1.chainMm10Link.txt # 85384367 bases of 1102325870 (7.746%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev ficAlb1 mm10 \ -buildDir=`pwd` -workhorse=hgwdev) > rbest.log 2>&1 & # real 209m22.159s cat fb.ficAlb1.chainRBestMm10Link.txt # 76183235 bases of 1102325870 (6.911%) in intersection ########################################################################## # lastz Lamprey petMar3 (DONE - 2018-01-25 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S petMar3 mkdir /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 cd /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 printf '# Mouse vs. Lamprey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Lamprey PetMar3 SEQ2_DIR=/hive/data/genomes/petMar3/petMar3.2bit SEQ2_LEN=/hive/data/genomes/petMar3/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=60 BASE=/hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 119m5.528s cat fb.mm10.chainPetMar3Link.txt # 36835173 bases of 2652783500 (1.389%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 petMar3) \ > rbest.log 2>&1 & # real 201m40.789s cat 
fb.mm10.chainRBestPetMar3Link.txt # 21623456 bases of 2652783500 (0.815%) in intersection # and for the swap mkdir /hive/data/genomes/petMar3/bed/blastz.mm10.swap cd /hive/data/genomes/petMar3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPetMar3.2018-01-25/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 7m57.582s # real 7m2.754s cat fb.petMar3.chainMm10Link.txt # 39217857 bases of 1043181598 (3.759%) in intersection cat fb.petMar3.chainSynMm10Link.txt # 1381239 bases of 1043181598 (0.132%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` petMar3 mm10) \ > rbest.log 2>&1 & # real 206m59.727s cat fb.petMar3.chainRBestMm10Link.txt # 21335101 bases of 1043181598 (2.045%) in intersection ######################################################################### 2018-03-08: update UCSC GENCODE VM16 to include protein id (for VAI) and fix PAR tag cd /hive/data/genomes/mm10/bed/gencodeVM16 # save existing data mkdir -p prev/pre-proteinId mv tables/wgEncodeGencodeAttrsVM16.tab tables/wgEncodeGencodeTagVM16.tab prev/pre-proteinId/ mv loaded/wgEncodeGencodeAttrsVM16.tab.loaded loaded/wgEncodeGencodeTagVM16.tab.loaded prev/pre-proteinId/ mv data/gencode.tsv prev/pre-proteinId/ cp -p data/gencode.vM16.transcriptionSupportLevel.tab prev/pre-proteinId/ # edit gencodeLoad.mk to set mm10 as target # get gencode.tsv without rebuild TSL file or loading tables that don't change ~markd/compbio/ccds/ccds2/output/bin/x86_64/opt/gencodeGxfToAttrs --keepGoing data/release_M16/gencode.vM16.chr_patch_hapl_scaff.annotation.gtf.gz data/gencode.tsv make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk loaded/wgEncodeGencodeAttrsVM16.tab.loaded loaded/wgEncodeGencodeTagVM16.tab.loaded # 2018-03-19: update search to include protein id cd kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs 
mm10 M16 91 'Dec 2017' ######################################################################### # lastz garter snake/thaSir1 (DONE - 2018-03-13 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10ThaSir1 mkdir /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 cd /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 # note: first time with this new 1.04.00 version of lastz printf '# Mouse vs. garter snake BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_M=50 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: garter snake thaSir1 SEQ2_DIR=/hive/data/genomes/thaSir1/thaSir1.2bit SEQ2_LEN=/hive/data/genomes/thaSir1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LAP=0 SEQ2_LIMIT=15 BASE=/hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13 TMPDIR=/dev/shm ' > DEF # adjust the SEQ2_LIMIT with -stop=partition to get a reasonable # number of jobs, 50,000 to something under 100,000 # when not present, SEQ2_LIMIT is a default 100 time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 & # real 112m40.572s cat fb.mm10.chainThaSir1Link.txt # 78464036 bases of 2652783500 (2.958%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 thaSir1) \ > rbest.log 2>&1 & # real 266m17.520s cat fb.mm10.chainRBestThaSir1Link.txt # 54099233 bases of 2652783500 (2.039%) in intersection # and for the swap mkdir /hive/data/genomes/thaSir1/bed/blastz.mm10.swap cd /hive/data/genomes/thaSir1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 -syntenicNet \ /hive/data/genomes/mm10/bed/lastzThaSir1.2018-03-13/DEF \ -workhorse=hgwdev 
-smallClusterHub=ku -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 & # real 11m28.892s cat fb.thaSir1.chainMm10Link.txt # 63814138 bases of 1122701795 (5.684%) in intersection cat fb.thaSir1.chainSynMm10Link.txt # 20728394 bases of 1122701795 (1.846%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` thaSir1 mm10) \ > rbest.log 2>&1 & # real 234m31.934s cat fb.thaSir1.chainRBestMm10Link.txt # 54778217 bases of 1122701795 (4.879%) in intersection ############################################################################## # LASTZ cat felCat9 (DONE - 2018-03-14 - Hiram) # establish a screen to control this job with a name to indicate what it is screen -S mm10FelCat9 mkdir /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 cd /hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 printf '# cat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: cat FelCat9 SEQ2_DIR=/hive/data/genomes/felCat9/felCat9.2bit SEQ2_LEN=/hive/data/genomes/felCat9/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 395m23.091s cat fb.mm10.chainFelCat9Link.txt # 801023018 bases of 2652783500 (30.196%) in intersection time (doRecipBest.pl -load mm10 felCat9 -buildDir=`pwd` \ -workhorse=hgwdev) > rbest.log 2>&1 & # real 486m55.606s cat fb.mm10.chainRBestFelCat9Link.txt # 761411281 bases of 2652783500 (28.702%) in intersection mkdir /hive/data/genomes/felCat9/bed/blastz.mm10.swap cd /hive/data/genomes/felCat9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ 
/hive/data/genomes/mm10/bed/lastzFelCat9.2018-03-14/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 & # real 70m51.860s cat fb.felCat9.chainMm10Link.txt # 779862191 bases of 2476453204 (31.491%) in intersection cat fb.felCat9.chainSynMm10Link.txt # 754481540 bases of 2476453204 (30.466%) in intersection time (doRecipBest.pl -load felCat9 mm10 -buildDir=`pwd` \ -workhorse=hgwdev) > rbest.log 2>&1 & # real 375m4.937s cat fb.felCat9.chainRBestMm10Link.txt # 760753851 bases of 2476453204 (30.719%) in intersection ############################################################################## # LASTZ Beaver casCan1 vs. mouse/Mm10 (DONE - 2018-03-19 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 cd /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 # note: first time with this new 1.04.00 version of lastz printf '# Beaver vs. mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LIMIT=50 SEQ1_LAP=10000 # QUERY: Beaver SEQ2_DIR=/hive/data/genomes/casCan1/casCan1.2bit SEQ2_LEN=/hive/data/genomes/casCan1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=50 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job time (doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 & # real 455m47.982s cat fb.mm10.chainCasCan1Link.txt # 969752969 bases of 2652783500 (36.556%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 casCan1) \ > rbest.log 2>&1 & # real 981m12.451s cat fb.mm10.chainRBestCasCan1Link.txt # 912108399 bases of 2652783500 (34.383%) in intersection mkdir /hive/data/genomes/casCan1/bed/blastz.mm10.swap 
cd /hive/data/genomes/casCan1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCasCan1.2018-03-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > swap.log 2>&1 # real 100m12.450s cat fb.casCan1.chainMm10Link.txt # 1027587643 bases of 2517974654 (40.810%) in intersection cat fb.casCan1.chainSynMm10Link.txt # 876969229 bases of 2517974654 (34.828%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` casCan1 mm10) \ > rbest.log 2>&1 # real 1280m7.127s cat fb.casCan1.chainRBestMm10Link.txt # 911437520 bases of 2517974654 (36.197%) in intersection ############################################################################## # LASTZ mouse/mm10 Chimp/panTro6 - (DONE - 2018-03-24 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 cd /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 printf '# mouse vs chimp BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=10 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: chimp panTro6 SEQ2_DIR=/hive/data/genomes/panTro6/panTro6.2bit SEQ2_LEN=/hive/data/genomes/panTro6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=40 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 347m21.874s cat fb.mm10.chainPanTro6Link.txt # 935720585 bases of 2652783500 (35.273%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ mm10 panTro6) > rbest.log 2>&1 & # real 565m15.871s cat 
fb.mm10.chainRBestPanTro6Link.txt # 891553355 bases of 2652783500 (33.608%) in intersection # and for the swap: mkdir /hive/data/genomes/panTro6/bed/blastz.mm10.swap cd /hive/data/genomes/panTro6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPanTro6.2018-03-24/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 78m57.631s cat fb.panTro6.chainMm10Link.txt # 934668641 bases of 3018592990 (30.964%) in intersection cat fb.panTro6.chainSynMm10Link.txt # 889944141 bases of 3018592990 (29.482%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ panTro6 mm10) > rbest.log 2>&1 & # real 504m47.811s cat fb.panTro6.chainRBestMm10Link.txt # 890065520 bases of 3018592990 (29.486%) in intersection ############################################################################## # LASTZ mouse/mm10 Orangutan/ponAbe3 - (DONE - 2018-03-26 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 cd /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 printf '# mouse vs orangutan BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=5 # the default matrix is: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # QUERY: orangutan ponAbe3 SEQ2_DIR=/hive/data/genomes/ponAbe3/ponAbe3.2bit SEQ2_LEN=/hive/data/genomes/ponAbe3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=20 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 461m46.426s cat fb.mm10.chainPonAbe3Link.txt # 936755064 bases of 
2652783500 (35.312%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ mm10 ponAbe3) > rbest.log 2>&1 & # real 554m41.676s cat fb.mm10.chainRBestPonAbe3Link.txt # 892145302 bases of 2652783500 (33.631%) in intersection # and for the swap: mkdir /hive/data/genomes/ponAbe3/bed/blastz.mm10.swap cd /hive/data/genomes/ponAbe3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzPonAbe3.2018-03-26/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 78m29.160s cat fb.ponAbe3.chainMm10Link.txt # 929970181 bases of 3043444524 (30.557%) in intersection cat fb.ponAbe3.chainSynMm10Link.txt # 890801507 bases of 3043444524 (29.270%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ ponAbe3 mm10) > rbest.log 2>&1 & # real 496m49.168s cat fb.ponAbe3.chainRBestMm10Link.txt # 890774155 bases of 3043444524 (29.269%) in intersection ######################################################################### # LASTZ mouse/mm10 sheep/oviAri4 - (DONE - 2018-04-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 cd /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 printf '# mouse vs sheep BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: sheep oviAri4 SEQ2_DIR=/hive/data/genomes/oviAri4/oviAri4.2bit SEQ2_LEN=/hive/data/genomes/oviAri4/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 
-chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # Command failed: # ssh -x -o 'StrictHostKeyChecking = no' -o 'BatchMode = yes' hgwdev \ # nice /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25/axtChain/netSynteny.csh # # real 237m24.916s # used the wrong version of doBlastzChainNet.pl which failed at the # syntenic net step. Clean up and re-try with the fixed up script: rm mm10.oviAri4.syn.chain.gz rm mm10.oviAri4.syn.net.gz time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -continue=syntenicNet \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) >> do.log 2>&1 & # real 18m40.051s cat fb.mm10.chainOviAri4Link.txt # 693504453 bases of 2652783500 (26.143%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 oviAri4) > rbest.log 2>&1 & # real 485m29.546s # and for the swap: mkdir /hive/data/genomes/oviAri4/bed/blastz.mm10.swap cd /hive/data/genomes/oviAri4/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzOviAri4.2018-04-25/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 63m12.935s cat fb.oviAri4.chainMm10Link.txt # 680117358 bases of 2587515673 (26.285%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` oviAri4 mm10) > rbest.log 2>&1 & # real 437m1.637s ######################################################################### # RepeatMasker Visualization track update (TBD - 2018-05-15 - ChrisL) screen -S rmskJoined.2018-05-15 # if this is an update to an already existing rmsk build, re-run # masking with new libraries. 
Otherwise skip to rmskJoined below mkdir /hive/data/genomes/mm10/bed/repeatMasker.2018-05-15 cd /hive/data/genomes/mm10/bed/repeatMasker.2018-05-15 time (doRepeatMasker.pl -stop=mask -bigClusterHub=ku \ -workhorse=hgwdev -dbHost=hgwdev -buildDir=`pwd` mm10) > mask.log 2>&1 & # real 705m12.538s # fill in grep to get rid of the missing id items (not necessary this run): # grep -v "" \ # mm10.fa.out > clean.mm10.fa.out # mv clean.mm10.fa.out mm10.fa.out # finish the last step of doCat.csh, if necessary: # /cluster/bin/scripts/extractNestedRepeats.pl mm10.fa.out | sort -k1,1 -k2,2n > mm10.nestedRepeats.bed # rmskJoinedCurrent steps mkdir /hive/data/genomes/mm10/bed/rmskJoined.2018-05-15 cd /hive/data/genomes/mm10/bed/rmskJoined.2018-05-15 ln -s ../repeatMasker.2018-05-15/mm10.sorted.fa.out . ln -s ../repeatMasker.2018-05-15/mm10.fa.align . time (/scratch/data/RepeatMasker/util/rmToUCSCTables.pl \ -out mm10.sorted.fa.out -align mm10.fa.align.gz) > rerun.log 2>&1 & # real 102m53.576s # confirm the counts are different from the previous version: # wc -l ../rmskJoined/mm10.fa.align.tsv ../rmskJoined/mm10.sorted.fa.join.bed ../rmskJoined/mm10.sorted.fa.out.tsv 5918456 ../rmskJoined/mm10.fa.align.tsv 4657599 ../rmskJoined/mm10.sorted.fa.join.bed 5249545 ../rmskJoined/mm10.sorted.fa.out.tsv 15825600 total # wc -l *.tsv 5888031 mm10.fa.align.tsv 4646880 mm10.sorted.fa.join.tsv 5235053 mm10.sorted.fa.out.tsv 15769964 total # sub rmskJoinedBaseline for rmskJoinedCurrent if this is the first version for this assembly hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/rmskJoined.sql \ -renameSqlTable -verbose=4 -tab \ -type=bed4+9 -as=$HOME/kent/src/hg/lib/rmskJoined.as mm10 \ rmskJoinedCurrent mm10.sorted.fa.join.tsv \ > loadJoined.log 2>&1 # Error line 1028733 of mm10.sorted.fa.join.tsv: # chromStart after chromEnd (21000277 > 21000266) # is it the only one ? 
awk -F'\t' '{if ($2 > $3) sum+=1} END {print sum}' mm10.sorted.fa.join.tsv # 1 # remove it and run above hgLoadBed again: awk -F'\t' '{if ($2 < $3) print;}' mm10.sorted.fa.join.tsv > mm10.sorted.fa.join.cleaned mv mm10.sorted.fa.join.cleaned mm10.sorted.fa.join.tsv # sub rmskAlignBaseline for rmskAlignCurrent if this is the first version for this assembly hgLoadSqlTab mm10 rmskAlignCurrent \ /cluster/home/chmalee/kent/src/hg/lib/rmskAlign.sql \ mm10.fa.align.tsv > loadAlign.log 2>&1 # sub rmskOutBaseline for rmskOutCurrent if this is the first version for this assembly hgLoadOutJoined -verbose=2 -table=rmskOutCurrent mm10 mm10.sorted.fa.out > loadOut.log 2>&1 featureBits -countGaps mm10 rmskJoinedBaseline # 2243948952 bases of 2730871774 (82.170%) in intersection featureBits -countGaps mm10 rmskJoinedCurrent # 2249729653 bases of 2730871774 (82.381%) in intersection ######################################################################### # LASTZ mouse/mm10 horse/equCab3 - (DONE - 2018-05-25 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 cd /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 printf '# mouse vs horse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: horse equCab3 SEQ2_DIR=/hive/data/genomes/equCab3/equCab3.2bit SEQ2_LEN=/hive/data/genomes/equCab3/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 605m50.368s cat 
fb.mm10.chainEquCab3Link.txt # 921489718 bases of 2652783500 (34.737%) in intersection cat fb.mm10.chainSynEquCab3Link.txt # 876836391 bases of 2652783500 (33.053%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 equCab3) > rbest.log 2>&1 & # real 398m20.685s cat fb.mm10.chainRBest.EquCab3.txt # 876785778 bases of 2652783500 (33.052%) in intersection # and for the swap: mkdir /hive/data/genomes/equCab3/bed/blastz.mm10.swap cd /hive/data/genomes/equCab3/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzEquCab3.2018-05-25/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 83m14.250s cat fb.equCab3.chainMm10Link.txt # 930516778 bases of 2497530654 (37.257%) in intersection cat fb.equCab3.chainSynMm10Link.txt # 897238830 bases of 2497530654 (35.925%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` equCab3 mm10) > rbest.log 2>&1 & # real 318m40.520s cat fb.equCab3.chainRBest.Mm10.txt # 875954606 bases of 2497530654 (35.073%) in intersection ######################################################################### # LASTZ mouse/mm10 Minke whale/balAcu1 - (DONE - 2018-06-13 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 cd /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 printf '# mouse vs Minke whale BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: Minke whale balAcu1 SEQ2_DIR=/hive/data/genomes/balAcu1/balAcu1.2bit SEQ2_LEN=/hive/data/genomes/balAcu1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=40 SEQ2_LAP=0 
BASE=/hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 190m45.265s cat fb.mm10.chainBalAcu1Link.txt # 851790136 bases of 2652783500 (32.109%) in intersection cat fb.mm10.chainSynBalAcu1Link.txt # 806407823 bases of 2652783500 (30.399%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 balAcu1) > rbest.log 2>&1 & # real 287m58.329s cat fb.mm10.chainRBest.BalAcu1.txt # 811435554 bases of 2652783500 (30.588%) in intersection # and for the swap: mkdir /hive/data/genomes/balAcu1/bed/blastz.mm10.swap cd /hive/data/genomes/balAcu1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBalAcu1.2018-06-13/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 67m0.560s cat fb.balAcu1.chainMm10Link.txt # 832845143 bases of 2286657046 (36.422%) in intersection cat fb.balAcu1.chainSynMm10Link.txt # 802734600 bases of 2286657046 (35.105%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` balAcu1 mm10) > rbest.log 2>&1 & # real 241m51.110s cat fb.balAcu1.chainRBest.Mm10.txt # 810427625 bases of 2286657046 (35.442%) in intersection ######################################################################### 2018-07-01: import of UCSC GENCODE group processing of GENCODE VM17 (markd) # not being push to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM17 pushd /hive/data/genomes/mm10/bed/gencodeVM17 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # 
generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M17 92 'Mar 2018' ## only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM17.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM17 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. ############################################################################## # LASTZ mouse/mm10 Axolotl/ambMex1 - (DONE - 2018-07-09 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 cd /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 printf '# mouse vs Axolotl BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: Axolotl ambMex1 SEQ2_DIR=/hive/data/genomes/ambMex1/ambMex1.2bit SEQ2_LEN=/hive/data/genomes/ambMex1/chrom.sizes SEQ2_CHUNK=80000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 881m7.910s cat fb.mm10.chainAmbMex1Link.txt # 52143617 bases of 2652783500 (1.966%) in intersection cat fb.mm10.chainSynAmbMex1Link.txt # 2686570 bases of 2652783500 (0.101%) in intersection time (doRecipBest.pl -load 
-workhorse=hgwdev -buildDir=`pwd` mm10 ambMex1) > rbest.log 2>&1 & # real 478m39.331s # something odd went haywire at the download step time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` mm10 ambMex1) > download.log 2>&1 & # real 1m42.883s cat fb.mm10.chainRBest.AmbMex1.txt # 36938030 bases of 2652783500 (1.392%) in intersection # and for the swap: mkdir /hive/data/genomes/ambMex1/bed/blastz.mm10.swap cd /hive/data/genomes/ambMex1/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzAmbMex1.2018-07-09/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 39m28.757s cat fb.ambMex1.chainMm10Link.txt # 87124587 bases of 28366694468 (0.307%) in intersection cat fb.ambMex1.chainSynMm10Link.txt # 2893381 bases of 28366694468 (0.010%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > rbest.log 2>&1 & # real 568m10.621s # something odd went haywire at the download step time (doRecipBest.pl -load -continue=download -workhorse=hgwdev -buildDir=`pwd` ambMex1 mm10) > download.log 2>&1 & # real 3m16.404s cat fb.ambMex1.chainRBest.Mm10.txt # 38584422 bases of 28366694468 (0.136%) in intersection ############################################################################## 2018-08-03: import of UCSC GENCODE group processing of GENCODE VM18 (markd) # being push to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM18 pushd /hive/data/genomes/mm10/bed/gencodeVM18 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M18 93 'July 2018' ## 
only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM18.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM18 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ############################################################################## # LASTZ mouse/mm10 vs. chicken/galGal6 - (DONE - 2018-10-12 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 cd /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 printf "# Mouse vs. chicken BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: chicken galGal6 SEQ2_DIR=/hive/data/genomes/galGal6/galGal6.2bit SEQ2_LEN=/hive/data/genomes/galGal6/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 84m14.188s cat fb.mm10.chainGalGal6Link.txt # 101151132 bases of 2652783500 (3.813%) in intersection cat fb.mm10.chainSynGalGal6Link.txt # 70707720 bases of 2652783500 (2.665%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 galGal6) > rbest.log 2>&1 & # real 116m19.316s cat 
fb.mm10.chainRBest.GalGal6.txt # 79649474 bases of 2652783500 (3.002%) in intersection # and for the swap: mkdir /hive/data/genomes/galGal6/bed/blastz.mm10.swap cd /hive/data/genomes/galGal6/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzGalGal6.2018-10-12/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 6m41.043s cat fb.galGal6.chainMm10Link.txt # 88539346 bases of 1055588482 (8.388%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` galGal6 mm10) > rbest.log 2>&1 & # real 94m11.007s cat fb.galGal6.chainRBest.Mm10.txt # 79474812 bases of 1055588482 (7.529%) in intersection ######################################################################### # LASTZ mouse/mm10 Cow/bosTau9 - (DONE - 2018-11-08 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 cd /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 printf '# mouse vs cow BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_M=254 # default BLASTZ_Q score matrix: # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # TARGET: mouse mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=40000000 SEQ1_LIMIT=2 SEQ1_LAP=10000 # QUERY: cow bosTau9 SEQ2_DIR=/hive/data/genomes/bosTau9/bosTau9.2bit SEQ2_LEN=/hive/data/genomes/bosTau9/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LIMIT=10 SEQ2_LAP=0 BASE=/hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08 TMPDIR=/dev/shm ' > DEF # << happy emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 & # real 211m46.258s cat fb.mm10.chainBosTau9Link.txt # 703580224 bases of 2652783500 (26.522%) in intersection cat fb.mm10.chainSynBosTau9Link.txt # 659095603 bases 
of 2652783500 (24.845%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 bosTau9) > rbest.log 2>&1 & # real 214m24.819s cat fb.mm10.chainRBest.BosTau9.txt # 667950653 bases of 2652783500 (25.179%) in intersection # and for the swap: mkdir /hive/data/genomes/bosTau9/bed/blastz.mm10.swap cd /hive/data/genomes/bosTau9/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzBosTau9.2018-11-08/DEF \ -swap -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 & # real 41m22.962s cat fb.bosTau9.chainMm10Link.txt # 695248613 bases of 2715853792 (25.600%) in intersection cat fb.bosTau9.chainSynMm10Link.txt # 660591041 bases of 2715853792 (24.324%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` bosTau9 mm10) > rbest.log 2>&1 & # real 204m36.465s cat fb.bosTau9.chainRBest.Mm10.txt # 667305554 bases of 2715853792 (24.571%) in intersection ######################################################################### 2018-11-10: import of UCSC GENCODE group processing of GENCODE VM19 (markd) # not being push to the RR # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM19 pushd /hive/data/genomes/mm10/bed/gencodeVM19 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M19 94 'Oct 2018' ## only if being pushed to RR: (skipped) # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. 
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: (SKIPPED) # edit all.joiner to add ~/tmp/gencodeVM19.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM19 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ############################################################################## # LASTZ mouse/mm10 vs. Japanese quail/cotJap2 - (DONE - 2018-11-15 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 cd /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 printf "# Mouse vs. Japanese quail BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz BLASTZ_Y=3400 BLASTZ_L=6000 BLASTZ_K=2200 BLASTZ_Q=/scratch/data/blastz/HoxD55.q # A C G T # 91 -90 -25 -100 # -90 100 -100 -25 # -25 -100 100 -90 # -100 -25 -90 91 # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Japanese quail cotJap2 SEQ2_DIR=/hive/data/genomes/cotJap2/cotJap2.2bit SEQ2_LEN=/hive/data/genomes/cotJap2/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15 TMPDIR=/dev/shm " > DEF time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > do.log 2>&1 # real 82m16.032s cat fb.mm10.chainCotJap2Link.txt # 97251364 bases of 2652783500 (3.666%) in intersection cat fb.mm10.chainSynCotJap2Link.txt # 67653818 bases of 2652783500 (2.550%) in intersection time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` mm10 cotJap2) > rbest.log 2>&1 & # real 104m58.905s cat fb.mm10.chainRBest.CotJap2.txt # 76298136 bases of 2652783500 (2.876%) in intersection # and for the swap: mkdir 
/hive/data/genomes/cotJap2/bed/blastz.mm10.swap cd /hive/data/genomes/cotJap2/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzCotJap2.2018-11-15/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 6m37.873s cat fb.cotJap2.chainMm10Link.txt # 82592561 bases of 917263224 (9.004%) in intersection cat fb.cotJap2.chainSynMm10Link.txt # 66583746 bases of 917263224 (7.259%) in intersection # mistakenly started this on ku, it failed at the download step since # it could not see the /gbdb/mm10/ hierarchy: time (doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` cotJap2 mm10) > rbest.log 2>&1 & # real 79m48.767s # continue on hgwdev time (doRecipBest.pl -load -workhorse=hgwdev -continue=download -buildDir=`pwd` cotJap2 mm10) > rbest.download.log 2>&1 & # real 1m40.970s cat fb.cotJap2.chainRBest.Mm10.txt # 76078816 bases of 917263224 (8.294%) in intersection ############################################################################## 2018-11-30: import of UCSC GENCODE group processing of GENCODE VM20 prerelease (markd) # This is a prerelease for testing and is *not* to be pushed until the final release. # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM20 pushd /hive/data/genomes/mm10/bed/gencodeVM20 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M20 95 'Dec 2018' ## only if being pushed to RR: (skipped) # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. 
# edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: (SKIPPED) # edit all.joiner to add ~/tmp/gencodeVM20.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM20 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 ######################################################################### 2019-01-17: tabula muris track (max) # download 7Tb of data from Amazon, using token, CZI pays (got token by email, via Angela Pisco, James Webber) export AWS_ACCESS_KEY_ID=xxxxx export AWS_SESSION_TOKEN=xxxxx aws s3 sync s3://czbiohub-tabula-muris/tabula_muris_bam_files/ . --delete cd ~/projects/czi/cbData/ucsc/tabulaMuris csvToTab TM_facs_metadata.csv > TM_facs_metadata.tsv cat TM_facs_metadata.csv | tr '.' '-' | csvToTab > TM_facs_metadata.fix.tsv # this is not necessary anymore, the new mm10.sizes file comes with cbTrackHub and # includes the ERCCs hgsql -N -e 'select alias,chrom from chromAlias;' mm10 > mm10.chromAlias.tab faSize ERCC92.fa -detailed > ERCC.sizes cat /hive/data/genomes/mm10/chrom.sizes ERCC.sizes > mm10ercc.sizes # the next one requires single cell browser, from https://github.com/maximilianh/cellBrowser cbTrackHub mm10 bam/ TM_facs_metadata.fix.tsv cell_ontology_class hub/ --name "TabulaMuris" ######################################################################### # LIFTOVER TO GRCm38B (DONE - 2019-03-01 - Hiram) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/blat.GRCm38B.2019-03-01 cd /hive/data/genomes/mm10/bed/blat.GRCm38B.2019-03-01 doSameSpeciesLiftOver.pl -verbose=2 \ -fileServer=hgwdev \ -debug -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/mm10/jkStuff/mm10.11.ooc \ mm10 GRCm38B doSameSpeciesLiftOver.pl -verbose=2 \ -debug -fileServer=hgwdev \ 
-query2Bit=/hive/data/genomes/mm10/mm10.2bit \ -querySizes=/hive/data/genomes/mm10/chrom.sizes \ -target2Bit=/hive/data/genomes/GRCm38B/GRCm38B.2bit \ -targetSizes=/hive/data/genomes/GRCm38B/chrom.sizes \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/mm10/mm10.11.ooc mm10 GRCm38B time (doSameSpeciesLiftOver.pl -verbose=2 \ -fileServer=hgwdev \ -query2Bit=/hive/data/genomes/mm10/mm10.2bit \ -querySizes=/hive/data/genomes/mm10/chrom.sizes \ -target2Bit=/hive/data/genomes/GRCm38B/GRCm38B.2bit \ -targetSizes=/hive/data/genomes/GRCm38B/chrom.sizes \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ -ooc=/hive/data/genomes/mm10/mm10.11.ooc \ mm10 GRCm38B) > doLiftOverToGRCm38B.log 2>&1 # real 156m50.777s # see if the liftOver menus function in the browser from mm10 to GRCm38B ######################################################################### ############################################################################# # hgPal downloads (rebuilt knownGene and knownCanonical 2019-04-01 braney ) ssh hgwdev mkdir /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc18 cd /hive/data/genomes/mm10/bed/multiz60way/pal.ucsc18 cat ../species.list | tr '[ ]' '[\n]' > order.list export mz=multiz60way export gp=knownGene export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x ./$gp.jobs > $gp.jobs.log 2>&1 & # real 59m23.279s time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 1m35.590s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 7m46.538s export mz=multiz60way export gp=knownGene export db=mm10 export 
pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments rm -rf $pd mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz rm -rf exonAA exonNuc cd /hive/data/genomes/mm10/bed/multiz60way/pal export mz=multiz60way export gp=ncbiRefSeq export db=mm10 export I=0 mkdir exonAA exonNuc for C in `sort -nk2 ../../../chrom.sizes | cut -f1` do I=`echo $I | awk '{print $1+1}'` echo "mafGene -chrom=$C -exons -noTrans $db $mz $gp order.list stdout | gzip -c > exonNuc/$C.exonNuc.fa.gz &" echo "mafGene -chrom=$C -exons $db $mz $gp order.list stdout | gzip -c > exonAA/$C.exonAA.fa.gz &" if [ $I -gt 6 ]; then echo "date" echo "wait" I=0 fi done > $gp.jobs echo "date" >> $gp.jobs echo "wait" >> $gp.jobs time sh -x $gp.jobs > $gp.jobs.log 2>&1 # real 126m0.688s export mz=multiz60way export gp=ncbiRefSeq export db=mm10 time zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz # real 2m56.817s time zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz # real 14m8.080s rm -rf exonAA exonNuc # we're only distributing exons at the moment export mz=multiz60way export gp=ncbiRefSeq export db=mm10 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz ### And knownCanonical cd /hive/data/genomes/mm10/bed/multiz60way/pal export mz=multiz60way export gp=knownCanonical export db=mm10 mkdir exonAA exonNuc knownCanonical time cut -f1 ../../../chrom.sizes | while read C do echo $C 1>&2 hgsql mm10 -N -e "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$C'" > knownCanonical/$C.known.bed done # real 0m15.897s ls knownCanonical/*.known.bed | while read F do if [ -s $F ]; then echo $F | sed -e 's#knownCanonical/##; s/.known.bed//' fi done | while read C do echo "date" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons -noTrans $db $mz knownGene order.list 
stdout | \ gzip -c > exonNuc/$C.exonNuc.fa.gz" echo "mafGene -geneBeds=knownCanonical/$C.known.bed -exons $db $mz knownGene order.list stdout | \ gzip -c > exonAA/$C.exonAA.fa.gz" done > $gp.$mz.jobs time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 # 267m58.813s rm *.known.bed export mz=multiz60way export gp=knownCanonical export db=mm10 zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz & zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz & # about 6 minutes rm -rf exonAA exonNuc export mz=multiz60way export gp=knownCanonical export db=mm10 export pd=/usr/local/apache/htdocs-hgdownload/goldenPath/$db/$mz/alignments mkdir -p $pd ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz cd $pd md5sum *.fa.gz > md5sum.txt ############################################################################## 2019-04-08: import of UCSC GENCODE group processing of GENCODE VM21 (markd) # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM21 pushd /hive/data/genomes/mm10/bed/gencodeVM21 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M21 96 'Apr 2019' ## only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM21.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM21 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. 
cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 # commit all # if pushing public, add ticket and MARK QA READY ######################################################################### 2019-07-03: import of UCSC GENCODE group processing of GENCODE VM22 (markd) # Replaced import of pre-release # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM22 pushd /hive/data/genomes/mm10/bed/gencodeVM22 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& ## gencode-cmp.tsv check to see if sizes make sense # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M22 97 'June 2019' ## only if being pushed to RR: # Update mouse/mm10/wgEncodeGencodeSuper.html # Update 'Release Notes' to describe new release. # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM22.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM22 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck db=mm10 # commit all and make push request, the file tables.lst will have the # list of tables for the push request. 
cd ~/kent/src/hg/makeDb/trackDb make alpha DBS=mm10 # commit all # if pushing public, add ticket and MARK QA READY ############################################################################## # LASTZ Rat regenRn0 (DONE - 2019-07-01 - Jonathan) mkdir /hive/data/genomes/mm10/bed/lastzRegenRn0.2019-07-01 cd /hive/data/genomes/mm10/bed/lastzRegenRn0.2019-07-01 printf '# rat vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/scratch/data/mm10/mm10.2bit SEQ1_LEN=/scratch/data/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rat RegenRn0 SEQ2_DIR=/hive/data/genomes/regenRn0/regenRn0.2bit SEQ2_LEN=/hive/data/genomes/regenRn0/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzRegenRn0.2019-07-01 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10RegenRn0 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -noDbNameCheck -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=medium) > do.log 2>&1 # real 196m22.733s cat fb.mm10.chainRegenRn0Link.txt # 1843678500 bases of 2652783500 (69.500%) in intersection cat fb.mm10.chainSynRegenRn0Link.txt # 1720395177 bases of 2652783500 (64.852%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 regenRn0) > rbest.log 2>&1 & # real 494m43.241s cat fb.mm10.chainRBest.RegenRn0.txt # 1694384084 bases of 2652783500 (63.872%) in intersection mkdir /hive/data/genomes/regenRn0/bed/blastz.mm10.swap cd /hive/data/genomes/regenRn0/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRegenRn0.2019-07-01/DEF \ -swap -syntenicNet -noDbNameCheck \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=medium) > swap.log 2>&1 # real 106m31.449s cat fb.regenRn0.chainMm10Link.txt # 1803664991 bases of 2534810853 (71.156%) in intersection cat 
fb.regenRn0.chainSynMm10Link.txt # 1712372147 bases of 2534810853 (67.554%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` regenRn0 mm10) > rbest.log 2>&1 # real 536m51.292s cat fb.regenRn0.chainRBest.Mm10.txt # 1695272967 bases of 2534810853 (66.880%) in intersection ############################################################################## # LASTZ Rhesus rheMac10 (DONE - 2019-07-03 - Hiram) mkdir /hive/data/genomes/mm10/bed/lastzRheMac10.2019-07-03 cd /hive/data/genomes/mm10/bed/lastzRheMac10.2019-07-03 printf '# rhesus vs mouse BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.00/bin/lastz # TARGET: Mouse Mm10 SEQ1_DIR=/hive/data/genomes/mm10/mm10.2bit SEQ1_LEN=/hive/data/genomes/mm10/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Rhesus RheMac10 SEQ2_DIR=/hive/data/genomes/rheMac10/rheMac10.2bit SEQ2_LEN=/hive/data/genomes/rheMac10/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=500 BASE=/hive/data/genomes/mm10/bed/lastzRheMac10.2019-07-03 TMPDIR=/dev/shm ' > DEF # establish a screen to control this job screen -S mm10RheMac10 time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -syntenicNet -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium) > do.log 2>&1 # real 211m21.922s cat fb.mm10.chainRheMac10Link.txt # 923559693 bases of 2652783500 (34.815%) in intersection cat fb.mm10.chainSynRheMac10Link.txt # 878479553 bases of 2652783500 (33.115%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` mm10 rheMac10) > rbest.log 2>&1 & # real 315m43.465s cat fb.mm10.chainRBest.RheMac10.txt # 879885863 bases of 2652783500 (33.168%) in intersection mkdir /hive/data/genomes/rheMac10/bed/blastz.mm10.swap cd /hive/data/genomes/rheMac10/bed/blastz.mm10.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/mm10/bed/lastzRheMac10.2019-07-03/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 
-chainLinearGap=medium) > swap.log 2>&1 # real 52m48.045s cat fb.rheMac10.chainMm10Link.txt # 918551088 bases of 2936892733 (31.276%) in intersection cat fb.rheMac10.chainSynMm10Link.txt # 876230433 bases of 2936892733 (29.835%) in intersection time (doRecipBest.pl -workhorse=hgwdev -load -buildDir=`pwd` rheMac10 mm10) > rbest.log 2>&1 # real 303m40.303s cat fb.rheMac10.chainRBest.Mm10.txt # 878542993 bases of 2936892733 (29.914%) in intersection ############################################################################## 2019-08-30: import of UCSC GENCODE group processing of GENCODE VM23 (markd) # PRE-RELEASE # edit hg/makeDb/outside/gencode/gencodeLoad.mk to set release and ensembl versions # download, build and load tables mkdir -p /hive/data/genomes/mm10/bed/gencodeVM23 pushd /hive/data/genomes/mm10/bed/gencodeVM23 (time nice make -j 10 -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk) >&build.1.out& # compare tables from previous release to see if number changed makes # sense. Results are in gencode-cmp.tsv # generate trackDb and joiner blurb pushd ~/kent/src/hg/makeDb/trackDb ../../makeDb/outside/gencode/gencodeGenerateTrackDbs mm10 M23 98 'Sept 2019' # If being pushed public, update 'Release Notes' in # mouse/mm10/wgEncodeGencodeSuper.html # edit mouse/mm10/trackDb.gencode.ra to add new .ra file include make DBS=mm10 ## only if being pushed to RR: # edit all.joiner to add ~/tmp/gencodeVM23.joiner # verify with: pushd /hive/data/genomes/mm10/bed/gencodeVM23 make -f ~/kent/src/hg/makeDb/outside/gencode/gencodeLoad.mk joinerCheck # commit all # if pushing public: add ticket and MARK QA READY + # + #########################################################################