77740770d513278f03c8cb1eac5f02c83c6c26c4 gperez2 Thu Feb 10 16:40:31 2022 -0800 Correcting makedoc location for taeGut2 vs. geoFor1 lastz/chain/net run for user, refs #28840 diff --git src/hg/makeDb/doc/taeGut2.txt src/hg/makeDb/doc/taeGut2.txt index 8b41382..b8ea2a2 100644 --- src/hg/makeDb/doc/taeGut2.txt +++ src/hg/makeDb/doc/taeGut2.txt @@ -1,902 +1,1003 @@ # for emacs: -*- mode: sh; -*- # DATE: 08-Feb-2013 # ORGANISM: Taeniopygia guttata # TAXID: 59729 # ASSEMBLY LONG NAME: Taeniopygia_guttata-3.2.4 # ASSEMBLY SHORT NAME: Taeniopygia_guttata-3.2.4 # ASSEMBLY SUBMITTER: Washington University Genome Sequencing Center # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000151805.2 # FTP-RELEASE DATE: 12-Feb-2013 # http://www.ncbi.nlm.nih.gov/genome/367 # http://www.ncbi.nlm.nih.gov/genome/assembly/524908 # http://www.ncbi.nlm.nih.gov/bioproject/17289 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=ABQF01 # Genome Coverage : 5.5x PCAP v. 2008 # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=59729 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/ ########################################################################## # Download sequence (DONE - 2013-06-05 - Hiram) mkdir -p /hive/data/genomes/taeGut2/genbank cd /hive/data/genomes/taeGut2/genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Taeniopygia_guttata/Taeniopygia_guttata-3.2.4/ # real real 22m54.139s # verify the size of the sequence here: faSize Primary_Assembly/assembled_chromosomes/FASTA/chr*.fa.gz \ Primary_Assembly/unlocalized_scaffolds/FASTA/chr*.unlocalized.scaf.fa.gz \ Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower) # in 37095 sequences in 69 files # Total size: mean 33215.2 sd 1455020.7 # min 783 (gi|197861094|gb|ABQF01053133.1|) # max 156412533 (gi|209446643|gb|CM000518.1|) median 3541 ########################################################################## # fixup names for UCSC standards (DONE - 2013-06-05 - Hiram) mkdir /hive/data/genomes/taeGut2/ucsc cd /hive/data/genomes/taeGut2/ucsc ######################## Assembled Chromosomes cat << '_EOF_' > ucscCompositeAgp.pl #!/bin/env perl use strict; use warnings; my %accToChr; open (FH, "<../genbank/Primary_Assembly/assembled_chromosomes/chr2acc") or die "can not read Primary_Assembly/assembled_chromosomes/chr2acc"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $accToChr{$acc} = $chrN; } close (FH); foreach my $acc (keys %accToChr) { my $chrN = $accToChr{$acc}; print "$acc $accToChr{$acc}\n"; open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/AGP/chr${chrN}.comp.agp.gz|") or die "can not read chr${chrN}.comp.agp.gz"; open (UC, ">chr${chrN}.agp") or die "can not write to chr${chrN}.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/^$acc/chr${chrN}/; print UC $line; } } close (FH); close (UC); open (FH, "zcat ../genbank/Primary_Assembly/assembled_chromosomes/FASTA/chr${chrN}.fa.gz|") or die "can not read chr${chrN}.fa.gz"; open (UC, ">chr${chrN}.fa") or die "can not write to chr${chrN}.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { printf UC ">chr${chrN}\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x ucscCompositeAgp.pl time ./ucscCompositeAgp.pl # real 0m16.041s gzip *.fa *.agp & ######################## Unplaced scaffolds # verify we don't have any .acc numbers different from .1 zcat \ ../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz \ | cut -f1 | grep -v '^#' \ | sed -e 's/[A-Z0-9]*//i' | sort | uniq -c # 81153 .1 cat << '_EOF_' > unplaced.pl #!/bin/env perl use strict; use warnings; my $agpFile = "../genbank/Primary_Assembly/unplaced_scaffolds/AGP/unplaced.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chrUn.agp") or die "can not write to chrUn.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { $line =~ s/\.1//; printf UC "chrUn_%s", $line; } } close (FH); close (UC); open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chrUn.fa") or die "can not write to chrUn.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; $line =~ s/.*gb\|//; $line =~ s/\.1. .*//; printf UC ">chrUn_$line\n"; } else { print UC $line; } } close (FH); close (UC); '_EOF_' # << happy emacs chmod +x unplaced.pl time ./unplaced.pl # real 0m5.659s gzip chrUn.agp chrUn.fa & # make sure none of the names got to be over 31 characers long: zcat chrUn.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($1)}' | sort -rn | head -1 # 18 # Unlocalized scaffolds cat << '_EOF_' > unlocalized.pl #!/bin/env perl use strict; use warnings; my %accToChr; my %chrNames; open (FH, "<../genbank/Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf") or die "can not read Primary_Assembly/unlocalized_scaffolds/unlocalized.chr2scaf"; while (my $line = <FH>) { next if ($line =~ m/^#/); chomp $line; my ($chrN, $acc) = split('\s+', $line); $acc =~ s/\.1$//; $accToChr{$acc} = $chrN; $chrNames{$chrN} += 1; } close (FH); foreach my $chrN (keys %chrNames) { my $agpFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/AGP/chr$chrN.unlocalized.scaf.agp.gz"; my $fastaFile = "../genbank/Primary_Assembly/unlocalized_scaffolds/FASTA/chr$chrN.unlocalized.scaf.fa.gz"; open (FH, "zcat $agpFile|") or die "can not read $agpFile"; open (UC, ">chr${chrN}_random.agp") or die "can not write to chr${chrN}_random.agp"; while (my $line = <FH>) { if ($line =~ m/^#/) { print UC $line; } else { chomp $line; my (@a) = split('\t', $line); my $acc = $a[0]; $acc =~ s/\.1//; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC "%s", $ucscName; for (my $i = 1; $i < scalar(@a); ++$i) { printf UC "\t%s", $a[$i]; } printf UC "\n"; } } close (FH); close (UC); printf "chr%s\n", $chrN; open (FH, "zcat $fastaFile|") or die "can not read $fastaFile"; open (UC, ">chr${chrN}_random.fa") or die "can not write to chr${chrN}_random.fa"; while (my $line = <FH>) { if ($line =~ m/^>/) { chomp $line; my $acc = $line; $acc =~ s/.*gb\|//; $acc =~ s/\.1. .*//; $acc =~ s/\.1//; die "ERROR: chrN $chrN not correct for $acc" if ($accToChr{$acc} ne $chrN); my $ucscName = "chr${chrN}_${acc}_random"; printf UC ">$ucscName\n"; } else { print UC $line; } } close (FH); close (UC); } '_EOF_' # << happy emacs chmod +x unlocalized.pl time ./unlocalized.pl # real 0m2.140s gzip *.fa *.agp # verify nothing lost from original: time faSize *.fa.gz # real 0m32.541s # 1232118738 bases (9270900 N's 1222847838 real 1222847838 upper 0 lower) # in 37095 sequences in 69 files # Total size: mean 33215.2 sd 1455020.7 min 783 (chr5_ABQF01053133_random) # max 156412533 (chr2) median 3541 # make sure none of the names got to be over 31 characers long: zcat *.agp.gz | grep -v "^#" | cut -f1 | awk '{print length($0)}' \ | sort -rn | uniq -c | head ########################################################################## # Initial makeGenomeDb.pl (DONE - 2013-06-05 - Hiram) cd /hive/data/genomes/taeGut2 cat << '_EOF_' > taeGut2.config.ra # Config parameters for makeGenomeDb.pl: db taeGut2 clade vertebrate genomeCladePriority 68 scientificName Taeniopygia guttata commonName Zebra finch assemblyDate Feb. 2013 assemblyLabel Washington University School of Medicine assemblyShortLabel WashU taeGut324 orderKey 436 mitoAcc NC_007897 fastaFiles /cluster/data/taeGut2/ucsc/chr*.fa.gz agpFiles /cluster/data/taeGut2/ucsc/chr*.agp.gz # qualFiles none dbDbSpeciesDir zebraFinch photoCreditURL http://www.physci.ucla.edu/html/arnold.htm photoCreditName UCLA Department of Physiological Science ncbiGenomeId 367 ncbiAssemblyId 524908 ncbiAssemblyName Taeniopygia_guttata-3.2.4 ncbiBioProject 17289 genBankAccessionID GCA_000151805.2 taxId 59729 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp taeGut2.config.ra > agp.log 2>&1 # verify no problem: tail -1 agp.log # *** All done! (through the 'agp' step) time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -continue=db taeGut2.config.ra > db.log 2>&1 # real 10m29.132s # # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/taeGut2.unmasked.2bit /gbdb/taeGut2/taeGut2.2bit # browser should function now in sandbox # trackDb files here: # /hive/data/genomes/taeGut2/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/gibbon/taeGut2/ # into source tree # now browser should function on hgwdev ######################################################################### # running repeat masker (DONE - 2013-06-05 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/repeatMasker cd /hive/data/genomes/taeGut2/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek taeGut2 > do.log 2>&1 & # real 220m20.455s cat faSize.rmsk.txt # 1232135591 bases (9270900 N's 1222864691 real 1125277738 upper # 97586953 lower) in 37096 sequences in 1 files # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random) # max 156412533 (chr2) median 3541 # %7.92 masked total, %7.98 masked real egrep -i "versi|relea" do.log # January 10 2013 (open-4-0-0) version of RepeatMasker # CC RELEASE 20120418; time featureBits -countGaps taeGut2 rmsk # 97640353 bases of 1232135591 (7.924%) in intersection # real 0m16.657s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2013-06-05 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/simpleRepeat cd /hive/data/genomes/taeGut2/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ taeGut2 > do.log 2>&1 # about 14m30s cat fb.simpleRepeat # 25800814 bases of 1222864691 (2.110%) in intersection # considering rmsk %8 vs. WM %20 and this is a small genome, # use the rmsk result in order to have the classifications from # that available # add to rmsk after it is done: cd /hive/data/genomes/taeGut2 twoBitMask taeGut2.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed taeGut2.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa taeGut2.2bit stdout | faSize stdin > faSize.taeGut2.2bit.txt cat faSize.taeGut2.2bit.txt # 1232135591 bases (9270900 N's 1222864691 real 1123697742 upper # 99166949 lower) in 37096 sequences in 1 files # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random) # max 156412533 (chr2) median 3541 # %8.05 masked total, %8.11 masked real rm /gbdb/taeGut2/taeGut2.2bit ln -s `pwd`/taeGut2.2bit /gbdb/taeGut2/taeGut2.2bit ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-06-05 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/gap cd /hive/data/genomes/taeGut2/bed/gap time nice findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../taeGut2.unmasked.2bit > findMotif.txt 2>&1 # real 1m2.760s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits taeGut2 -not gap -bed=notGap.bed # 1222864691 bases of 1222864691 (100.000%) in intersection # real 0m9.762s time featureBits taeGut2 allGaps.bed notGap.bed -bed=new.gaps.bed # 0 bases of 1222864691 (0.000%) in intersection # real 19m53.371s # are there non-bridged gaps here: hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c # 326 no # 87384 yes ########################################################################## ## WINDOWMASKER (DONE- 2013-06-05 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/windowMasker cd /hive/data/genomes/taeGut2/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev taeGut2 > do.log 2>&1 & # real 66m52.863s # Masking statistics faSize.taeGut2.cleanWMSdust.txt # 1232135591 bases (9270900 N's 1222864691 real 973147255 upper # 249717436 lower) in 37096 sequences in 1 files # Total size: mean 33214.8 sd 1455001.1 min 783 (chr5_ABQF01053133_random) # max 156412533 (chr2) median 3541 # %20.27 masked total, %20.42 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps taeGut2 rmsk windowmaskerSdust # 54554911 bases of 1232135591 (4.428%) in intersection ######################################################################## # cpgIslands - (DONE - 2013-06-06 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/cpgIslands cd /hive/data/genomes/taeGut2/bed/cpgIslands time doCpgIslands.pl taeGut2 > do.log 2>&1 # real 17m40.265s cat fb.taeGut2.cpgIslandExt.txt # 16473980 bases of 1222864691 (1.347%) in intersection ############################################################################# # CPG Islands Unmasked track mkdir /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked cd /hive/data/genomes/taeGut2/bed/cpgIslandsUnmasked time doCpgIslands.pl -buildDir=`pwd` -bigClusterHub=ku \ -tableName=cpgIslandExtUnmasked -dbHost=hgwdev -smallClusterHub=ku \ -workhorse=hgwdev \ -maskedSeq=/hive/data/genomes/taeGut2/taeGut2.unmasked.2bit \ taeGut2 > do.log 2>&1 ############################################################################# ######################################################################### # genscan - (DONE - 2013-06-06 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/genscan cd /hive/data/genomes/taeGut2/bed/genscan time doGenscan.pl taeGut2 > do.log 2>&1 # real 47m7.823s # XXX failed four jobs, running manually on hgwdev: chr9 chr1A chr3 chr2 # with -window=2000000 - all bug chr9 finished # real 57m16.383s # user 133m46.445s # sys 2m55.584s # running chr9 with -window=1000000 # real 10m14.291s # user 9m59.182s # sys 0m13.720s time doGenscan.pl -continue=makeBed taeGut2 > makeBed.log 2>&1 # real 30m4.119s cat fb.taeGut2.genscan.txt # 52986693 bases of 1222864691 (4.333%) in intersection cat fb.taeGut2.genscanSubopt.txt # 28313661 bases of 1222864691 (2.315%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-06 - Hiram) # Use -repMatch=500, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1222864691 / 2897316137 \) \* 1024 # ( 1222864691 / 2897316137 ) * 1024 = 432.197725 # round up to 500 (taeGut1 used 1024) cd /hive/data/genomes/taeGut2 time blat taeGut2.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/taeGut2.11.ooc -repMatch=500 # Wrote 12162 overused 11-mers to jkStuff/taeGut2.11.ooc # real 0m32.994s # there are non-bridged gaps, therefore, a lift file is needed for genbank hgsql -N -e "select bridge from gap;" taeGut2 | sort | uniq -c # 326 no # 87384 yes cd /hive/data/genomes/taeGut2/jkStuff gapToLift taeGut2 taeGut2.nonBridged.lift -bedFile=taeGut2.nonBridged.bed # largest non-bridged contig: awk '{print $3-$2,$0}' taeGut2.nonBridged.bed | sort -nr | head # 56928224 chr5 4758199 61686423 chr5.07 ######################################################################### # AUTO UPDATE GENBANK (DONE - 2013-03-08 - Pauline) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Taeniopygia guttata 5086 92144 987 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add taeGut2 just after ce2 # taeGut2 - Zebra Finch taeGut2.serverGenome = /hive/data/genomes/taeGut2/taeGut2.2bit taeGut2.clusterGenome = /hive/data/genomes/taeGut2/taeGut2.2bit taeGut2.ooc = /hive/data/genomes/taeGut2/jkStuff/taeGut2.11.ooc taeGut2.align.unplacedChroms = chrUn_*,chr*_random taeGut2.lift = /hive/data/genomes/taeGut2/jkStuff/taeGut2.nonBridged.lift taeGut2.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} taeGut2.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} taeGut2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} taeGut2.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} taeGut2.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} taeGut2.refseq.mrna.native.load = yes taeGut2.refseq.mrna.xeno.load = yes taeGut2.genbank.mrna.xeno.load = yes taeGut2.downloadDir = taeGut2 taeGut2.upstreamGeneTbl = refGene # end of section added to etc/genbank.conf git commit -m "adding taeGut2 Zebra finch refs #10190" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on "genbank" machine screen -S taeGut2 # long running job managed in screen cd /cluster/data/genbank time ./bin/gbAlignStep -initial taeGut2 & # logFile: var/build/logs/2013.06.18-11:56:48.taeGut2.initalign.log # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad taeGut2 & # logFile: var/dbload/hgwdev/logs/2013.06.08-18:59:10.dbload.log # real 29m9.092s # enable daily alignment and update of hgwdev (DONE - 2013-03-08 - Pauline) cd ~/kent/src/hg/makeDb/genbank git pull # add taeGut2 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added taeGut2. refs #10190" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position similar to taeGut1 (DONE - 2013-06-19 - Hiram) hgsql -e \ 'update dbDb set defaultPos="chrZ:56283356-56343168" where name="taeGut2";' \ hgcentraltest ######################################################################### # BLATSERVERS ENTRY (DONE - 2013-06-19 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("taeGut2", "blat4c", "17848", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("taeGut2", "blat4c", "17849", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # reset dbDb description to be like the previous nomLeu # (DONE - 2013-03-21 - Hiram) hgsql hgcentraltest -e 'update dbDb set description = "Oct. 2012 (GGSC Nleu3.0/taeGut2)"" where name = "taeGut2";' \ | hgsql -h genome-testdb hgcentraltest ############################################################################ # construct liftOver from taeGut1 to taeGut2 (DONE - 2013-06-19 - Hiram) # documentation for this step is in taeGut1 - been done ############################################################################ # downloads and pushQ entry (DONE - 2014-04-16 - Hiram) # after adding taeGut2 to the all.joiner file and verifying that # joinerCheck is clean (i.e. run joinerCheck w -times and -keys flags # to make sure there are no errors), can construct the downloads: cd /hive/data/genomes/taeGut2 time makeDownloads.pl -workhorse=hgwdev taeGut2 # real 14m53.730s mkdir /hive/data/genomes/taeGut2/pushQ cd /hive/data/genomes/taeGut2/pushQ time makePushQSql.pl taeGut2 2> stderr.txt > taeGut2.sql # real 2m4.300s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/taeGut2/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/taeGut2/wib/quality.wib # WARNING: hgwdev does not have /gbdb/taeGut2/bbi/quality.bw # WARNING: taeGut2 does not have seq # WARNING: taeGut2 does not have extFile # WARNING: taeGut2 does not have estOrientInfo scp -p taeGut2.sql qateam@hgwbeta:/tmp/ ssh qateam@hgwbeta hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/taeGut2.sql" ############################################################################ # BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2013-10-18,25 - Hiram) # use a screen to manage this multi-day job screen -S melGal1 mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18 cd /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18 cat << '_EOF_' > DEF # Zebra finch vs. Turkey BLASTZ_M=50 # TARGET: Zebra finch TaeGut2 SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=100 # QUERY: Turkey MelGal1 SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit SEQ2_LEN=/scratch/data/melGal1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=100 SEQ2_LAP=0 BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18 TMPDIR=/dev/shm '_EOF_' # << this line keeps emacs coloring happy # screen time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 # real 4063m8.278s time nice -n +19 doBlastzChainNet.pl -verbose=2 \ `pwd`/DEF \ -continue=cat -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > cat.log 2>&1 # real 66m17.485s cat fb.taeGut2.chainMelGal1Link.txt # 781765606 bases of 1222864691 (63.929%) in intersection cd /hive/data/genomes/taeGut2/bed ln -s lastzMelGal1.2013-10-18 lastz.melGal1 # and then swap mkdir /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap cd /hive/data/genomes/melGal1/bed/blastz.taeGut2.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/taeGut2/bed/lastzMelGal1.2013-10-18/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 # real 43m24.666s cat fb.melGal1.chainTaeGut2Link.txt # 661376423 bases of 935922386 (70.666%) in intersection cd /hive/data/genomes/melGal1/bed ln -s blastz.taeGut2.swap lastz.taeGut2 ############################################################################ # lifting ensGene from taeGut1 (DONE - 2011-03-23 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/ensGene.73 cd /hive/data/genomes/taeGut2/bed/ensGene.73 # start with taeGut1 ensGene hgsql -N -e "select * from ensGene;" taeGut1 | cut -f2- > taeGut1.ensGene.gp # lift over to taeGut2 liftOver -genePred taeGut1.ensGene.gp \ /hive/data/genomes/taeGut1/bed/liftOver/taeGut1ToTaeGut2.over.chain.gz \ taeGut2.ensGene.gp unMapped.txt # check to see which genes have gone bad genePredCheck -db=taeGut2 taeGut2.ensGene.gp > badOnes.txt 2>&1 # checked: 19226 failed: 2 awk '{print $3}' badOnes.txt | sort -u | grep -v failed > toRemove.txt # remove the bad ones cp -p taeGut2.ensGene.gp taeGut2.unfiltered.ensGene.gp for N in `cat toRemove.txt` do rm -f t.gp grep -v "${N}" taeGut2.ensGene.gp > t.gp rm -f taeGut2.ensGene.gp mv t.gp taeGut2.ensGene.gp done genePredCheck -db=taeGut2 taeGut2.ensGene.gp # checked: 19225 failed: 0 # OK to load this since all are OK now hgLoadGenePred -genePredExt taeGut2 ensGene taeGut2.ensGene.gp genePredCheck -db=taeGut2 ensGene # checked: 19225 failed: 0 # use the ensPep and ensGtp from taeGut1, but remove the bad ones zcat /hive/data/genomes/taeGut1/bed/ensGene.73/download/Taeniopygia_guttata.taeGut3.2.4.73.pep.all.fa.gz \ | sed -e 's/^>.* transcript:/>/; s/ CCDS.*$//; s/ gene_biotype.*$//;' | gzip > ensPep.txt.gz zcat ensPep.txt.gz \ | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \ | sed -e '/^$/d; s/*$//' | sort > ensPep.taeGut1.fa.tab cp -p ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab for N in `cat toRemove.txt` do rm -f t.fa.tab grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab rm -f ensPep.taeGut2.fa.tab mv t.fa.tab ensPep.taeGut2.fa.tab echo "'${N}'" done wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab # 18204 ensPep.taeGut1.fa.tab # 18203 ensPep.taeGut2.fa.tab awk '{print $1}' taeGut1.ensGene.gp | sort -u > taeGut1.name.list awk '{print $1}' taeGut2.ensGene.gp | sort -u > taeGut2.name.list cp -p /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensGtp.tab \ ./taeGut1.ensGtp.tab cp -p taeGut1.ensGtp.tab taeGut2.ensGtp.tab comm -23 taeGut1.name.list taeGut2.name.list | while read N do rm -f t.gtp.tab grep -v "${N}" taeGut2.ensGtp.tab > t.gtp.tab rm -f taeGut2.ensGtp.tab mv t.gtp.tab taeGut2.ensGtp.tab echo "${N}" done wc -l *ensGtp* # 19334 taeGut1.ensGtp.tab # 19225 taeGut2.ensGtp.tab comm -23 taeGut1.name.list taeGut2.name.list | while read N do rm -f t.fa.tab grep -v "${N}" ensPep.taeGut2.fa.tab > t.fa.tab rm -f ensPep.taeGut2.fa.tab mv t.fa.tab ensPep.taeGut2.fa.tab echo "'${N}'" done wc -l ensPep.taeGut1.fa.tab ensPep.taeGut2.fa.tab # 18204 ensPep.taeGut1.fa.tab # 18097 ensPep.taeGut2.fa.tab # now that those are clean, load them up hgPepPred taeGut2 tab ensPep ensPep.taeGut2.fa.tab hgLoadSqlTab taeGut2 ensGtp ~/kent/src/hg/lib/ensGtp.sql taeGut2.ensGtp.tab # and record version in trackVersion hgsql -e 'INSERT INTO trackVersion \ (db, name, who, version, updateTime, comment, source, dateReference) \ VALUES("taeGut2", "ensGene", "hiram", "73", now(), \ "peptides lifted from taeGut1", \ "genes lifted from taeGut1 ensGene v73", "current" );' hgFixed /cluster/bin/scripts/ensemblInfo.pl /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblToGeneName.taeGut1.tab /cluster/bin/scripts/ensemblInfo.pl -field2=source /hive/data/genomes/taeGut1/bed/ensGene.73/process/infoOut.txt | sort > ensemblSource.taeGut1.tab # default join output separates fields by space # we need tab separated files for hgLoadSqlTab # if it was join -t'\t' - it doesn't recognize \t as a tab # if it was the hidden tab character join -t'^I' it wouldn't cut and paste # so use the sed to change space into tab since we know our names do # not have space join taeGut2.name.list ensemblToGeneName.taeGut1.tab \ | sed -e 's/ /\t/g' > ensemblToGeneName.tab join taeGut2.name.list ensemblSource.taeGut1.tab \ | sed -e 's/ /\t/g' > ensemblSource.tab hgLoadSqlTab taeGut2 ensemblToGeneName \ /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblToGeneName.sql \ ensemblToGeneName.tab hgLoadSqlTab taeGut2 ensemblSource \ /hive/data/genomes/taeGut1/bed/ensGene.73/process/ensemblSource.sql \ ensemblSource.tab ######################################################################### # create ucscToINSDC name mapping (DONE - 2014-04-15 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/ucscToINSDC cd /hive/data/genomes/taeGut2/bed/ucscToINSDC # this script has been maturing over time, it is close to complete. # to find a latest copy of it: # ls -ogrt /hive/data/genomes/*/bed/ucscToINSDC/translateNames.sh cp -p /hive/data/genomes/eriEur2/bed/ucscToINSDC/translateNames.sh . ./translateNames.sh # it says: # need to find chrM accessions # so add this one: echo -e 'chrM\tNC_007897.1' >> ucscToINSDC.txt # needs to be sorted to work with join sort ucscToINSDC.txt > ucscToINSDC.tab awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes | sort \ > name.coordinate.tab join name.coordinate.tab ucscToINSDC.tab | tr '[ ]' '[\t]' > ucscToINSDC.bed cut -f1 ucscToINSDC.bed | awk '{print length($0)}' | sort -n | tail -1 # 28 # use the 28 in this sed: sed -e "s/21/28/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab taeGut2 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords taeGut2 ucscToINSDC # should cover all bases featureBits -countGaps taeGut2 ucscToINSDC # 1232135591 bases of 1232135591 (100.000%) in intersection ############################################################################## # cytoBandIdeo - (DONE - 2014-04-15 - Hiram) mkdir /hive/data/genomes/taeGut2/bed/cytoBand cd /hive/data/genomes/taeGut2/bed/cytoBand makeCytoBandIdeo.csh taeGut2 ############################################################################## # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram) hgsql -N -e "select frag from gold;" taeGut2 | sort | head -1 ABQF01000001.1 hgsql -N -e "select frag from gold;" taeGut2 | sort | tail -2 ABQF01124805.1 NC_007897 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" taeGut2 | wc -l # 124806 hgsql -N -e "select frag from gold;" taeGut2 | egrep -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l # 124806 hgsql -N -e "select frag from gold;" taeGut2 | egrep -v -e '[AN][BC][Q_][F0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/zebraFinch/taeGut2/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][BC][Q_][F0]0[0-9]+(\.1)? query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ######################################################################### ############################################################################## # TransMap V3 tracks. see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## ############################################################################ # BLASTZ/CHAIN/NET Zebra finch vs Turkey (DONE - 2017-01-19,25 - Hiram) # use a screen to manage this multi-day job screen -S melGal5 mkdir /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19 cd /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19 printf '# Zebra finch vs. Turkey BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz BLASTZ_M=254 # TARGET: Zebra finch TaeGut2 SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 SEQ1_LIMIT=300 # QUERY: Turkey MelGal5 SEQ2_DIR=/scratch/data/melGal5/melGal5.2bit SEQ2_LEN=/scratch/data/melGal5/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=800 SEQ2_LAP=0 BASE=/hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19 TMPDIR=/dev/shm ' > DEF time (doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 # real 266m59.980s cat fb.taeGut2.chainMelGal5Link.txt # 807888817 bases of 1222864691 (66.065%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` taeGut2 melGal5) \ > rbest.log 2>&1 & # real 413m28.452s # and then swap mkdir /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap cd /hive/data/genomes/melGal5/bed/blastz.taeGut2.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/taeGut2/bed/lastzMelGal5.2017-01-19/DEF \ -swap -syntenicNet \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 # real 119m13.393s cat fb.melGal5.chainTaeGut2Link.txt # 706578355 bases of 1093044709 (64.643%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` melGal5 taeGut2) \ > rbest.log 2>&1 # real 354m40.975s ############################################################################ +# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1 (DONE - 2022-02-03 -Gerardo) + +# should be able to run this from anywhere, this time it was run from: + cd kent/src/hg/utils/automation + + time (~/kent/src/hg/utils/automation/pairLastz.sh \ + taeGut2 geoFor1 other other) \ + > zebraFinch_medFinch_20220202_secondTime.log 2>&1 & + # check the total time +grep -w real zebraFinch_medFinch_20220202_secondTime.log | tail -1 | sed -e 's/^/ # /;' + # real 107m34.019s + + # this zebraFinch_medFinch_20220202_secondTime.log log file happens to have a copy of the make doc, as well + # as the copy of the make doc left in the target assembly directory: +# /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/makeDoc.txt + + # this command outputs this makeDoc text: + + cat kent/src/hg/utils/automation/zebraFinch_medFinch_20220202_secondTime.log + +############################################################################## +# LASTZ Zebra finch TaeGut2 vs. Medium ground finch GeoFor1 +# (DONE - 2022-02-03 - Gerardo) + + mkdir /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02 + cd /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02 + + printf '# Medium ground finch GeoFor1 vs. Zebra finch TaeGut2 +BLASTZ=/cluster/bin/penn/lastz-distrib-1.04.03/bin/lastz + +# TARGET: Zebra finch TaeGut2 +SEQ1_DIR=/hive/data/genomes/taeGut2/taeGut2.2bit +SEQ1_LEN=/hive/data/genomes/taeGut2/chrom.sizes +SEQ1_CHUNK=20000000 +SEQ1_LAP=10000 +SEQ1_LIMIT=40 + +# QUERY: Medium ground finch GeoFor1 +SEQ2_DIR=/hive/data/genomes/geoFor1/geoFor1.2bit +SEQ2_LEN=/hive/data/genomes/geoFor1/chrom.sizes +SEQ2_CHUNK=20000000 +SEQ2_LAP=0 +SEQ2_LIMIT=100 + +BASE=/hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02 +TMPDIR=/dev/shm + +' > DEF + + time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 `pwd`/DEF -syntenicNet \ + -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=5000 -chainLinearGap=loose) > do.log 2>&1 + grep -w real do.log | sed -e 's/^/ # /;' + # real 159m10.037s + + sed -e 's/^/ # /;' fb.taeGut2.chainGeoFor1Link.txt + # 1125215195 bases of 1222864691 (92.015%) in intersection + sed -e 's/^/ # /;' fb.taeGut2.chainSynGeoFor1Link.txt + # 1024055100 bases of 1222864691 (83.742%) in intersection + + time (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ + \ + \ + taeGut2 geoFor1) > rbest.log 2>&1 + + grep -w real rbest.log | sed -e 's/^/ # /;' + + sed -e 's/^/ # /;' fb.taeGut2.chainRBest.GeoFor1.txt + + ### and for the swap + + cd /hive/data/genomes/geoFor1/bed/blastz.taeGut2.swap + + time (~/kent/src/hg/utils/automation/doBlastzChainNet.pl -swap -verbose=2 \ + /hive/data/genomes/taeGut2/bed/lastzGeoFor1.2022-02-02/DEF -swapDir=`pwd` \ + -syntenicNet -workhorse=hgwdev -smallClusterHub=hgwdev -bigClusterHub=ku \ + -chainMinScore=5000 -chainLinearGap=loose) > swap.log 2>&1 + + grep -w real swap.log | sed -e 's/^/ # /;' + # real 42m46.900s + + sed -e 's/^/ # /;' fb.geoFor1.chainTaeGut2Link.txt + # 954457151 bases of 1041286029 (91.661%) in intersection + sed -e 's/^/ # /;' fb.geoFor1.chainSynTaeGut2Link.txt + # 912902982 bases of 1041286029 (87.671%) in intersection +\ time (~/kent/src/hg/utils/automation/doRecipBest.pl -load -workhorse=hgwdev -buildDir=`pwd` \ + \ + \ + geoFor1 taeGut2) > rbest.log 2>&1 + + grep -w real rbest.log | sed -e 's/^/ # /;' + # real 64m46.642s + + sed -e 's/^/ # /;' fb.geoFor1.chainRBest.TaeGut2.txt + # 919870835 bases of 1041286029 (88.340%) in intersection + +############################################################################## + +real 107m34.019s +user 0m1.109s +sys 0m1.771s