82e20c618a9b836885c82fd2d9fdbe941d3955e9 hiram Tue Mar 2 11:02:47 2021 -0800 completed liftOvers to xenTro10 refs #24693 diff --git src/hg/makeDb/doc/xenTro7.txt src/hg/makeDb/doc/xenTro7.txt index 5f0e542..86405e2 100644 --- src/hg/makeDb/doc/xenTro7.txt +++ src/hg/makeDb/doc/xenTro7.txt @@ -1,895 +1,913 @@ # for emacs: -*- mode: sh; -*- # DATE: 26-Sep-2012 # ORGANISM: Xenopus (Silurana) tropicalis # TAXID: 8364 # ASSEMBLY LONG NAME: Xtropicalis_v7 # ASSEMBLY SHORT NAME: Xtropicalis_v7 # ASSEMBLY SUBMITTER: DOE Joint Genome Institute # ASSEMBLY TYPE: Haploid # NUMBER OF ASSEMBLY-UNITS: 1 # ASSEMBLY ACCESSION: GCA_000004195.2 # FTP-RELEASE DATE: 28-Dec-2012 # http://www.ncbi.nlm.nih.gov/genome/80 # http://www.ncbi.nlm.nih.gov/genome/assembly/515038 # http://www.ncbi.nlm.nih.gov/bioproject/12348 # http://www.ncbi.nlm.nih.gov/Traces/wgs/?val=AAMC02 # Genome Coverage : 7.44X ABI 3739 ARACHNE v. 20071016_modified # http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=8364 # rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Xenopus_tropicalis/Xtropicalis_v7/ ########################################################################## # Download sequence (DONE - 2013-02-26 - Hiram) mkdir -p /hive/data/genomes/xenTro7/genbank cd /hive/data/genomes/xenTro7/genbank time rsync -a -P \ rsync://ftp.ncbi.nlm.nih.gov/genbank/genomes/Eukaryotes/vertebrates_other/Xenopus_tropicalis/Xtropicalis_v7/ ./ # verify the size of the sequence here: faSize Primary_Assembly/unplaced_scaffolds/FASTA/unplaced.scaf.fa.gz # 1437513269 bases (71594132 N's 1365919137 real 1365919137 upper 0 lower) # in 7727 sequences in 1 files # Total size: mean 186037.7 sd 4782867.2 min 1002 (gi|431812709|gb|KB029368.1|) # max 215906545 (gi|431820428|gb|KB021649.1|) median 4920 ########################################################################## # fixup names for UCSC standards (DONE - 2013-03-25 - Hiram) cd /hive/data/genomes/xenTro7 
$HOME/kent/src/hg/utils/automation/unplacedScaffolds.pl # constructs /hive/data/genomes/xenTro7/ucsc/ #-rw-rw-r-- 1 5760083 Mar 25 23:36 xenTro7.ucsc.agp #-rw-rw-r-- 1 404452385 Mar 25 23:42 xenTro7.ucsc.fa.gz #-rw-rw-r-- 1 212 Mar 25 23:43 checkAgp.result.txt # and not-needed here: cd /hive/data/genomes/xenTro7 #-rw-rw-r-- 1 359984712 Mar 25 23:43 xenTro7.unmasked.2bit rm -f xenTro7.unmasked.2bit ########################################################################## # Initial makeGenomeDb.pl (DONE - 2013-06-14 - Hiram) cd /hive/data/genomes/xenTro7 cat << '_EOF_' > xenTro7.config.ra # Config parameters for makeGenomeDb.pl: db xenTro7 clade vertebrate # genomeCladePriority 80 # this name doesn't work, the (parens) cause trouble everywhere # scientificName Xenopus (Silurana) tropicalis scientificName Xenopus tropicalis commonName X. tropicalis assemblyDate Sep. 2012 assemblyLabel US DOE Joint Genome Institute (JGI-PGF) assemblyShortLabel Xtropicalis v7 orderKey 4439 mitoAcc NC_006839 fastaFiles /cluster/data/xenTro7/ucsc/xenTro7.ucsc.fa.gz agpFiles /cluster/data/xenTro7/ucsc/xenTro7.ucsc.agp dbDbSpeciesDir xenTro photoCreditURL http://www.unc.edu/ photoCreditName UNC Chapel Hill, Chris Showell, all rights reserved ncbiGenomeId 80 ncbiAssemblyId 515038 ncbiAssemblyName Xtropicalis_v7 ncbiBioProject 12348 genBankAccessionID GCA_000004195.2 taxId 8364 '_EOF_' # << happy emacs # verify sequence and agp are OK time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -stop=agp xenTro7.config.ra > agp.log 2>&1 # verify no problem: tail -1 agp.log # *** All done! (through the 'agp' step) time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -continue=db xenTro7.config.ra > db.log 2>&1 # real 10m42.484s # failed due to species name not matching photo name. 
# temporarily set the name to "Xenopus tropicalis" and finish it: time makeGenomeDb.pl -workhorse=hgwdev -fileServer=hgwdev -dbHost=hgwdev \ -continue=trackDb xenTro7.config.ra > trackDb.log 2>&1 # # add the trackDb entries to the source tree, and the 2bit link: ln -s `pwd`/xenTro7.unmasked.2bit /gbdb/xenTro7/xenTro7.2bit # browser should function now in sandbox # trackDb files here: # /hive/data/genomes/xenTro7/TemporaryTrackDbCheckout/kent/src/hg/makeDb/trackDb/xenTro/xenTro7/ # into source tree # now browser should function on hgwdev user 0m0.061s sys 0m0.086s real 34m53.407s user 0m0.054s sys 0m0.067s [1]- Exit 255 time doRepeatMasker.pl -buildDir=`pwd` -noSplit -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek xenTro7 > do.log 2>&1 (wd: /hive/data/genomes/xenTro7/bed/repeatMasker) (wd now: /hive/data/genomes/xenTro7/bed/simpleRepeat) [2]+ Done time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek xenTro7 > do.log 2>&1 ######################################################################### # running repeat masker (DONE - 2013-06-20,21 - Hiram) # needed new version of RM to get this to work. The "official" # NCBI taxonomy name is "Xenopus (Silurana) tropicalis" with the (parens) # causes nothing but trouble. 
mkdir /hive/data/genomes/xenTro7/bed/repeatMasker cd /hive/data/genomes/xenTro7/bed/repeatMasker time doRepeatMasker.pl -buildDir=`pwd` -noSplit \ -species "Xenopus tropicalis" -bigClusterHub=swarm -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek xenTro7 > do.log 2>&1 & # real 34m46.351s time doRepeatMasker.pl -buildDir=`pwd` \ -species "Xenopus tropicalis" -bigClusterHub=swarm \ -continue=cat -dbHost=hgwdev -workhorse=hgwdev \ -smallClusterHub=encodek xenTro7 > cat.log 2>&1 & # real 23m12.039s cat faSize.rmsk.txt # 1437530879 bases (71594132 N's 1365936747 real 902757128 # upper 463179619 lower) in 7728 sequences in 1 files # Total size: mean 186015.9 sd 4782558.0 min 1002 (KB029368) # max 215906545 (KB021649) median 4921 # %32.22 masked total, %33.91 masked real egrep -i "versi|relea" do.log # RepeatMasker version open-4.0.3 # June 20 2013 (open-4-0-3) version of RepeatMasker # CC RELEASE 20130422; time featureBits -countGaps xenTro7 rmsk # 464012349 bases of 1437530879 (32.278%) in intersection # real 0m16.657s # why is it different than the faSize above ? # because rmsk masks out some N's as well as bases, the faSize count above # separates out the N's from the bases, it doesn't show lower case N's ########################################################################## # running simple repeat (DONE - 2013-06-14 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/simpleRepeat cd /hive/data/genomes/xenTro7/bed/simpleRepeat time doSimpleRepeat.pl -buildDir=`pwd` -bigClusterHub=swarm \ -dbHost=hgwdev -workhorse=hgwdev -smallClusterHub=encodek \ xenTro7 > do.log 2>&1 # real 23m24.022s cat fb.simpleRepeat # 117049533 bases of 1365936747 (8.569%) in intersection # considering rmsk %32 vs. 
WM %39, rmsk is good enough and like to # use the rmsk result in order to have the classifications from # that available # add to rmsk after it is done: cd /hive/data/genomes/xenTro7 twoBitMask xenTro7.rmsk.2bit \ -add bed/simpleRepeat/trfMask.bed xenTro7.2bit # you can safely ignore the warning about fields >= 13 twoBitToFa xenTro7.2bit stdout | faSize stdin > faSize.xenTro7.2bit.txt cat faSize.xenTro7.2bit.txt # 1437530879 bases (71594132 N's 1365936747 real 901765669 # upper 464171078 lower) in 7728 sequences in 1 files # Total size: mean 186015.9 sd 4782558.0 min 1002 (KB029368) # max 215906545 (KB021649) median 4921 # %32.29 masked total, %33.98 masked real rm /gbdb/xenTro7/xenTro7.2bit ln -s `pwd`/xenTro7.2bit /gbdb/xenTro7/xenTro7.2bit ########################################################################## # CREATE MICROSAT TRACK (DONE - 2015-06-22 - Hiram) ssh hgwdev mkdir /cluster/data/xenTro7/bed/microsat cd /cluster/data/xenTro7/bed/microsat awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \ ../simpleRepeat/simpleRepeat.bed > microsat.bed hgLoadBed xenTro7 microsat microsat.bed # Read 13163 elements of size 4 from microsat.bed ######################################################################### # Verify all gaps are marked, add any N's not in gap as type 'other' # (DONE - 2013-06-14 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/gap cd /hive/data/genomes/xenTro7/bed/gap time nice findMotif -motif=gattaca -verbose=4 \ -strand=+ ../../xenTro7.unmasked.2bit > findMotif.txt 2>&1 # real 1m2.760s grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed time featureBits xenTro7 -not gap -bed=notGap.bed # 1365936747 bases of 1365936747 (100.000%) in intersection # real 0m11.365s awk '{print $3-$2,$0}' notGap.bed | sort -rn > notGap.sizes.txt # largest contiguous sequence: head -1 notGap.sizes.txt | awk '{print $1}' # 671191 # minimal coverage 1 base out of that largest sequence: echo 
671191 | awk '{printf "%15.10f\n", 1/(2*$1)}' | sed -e 's/ //g' # 0.0000007449 time bedIntersect -minCoverage=0.0000007449 allGaps.bed notGap.bed \ test.new.gaps.bed # real 0m0.546s # no new gaps: # -rw-rw-r-- 1 0 Jun 14 19:16 test.new.gaps.bed # if there were gaps, this is the number of bases in these new gaps: awk '{print $3-$2}' test.new.gaps.bed | ave stdin | grep total # total 8314.000000 # 0 bases of 1222864691 (0.000%) in intersection # real 19m53.371s # there are *no* non-bridged gaps here, lift file not needed for genbank hgsql -N -e "select bridge from gap;" xenTro7 | sort | uniq -c # 47422 yes ######################################################################### # cytoBandIdeo - (DONE - 2013-06-14 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/cytoBand cd /hive/data/genomes/xenTro7/bed/cytoBand makeCytoBandIdeo.csh xenTro7 ########################################################################## ## WINDOWMASKER (DONE- 2013-06-14 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/windowMasker cd /hive/data/genomes/xenTro7/bed/windowMasker time nice -n +19 doWindowMasker.pl -buildDir=`pwd` -workhorse=hgwdev \ -dbHost=hgwdev xenTro7 > do.log 2>&1 & # real 94m50.291s # Masking statistics faSize.xenTro7.cleanWMSdust.txt # 1437530879 bases (71594132 N's 1365936747 real 826364480 upper 539572267 lower) in 7728 sequences in 1 files # Total size: mean 186015.9 sd 4782558.0 min 1002 (KB029368) max 215906545 (KB021649) median 4921 # %37.53 masked total, %39.50 masked real # how much does this window masker and repeat masker overlap: featureBits -countGaps xenTro7 rmsk windowmaskerSdust \ > fb.xenTro7.rmsk.windowmaskerSdust.txt 2>&1 # 360602924 bases of 1437530879 (25.085%) in intersection ######################################################################## # cpgIslands - (DONE - 2013-06-26 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/cpgIslands cd /hive/data/genomes/xenTro7/bed/cpgIslands time doCpgIslands.pl xenTro7 > do.log 2>&1 # real 15m32.332s cat 
fb.xenTro7.cpgIslandExt.txt # 4641665 bases of 1365936747 (0.340%) in intersection ############################################################################## # cpgIslands on UNMASKED sequence (DONE - 2014-07-16 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/cpgIslandsUnmasked cd /hive/data/genomes/xenTro7/bed/cpgIslandsUnmasked time (doCpgIslands.pl -dbHost=hgwdev -bigClusterHub=ku -buildDir=`pwd` \ -tableName=cpgIslandExtUnmasked \ -maskedSeq=/hive/data/genomes/xenTro7/xenTro7.unmasked.2bit \ -workhorse=hgwdev -smallClusterHub=ku xenTro7) > do.log 2>&1 # real 13m17.561s cat fb.xenTro7.cpgIslandExtUnmasked.txt # 14815116 bases of 1365936747 (1.085%) in intersection ######################################################################### # genscan - (DONE - 2013-06-26 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/genscan cd /hive/data/genomes/xenTro7/bed/genscan time doGenscan.pl xenTro7 > do.log 2>&1 # real 105m56.579s cat fb.xenTro7.genscan.txt # 49337616 bases of 1365936747 (3.612%) in intersection cat fb.xenTro7.genscanSubopt.txt # 37929799 bases of 1365936747 (2.777%) in intersection ######################################################################### # MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2013-06-26 - Hiram) # Use -repMatch=500, based on size -- for human we use 1024 # use the "real" number from the faSize measurement, # hg19 is 2897316137, calculate the ratio factor for 1024: calc \( 1365936747 / 2897316137 \) \* 1024 # ( 1365936747 / 2897316137 ) * 1024 = 482.763759 # round up to 500 (xenTro3 used 500) cd /hive/data/genomes/xenTro7 time blat xenTro7.2bit /dev/null /dev/null -tileSize=11 \ -makeOoc=jkStuff/xenTro7.11.ooc -repMatch=500 # Wrote 31229 overused 11-mers to jkStuff/xenTro7.11.ooc # real 0m28.626s # xenTro3 had: Wrote 29991 overused 11-mers to jkStuff/xenTro3.11.ooc # there are *no* non-bridged gaps, no lift file needed for genbank hgsql -N -e "select bridge from gap;" xenTro7 | sort | uniq -c # 47422 yes # cd 
/hive/data/genomes/xenTro7/jkStuff # gapToLift xenTro7 xenTro7.nonBridged.lift -bedFile=xenTro7.nonBridged.bed # largest non-bridged contig: # awk '{print $3-$2,$0}' xenTro7.nonBridged.bed | sort -nr | head # 56928224 chr5 4758199 61686423 chr5.07 ######################################################################### # AUTO UPDATE GENBANK (TBD - 2013-03-08 - Pauline) # examine the file: /cluster/data/genbank/data/organism.lst # for your species to see what counts it has for: # organism mrnaCnt estCnt refSeqCnt # Xenopus (Silurana) tropicalis 18847 1271481 8894 # to decide which "native" mrna or ests you want to specify in genbank.conf ssh hgwdev cd $HOME/kent/src/hg/makeDb/genbank git pull # edit etc/genbank.conf to add xenTro7 just after ce2 # xenTro7 'Xenopus (Silurana) tropicalis' 7728 scaffolds xenTro7.serverGenome = /hive/data/genomes/xenTro7/xenTro7.2bit xenTro7.clusterGenome = /hive/data/genomes/xenTro7/xenTro7.2bit xenTro7.ooc = /hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc xenTro7.lift = no xenTro7.refseq.mrna.native.pslCDnaFilter = ${ordered.refseq.mrna.native.pslCDnaFilter} xenTro7.refseq.mrna.xeno.pslCDnaFilter = ${ordered.refseq.mrna.xeno.pslCDnaFilter} xenTro7.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter} xenTro7.genbank.mrna.xeno.pslCDnaFilter = ${ordered.genbank.mrna.xeno.pslCDnaFilter} xenTro7.genbank.est.native.pslCDnaFilter = ${ordered.genbank.est.native.pslCDnaFilter} xenTro7.refseq.mrna.native.load = yes xenTro7.genbank.est.native.load = yes xenTro7.refseq.mrna.xeno.load = no xenTro7.genbank.mrna.xeno.load = no xenTro7.downloadDir = xenTro7 xenTro7.perChromTables = no xenTro7.mgc = yes # xenTro7.upstreamGeneTbl = ensGene # xenTro7.upstreamMaf = multiz9way # /hive/data/genomes/xenTro7/bed/multiz9way/species.list # end of section added to etc/genbank.conf git commit -m "adding xenTro7 Xenopus (Silurana) tropicalis refs #9868" etc/genbank.conf git push make etc-update ssh hgwdev # used to do this on 
"genbank" machine screen -S xenTro7 # long running job managed in screen cd /cluster/data/genbank time ./bin/gbAlignStep -initial xenTro7 & # logFile: var/build/logs/2015.12.01-12:35:26.xenTro7.initalign.log # real 53m58.627s # load database when finished ssh hgwdev cd /cluster/data/genbank time ./bin/gbDbLoadStep -drop -initialLoad xenTro7 & # logFile: var/dbload/hgwdev/logs/2015.12.01-16:31:08.xenTro7.dbload.log # real 25m13.913s # enable daily alignment and update of hgwdev (DONE - 2013-06-30 - Hiram) cd ~/kent/src/hg/makeDb/genbank git pull # add xenTro7 to: vi etc/align.dbs etc/hgwdev.dbs git commit -m "Added xenTro7. refs #9868" etc/align.dbs etc/hgwdev.dbs git push make etc-update ######################################################################### # set default position same as xenTro3 (DONE - 2015-03-18 - Hiram) hgsql -e \ 'update dbDb set defaultPos="KB021661:77920643-77933995" where name="xenTro7";' \ hgcentraltest ######################################################################### # LIFTOVER TO xenTro7 (DONE - 2015-03-17 - Hiram) # procedure outlined in xenTro3 ######################################################################### # ucscToINSDC table/track (DONE - 2015-03-18 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/ucscToINSDC cd /hive/data/genomes/xenTro7/bed/ucscToINSDC # check for chrM in assembly: grep chrM ../../xenTro7.agp # chrM 1 17610 4 F NC_006839 1 17610 + # use the accession name from there in this command (blank if none) ~/kent/src/hg/utils/automation/ucscToINSDC.sh \ ../../genbank/Primary_Assembly NC_006839 awk '{printf "%s\t0\t%d\n", $1,$2}' ../../chrom.sizes \ | sort > name.coordinate.tab # do NOT need the v1 on these names, wasn't used originally: sed --in-place -e 's/v1//' ucscToINSDC.txt join name.coordinate.tab ucscToINSDC.txt | tr '[ ]' '[\t]' \ > ucscToINSDC.bed # should all be the same line count: wc -l * # 7728 name.coordinate.tab # 7728 ucscToINSDC.bed # 7728 ucscToINSDC.txt cut -f1 ucscToINSDC.bed | awk 
'{print length($0)}' | sort -n | tail -1 # 8 # use the 8 in this sed sed -e "s/21/8/" $HOME/kent/src/hg/lib/ucscToINSDC.sql \ | hgLoadSqlTab xenTro7 ucscToINSDC stdin ucscToINSDC.bed checkTableCoords xenTro7 # should cover 100% entirely: featureBits -countGaps xenTro7 ucscToINSDC # 1437530879 bases of 1437530879 (100.000%) in intersection ######################################################################### # fixup search rule for assembly track/gold table (DONE - 2014-05-01 - Hiram) hgsql -N -e "select frag from gold;" xenTro7 | sort -u \ > /tmp/xenTro7.frag.gold.txt export maxLen=`awk '{print length($0)}' /tmp/xenTro7.frag.gold.txt | sort -rn | head -1` echo "scan to column: $maxLen" export C=1 while [ $C -le $maxLen ]; do echo -n " $C: " awk '{ print substr($0,'$C',1) }' /tmp/xenTro7.frag.gold.txt | sort -u | xargs echo | sed -e 's/ //g' C=`echo $C | awk '{print $1+1}'` done 1: AN 2: AC 3: M_ 4: 0C 5: 0 6: 26 7: 08 8: 012345 9: 0123456789 10: 0123456789 11: 0123456789 12: 0123456789 13: . 14: 1 # verify this rule will find them all or eliminate them all: hgsql -N -e "select frag from gold;" xenTro7 | wc -l # 55150 hgsql -N -e "select frag from gold;" xenTro7 \ | egrep -e '[AN][AC][M_][C0]0[0-9]+(\.1)?' | wc -l # 55150 hgsql -N -e "select frag from gold;" xenTro7 \ | egrep -v -e '[AN][AC][M_][C0]0[0-9]+(\.1)?' | wc -l # 0 # hence, add to trackDb/xenTro/xenTro7/trackDb.ra searchTable gold shortCircuit 1 termRegex [AN][AC][M_][C0]0[0-9]+(\.1)? 
query select chrom,chromStart,chromEnd,frag from %s where frag like '%s%%' searchPriority 8 ############################################################################ # BLATSERVERS ENTRY (DONE - 2015-03-18 - Hiram) # After getting a blat server assigned by the Blat Server Gods, ssh hgwdev hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("xenTro7", "blat4b", "17856", "1", "0"); \ INSERT INTO blatServers (db, host, port, isTrans, canPcr) \ VALUES ("xenTro7", "blat4b", "17857", "0", "1");' \ hgcentraltest # test it with some sequence ############################################################################ # downloads and pushQ entry (DONE - 2015-03-18 - Hiram) # after adding xenTro7 to the all.joiner file and verifying that # joinerCheck is clean (i.e. run joinerCheck w -times and -keys flags # to make sure there are no errors), can construct the downloads: cd /hive/data/genomes/xenTro7 time makeDownloads.pl -workhorse=hgwdev xenTro7 > downloads.log 2>&1 # real 25m29.328s mkdir /hive/data/genomes/xenTro7/pushQ cd /hive/data/genomes/xenTro7/pushQ # do not allow transMap to go out time makePushQSql.pl xenTro7 2> stderr.txt \ | grep -v transMap > xenTro7.pushQ.sql # real 6m59.942s # check the stderr.txt for bad stuff, these kinds of warnings are OK: # WARNING: hgwdev does not have /gbdb/xenTro7/wib/gc5Base.wib # WARNING: hgwdev does not have /gbdb/xenTro7/wib/quality.wib # WARNING: hgwdev does not have /gbdb/xenTro7/bbi/gc5BaseBw/gc5Base.bw # WARNING: hgwdev does not have /gbdb/xenTro7/bbi/qualityBw/quality.bw # WARNING: xenTro7 does not have seq # WARNING: xenTro7 does not have extFile # copy it to hgwbeta scp -p xenTro7.pushQ.sql qateam@hgwbeta:/tmp ssh qateam@hgwbeta "./bin/x86_64/hgsql qapushq < /tmp/xenTro7.pushQ.sql" # in that pushQ entry walk through each entry and see if the # sizes will set properly ############################################################################ # SWAP hg38/Human chain/net (DONE - 2015-02-20 - 
Hiram) # original alignment cd /hive/data/genomes/hg38/bed/lastzXenTro7.2015-02-18 cat fb.hg38.chainXenTro7Link.txt # 116213822 bases of 3049335806 (3.811%) in intersection # and for the swap: mkdir /hive/data/genomes/xenTro7/bed/blastz.hg38.swap cd /hive/data/genomes/xenTro7/bed/blastz.hg38.swap time (doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg38/bed/lastzXenTro7.2015-02-18/DEF \ -swap -chainMinScore=5000 -chainLinearGap=loose \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 53m28.988s cat fb.xenTro7.chainHg38Link.txt # 108823737 bases of 1365936747 (7.967%) in intersection time (doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` xenTro7 hg38) \ > rbest.log 2>&1 # real 16m4.622s ############################################################################ # SWAP hg19/Human chain/net (TBD - 2013-08-29 - Hiram) # original alignment cd /hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28 cat fb.hg19.chainXenTro7Link.txt # 91350514 bases of 2897316137 (3.153%) in intersection # and for the swap mkdir /hive/data/genomes/xenTro7/bed/blastz.hg19.swap cd /hive/data/genomes/xenTro7/bed/blastz.hg19.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ /hive/data/genomes/hg19/bed/lastzXenTro7.2013-08-28/DEF \ -workhorse=hgwdev -smallClusterHub=ku \ -bigClusterHub=ku \ -swap -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 # real 62m38.163s cat fb.xenTro7.chainHg19Link.txt # 92294714 bases of 1365936747 (6.757%) in intersection # set sym link to indicate this is the lastz for this genome: cd /hive/data/genomes/xenTro7/bed ln -s blastz.hg19.swap lastz.hg19 ############################################################################## # TransMap V3 tracks. 
see makeDb/doc/transMapTracks.txt (2014-12-21 markd) ############################################################################## # LIFTOVER TO xenTro2 (DONE - 2015-03-20 - Hiram) ssh hgwdev mkdir /hive/data/genomes/xenTro7/bed/blat.xenTro2.2015-03-20 cd /hive/data/genomes/xenTro7/bed/blat.xenTro2.2015-03-20 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ xenTro7 xenTro2) > do.log 2>&1 # real 181m55.750s # verify the convert link on the test browser is now active from xenTro7 to # xenTro2 ######################################################################### # LIFTOVER TO xenTro3 (DONE - 2015-03-24 - Hiram) ssh hgwdev mkdir /hive/data/genomes/xenTro7/bed/blat.xenTro3.2015-03-24 cd /hive/data/genomes/xenTro7/bed/blat.xenTro3.2015-03-24 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ xenTro7 xenTro3) > do.log 2>&1 # real 108m40.174s # verify the convert link on the test browser is now active from xenTro7 to # xenTro3 ############################################################################## # LIFTOVER TO xenTro9 (DONE - 2017-03-28 - Hiram) ssh hgwdev mkdir /hive/data/genomes/xenTro7/bed/blat.xenTro9.2017-03-28 cd /hive/data/genomes/xenTro7/bed/blat.xenTro9.2017-03-28 time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ -ooc=/hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc \ -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ xenTro7 xenTro9) > do.log 2>&1 # real 488m42.323s # verify the convert link on the test browser is now active from xenTro7 to # xenTro9 ######################################################################### +# LIFTOVER TO xenTro10 (DONE - 2021-02-23 - Hiram) + ssh hgwdev + mkdir /hive/data/genomes/xenTro7/bed/blat.xenTro10.2021-02-23 + cd 
/hive/data/genomes/xenTro7/bed/blat.xenTro10.2021-02-23 + doSameSpeciesLiftOver.pl -debug -verbose=2 -buildDir=`pwd` \ + -ooc=/hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + xenTro7 xenTro10 + time (doSameSpeciesLiftOver.pl -verbose=2 -buildDir=`pwd` \ + -ooc=/hive/data/genomes/xenTro7/jkStuff/xenTro7.11.ooc \ + -bigClusterHub=ku -dbHost=hgwdev -workhorse=hgwdev \ + xenTro7 xenTro10) > do.log 2>&1 + # real 474m55.654s + + # verify the convert link on the test browser is now active from xenTro7 to + # xenTro10 + +######################################################################### # Tibetan frog/nanPar1 Lastz run (DONE - 2015-03-24 - Hiram) screen -S nanPar1 # use screen to manage this long running job mkdir -p /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24/tuning cd /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24/tuning hgsql -N -e 'select * from genscan;' nanPar1 | cut -f2- \ | sort > nanPar1.genes.gp hgsql -N -e 'select * from genscan;' xenTro7 | cut -f2- \ | sort > xenTro7.genes.gp getRnaPred -peptides -genomeSeqs=/hive/data/genomes/xenTro7/xenTro7.2bit \ xenTro7 xenTro7.genes.gp all xenTro7.genes.pep getRnaPred -peptides -genomeSeqs=/hive/data/genomes/nanPar1/nanPar1.2bit \ nanPar1 nanPar1.genes.gp all nanPar1.genes.pep time (blat -prot -oneOff=1 xenTro7.genes.pep nanPar1.genes.pep \ -out=maf xenTro7.nanPar1.oneOff.maf) > blat.log 2>&1 cat << '_EOF_' > DEF # human vs sperm whale # TARGET: Human Hg19 SEQ1_DIR=/scratch/data/xenTro7/xenTro7.2bit SEQ1_LEN=/scratch/data/xenTro7/chrom.sizes SEQ1_CHUNK=10000000 SEQ1_LAP=10000 # QUERY: sperm whale PhyCat1 SEQ2_DIR=/hive/data/genomes/nanPar1/nanPar1.2bit SEQ2_LEN=/hive/data/genomes/nanPar1/chrom.sizes SEQ2_CHUNK=20000000 SEQ2_LAP=0 SEQ2_LIMIT=50 BASE=/hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24 TMPDIR=/dev/shm '_EOF_' # << emacs time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -chainMinScore=3000 -chainLinearGap=medium 
\ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet > do.log 2>&1 # real 779m50.178s # forgot to load up nanPar1 database for net repeat classification # finish load step manually, then: cat fb.xenTro7.chainPhyCat1Link.txt # 1521042352 bases of 2897316137 (52.498%) in intersection time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \ -continue=download -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet > download.log 2>&1 # real 32m10.340s # create symLink to indicate this is the version to use cd /hive/data/genomes/xenTro7/bed ln -s lastzPhyCat1.2014-03-24 lastz.nanPar1 cd /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24 # filter with doRecipBest.pl time doRecipBest.pl -workhorse=hgwdev -buildDir=`pwd` \ xenTro7 nanPar1 > rbest.log 2>&1 & # real 59m7.123s # running the swap mkdir /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap cd /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap time nice -n +19 doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24/DEF \ -chainMinScore=3000 -chainLinearGap=medium \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet > swap.log 2>&1 # real 102m56.234s cat fb.nanPar1.chainHg19Link.txt # 1455933862 bases of 2233689186 (65.181%) in intersection cd /hive/data/genomes/nanPar1/bed ln -s blastz.xenTro7.swap lastz.xenTro7 ######################################################################### # EXPERIMENT - does default lastz parameters perform as well as the tuned # Tibetan frog/nanPar1 Lastz run (DONE - 2015-03-24 - Hiram) # the no-tuned parameters produced more coverage screen -S nanPar1 # use screen to manage this long running job mkdir -p /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24/tuning cd /hive/data/genomes/xenTro7/bed/lastzPhyCat1.2014-03-24/tuning hgsql -N -e 'select * from genscan;' nanPar1 | cut -f2- \ | sort > nanPar1.genes.gp hgsql -N -e 'select * from 
genscan;' xenTro7 | cut -f2- \ | sort > xenTro7.genes.gp getRnaPred -peptides -genomeSeqs=/hive/data/genomes/xenTro7/xenTro7.2bit \ xenTro7 xenTro7.genes.gp all xenTro7.genes.pep getRnaPred -peptides -genomeSeqs=/hive/data/genomes/nanPar1/nanPar1.2bit \ nanPar1 nanPar1.genes.gp all nanPar1.genes.pep time (blat -prot -oneOff=1 xenTro7.genes.pep nanPar1.genes.pep \ -out=maf xenTro7.nanPar1.oneOff.maf) > blat.log 2>&1 # Loaded 16075148 letters in 35298 sequences # Searched 17887635 bases in 47726 sequences # real 171m14.106s ~/kent/src/hg/utils/automation/lastz_D/mafScoreSizeScan.pl \ xenTro7.nanPar1.oneOff.maf > mafScoreSizeScan.list ave mafScoreSizeScan.list | grep "^Q3" | awk '{print $2}' \ | sed -e 's/.000000//' > mafScoreSizeScan.Q3 time ~/kent/src/hg/utils/automation/lastz_D/topAll.sh xenTro7 nanPar1 # scan the four results to see if they are similar ~/kent/src/hg/utils/automation/lastz_D/matrixSummary.pl | sed -e 's/^/# /;' # read 4 .txt files tuning # A C G T averages 4 files tuning # A 100 -158 -84 -179 # C -158 72 -118 -84 # G -84 -118 72 -158 # T -179 -84 -158 100 # A C G T ranges 4 files tuning # A 0 14 2 6 # C 14 1 2 2 # G 2 2 1 14 # T 6 2 14 0 # A C G T ranges percent 4 files tuning # A 0.0 8.8 2.4 3.3 # C 8.8 1.4 1.7 2.4 # G 2.4 1.7 -0.8 8.8 # T 3.3 2.4 8.8 0.0 cat << '_EOF_' > DEF # X. tropicalis vs. Nanorana parkeri - Tibetan frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.54/bin/lastz # lastz defaults end up to be: # lastz.v1.03.54 H=2000 --format=axt+ # # hsp_threshold = 3000 # gapped_threshold = 3000 # x_drop = 910 # y_drop = 9400 # gap_open_penalty = 400 # gap_extend_penalty = 30 # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # seed=1110100110010101111 w/transition # step=1 ##matrix=axtChain 16 #91,-114,-31,-123,-114,100,-125,-31,-31,-125,100,-114,-123,-31,-114,91 ##gapPenalties=axtChain O=400 E=30 # TARGET: X. 
tropicalis xenTro7 SEQ1_DIR=/hive/data/genomes/xenTro7/xenTro7.2bit SEQ1_LEN=/hive/data/genomes/xenTro7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Nanorana parkeri - Tibetan frog - nanPar1 SEQ2_DIR=/hive/data/genomes/nanPar1/nanPar1.2bit SEQ2_LEN=/hive/data/genomes/nanPar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-03-24 TMPDIR=/dev/shm '_EOF_' # << emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -workhorse=hgwdev -smallClusterHub=ku \ -bigClusterHub=ku -syntenicNet) > do.log 2>&1 # real 155m16.539s cat fb.xenTro7.chainNanPar1Link.txt # 112202241 bases of 1365936747 (8.214%) in intersection # replacing results done with tuning: # 56705027 bases of 1365936747 (4.151%) in intersection time (doRecipBest.pl -buildDir=`pwd` xenTro7 nanPar1) > rbest.log 2>&1 & # running the swap mkdir /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap cd /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap time (doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-03-24/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 102m56.234s cat fb.nanPar1.chainXenTro7Link.txt # 1455933862 bases of 2233689186 (65.181%) in intersection time (doRecipBest.pl -buildDir=`pwd` nanPar1 xenTro7) > rbest.log 2>&1 ######################################################################### # Tibetan frog/nanPar1 Lastz run (DONE - 2015-06-10 - Hiram) mkdir /hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-06-10 cd /hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-06-10 cat << '_EOF_' > DEF # X. tropicalis vs. 
Nanorana parkeri - Tibetan frog BLASTZ=/cluster/bin/penn/lastz-distrib-1.03.66/bin/lastz # lastz defaults end up to be: # lastz.v1.03.54 H=2000 --format=axt+ # # hsp_threshold = 3000 # gapped_threshold = 3000 # x_drop = 910 # y_drop = 9400 # gap_open_penalty = 400 # gap_extend_penalty = 30 # A C G T # A 91 -114 -31 -123 # C -114 100 -125 -31 # G -31 -125 100 -114 # T -123 -31 -114 91 # seed=1110100110010101111 w/transition # step=1 ##matrix=axtChain 16 91,-114,-31,-123,-114,100,-125,-31,-31,-125,100,-114,-123,-31,-114,91 ##gapPenalties=axtChain O=400 E=30 # TARGET: X. tropicalis xenTro7 SEQ1_DIR=/hive/data/genomes/xenTro7/xenTro7.2bit SEQ1_LEN=/hive/data/genomes/xenTro7/chrom.sizes SEQ1_CHUNK=20000000 SEQ1_LAP=10000 # QUERY: Nanorana parkeri - Tibetan frog - nanPar1 SEQ2_DIR=/hive/data/genomes/nanPar1/nanPar1.2bit SEQ2_LEN=/hive/data/genomes/nanPar1/chrom.sizes SEQ2_CHUNK=10000000 SEQ2_LIMIT=200 SEQ2_LAP=0 BASE=/hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-06-10 TMPDIR=/dev/shm '_EOF_' # << emacs time (doBlastzChainNet.pl `pwd`/DEF -verbose=2 \ -workhorse=hgwdev -smallClusterHub=ku \ -bigClusterHub=ku -syntenicNet) > do.log 2>&1 # real 116m13.134s cat fb.xenTro7.chainNanPar1Link.txt # 112202241 bases of 1365936747 (8.214%) in intersection time (doRecipBest.pl -buildDir=`pwd` xenTro7 nanPar1) > rbest.log 2>&1 & # real 4m57.676s # running the swap mkdir /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap cd /hive/data/genomes/nanPar1/bed/blastz.xenTro7.swap time (doBlastzChainNet.pl -verbose=2 \ -swap /hive/data/genomes/xenTro7/bed/lastzNanPar1.2015-06-10/DEF \ -workhorse=hgwdev -smallClusterHub=ku -bigClusterHub=ku \ -syntenicNet) > swap.log 2>&1 # real 22m50.311s cat fb.nanPar1.chainXenTro7Link.txt # 121183837 bases of 1977771384 (6.127%) in intersection time (doRecipBest.pl -buildDir=`pwd` nanPar1 xenTro7) > rbest.log 2>&1 # real 6m53.584s #########################################################################