#!/usr/bin/env perl
#
# asmHubTanDups.pl - print the description page text for the gapOverlap and
# tandemDups tracks of an assembly hub to standard output.
#
# Provenance (from the commit record this script was delivered with):
#   commit b3833e993f98922e22e1278e632716e729ec54ad
#   hiram, Fri Jul 19 15:24:34 2019 -0700
#   "adding gapOverlap and tandemDups tracks to the asmHub build refs #23734"
#   new file, mode 100755: src/hg/utils/automation/asmHubTanDups.pl
#
# usage: asmHubTanDups.pl asmId asmId.names.tab .../trackData/
#
# Exits 255 on a usage error, when neither bigBed file is present with a
# non-zero size, or when the names file cannot be opened.

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;

# Return the tab-separated fields of the first non-comment, non-blank line
# of the names file as a list (empty list when there is no such line).
# The file is read directly instead of through a shell 'grep | cut'
# pipeline, so unusual characters in the path cannot change the command.
# NOTE(review): the shell version returned one value per data line when the
# file held several; this assumes a single data line -- confirm with caller.
sub namesRow {
    my ($file) = @_;
    my $fh;
    if (! open($fh, '<', $file)) {
        printf STDERR "ERROR: can not read names file '%s': %s\n", $file, $!;
        exit 255;
    }
    my @row;
    while (my $line = <$fh>) {
        next if ($line =~ m/^#/);
        chomp $line;
        next if ($line eq "");
        @row = split(/\t/, $line, -1);   # -1 keeps trailing empty fields
        last;
    }
    close($fh);
    return @row;
}

# Return a one-line summary such as "Item count: N; Bases covered: M" built
# from the 'itemCount:' and 'basesCovered:' lines printed by bigBedInfo for
# the given file.  Returns "" when the file is missing or empty, or when
# bigBedInfo cannot be run (a warning goes to STDERR in that last case).
sub bbiSummary {
    my ($bbi) = @_;
    return "" if (! -s $bbi);
    # List-form pipe open: the path is handed to bigBedInfo as a single
    # argument and never passes through a shell.
    my $pid = open(my $fh, '-|', 'bigBedInfo', $bbi);
    if (! $pid) {
        printf STDERR "WARNING: can not run bigBedInfo on '%s': %s\n", $bbi, $!;
        return "";
    }
    my @words;
    while (my $line = <$fh>) {
        next if ($line !~ m/itemCount:|basesCovered:/);
        # split ' ' drops leading blanks and the newline, so the matching
        # lines end up joined by single spaces, as 'xargs echo' did.
        push @words, split(' ', $line);
    }
    close($fh)
        or printf STDERR "WARNING: bigBedInfo did not exit cleanly for '%s'\n", $bbi;
    my $summary = join(' ', @words);
    # Same two first-occurrence rewrites the sed expression applied.
    $summary =~ s/itemCount/Item count/;
    $summary =~ s/ basesCovered/; Bases covered/;
    return $summary;
}

my $argc = scalar(@ARGV);

if ($argc != 3) {
    printf STDERR "usage: asmHubTanDups.pl asmId asmId.names.tab .../trackData/\n";
    printf STDERR "where asmId is the assembly identifier,\n";
    printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
    exit 255;
}

my $asmId = shift;
my $namesFile = shift;
my $trackDataDir = shift;
my $gapOverlapBbi = "$trackDataDir/gapOverlap/$asmId.gapOverlap.bb";
my $tandemDupsBbi = "$trackDataDir/tandemDups/$asmId.tandemDups.bb";

# At least one of the two bigBed files must exist with a non-zero size.
if ( ! (-s $gapOverlapBbi || -s $tandemDupsBbi) ) {
    printf STDERR "ERROR: can not find gapOverlap or tandemDups bbi files:\n\t'%s'\n\t'%s'\n", $gapOverlapBbi, $tandemDupsBbi;
    exit 255;
}

# Strings placed immediately before and after the organism name in the page.
# NOTE(review): both are empty in the reviewed copy; the names suggest they
# were meant to hold emphasis markup -- confirm against the repository copy.
my $em = "";
my $noEm = "";

# Columns 9, 10 and 5 (1-based) of the names file data line; a column that
# is absent becomes the empty string.
my @names = namesRow($namesFile);
my ($assemblyDate, $ncbiAssemblyId, $organism) =
    map { defined($_) ? $_ : "" } @names[8, 9, 4];
# NOTE(review): $ncbiAssemblyId is not used by the page text visible below;
# kept in case text missing from the reviewed copy refers to it.

# Each summary stays "" when the corresponding bigBed file is absent.
my $gapOverlapItemCount = bbiSummary($gapOverlapBbi);
my $tandemDupsItemCount = bbiSummary($tandemDupsBbi);

# NOTE(review): the page text below carries no markup in the reviewed copy,
# and three sentences ending in ':' ("composite collection:", "following
# procedure:", "all results:") are followed by nothing.  The wording is
# reproduced as found; confirm against the repository copy.
print <<"_EOF_";
Description

This track indicates any pair of exactly identical sequence
for the $assemblyDate $em${organism}$noEm/$asmId genome assembly.

There may be two tracks in this composite collection:

The Gap Overlaps is thus a subset of the full Tandem Dups track.

This investigation began when an unusual number of paired sequences
around gaps was noticed during the mouse strain sequencing project.
This naturally raised the question, how common is this feature, and what
type of assemblies can it be found in.

The Gap Overlaps track indicates any pair of exactly identical sequence
on each side of gaps. Where a gap is any run of N's, including
a single N. The end of an upstream sequence before the gap
is duplicated exactly at the beginning of the downstream sequence
following the gap in the assembly.
Data in track: $gapOverlapItemCount.

The Tandem Dups track is a similar survey over the entire genome
assembly. The separation gap between these paired sequences
can range from 1 base up to 20,000 bases.
Data in track: $tandemDupsItemCount.

Methods

The Gap Overlap duplicate sequences were found by
extracting 1,000 bases before and after each gap and aligned to
each other with the blat command:

  blat -q=dna -minIdentity=95 -repMatch=10 upstreamContig.fa downstreamContig.fa

Filtering the PSL output for a perfect match, no mis-matches,
and therefore of equal size matching sequence,
where the alignment ends exactly at the end of the upstream sequence,
and begins exactly at the start of the downstream sequence.

The Tandem Dups paired sequences were found with the following procedure:

The reason for starting with 29 base sized pairs and then selecting
results of at least 30 base sized pairs results in a reasonable
number of 30 base pairs. If the procedure starts with 30 base
sized pairs, it produces way too many 30 base kmer pairs for
a reasonable count.

See Also

Interactive tables of all results:

Credits

Thank you to Joel Armstrong and Benedict Paten of the
Computational Genomics Lab
at the
U.C. Santa Cruz Genomics Institute
for identifying this characteristic of genome assemblies.

The data and presentation of this track were prepared by
Hiram Clawson,
U.C. Santa Cruz Genomics Institute
_EOF_