b3833e993f98922e22e1278e632716e729ec54ad
hiram
Fri Jul 19 15:24:34 2019 -0700
adding gapOverlap and tandemDups tracks to the asmHub build refs #23734
diff --git src/hg/utils/automation/asmHubTanDups.pl src/hg/utils/automation/asmHubTanDups.pl
new file mode 100755
index 0000000..642d5de
--- /dev/null
+++ src/hg/utils/automation/asmHubTanDups.pl
@@ -0,0 +1,155 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+use FindBin qw($Bin);
+use lib "$Bin";
+use AsmHub;
+use File::Basename;
+
+# Exactly three positional arguments are required; with any other count the
+# usage text is written to STDERR and the script exits with status 255.
+my $argc = @ARGV;
+
+unless ($argc == 3) {
+  print STDERR
+    "usage: asmHubTanDups.pl asmId asmId.names.tab .../trackData/\n",
+    "where asmId is the assembly identifier,\n",
+    "and .../trackData/ is the path to the /trackData/ directory.\n";
+  exit 255;
+}
+
+# Take the three positional arguments off @ARGV in usage order.
+my ($asmId, $namesFile, $trackDataDir) = splice(@ARGV, 0, 3);
+
+# bigBed files for the two tracks, located under the trackData directory.
+my $gapOverlapBbi = join("/", $trackDataDir, "gapOverlap", "$asmId.gapOverlap.bb");
+my $tandemDupsBbi = join("/", $trackDataDir, "tandemDups", "$asmId.tandemDups.bb");
+
+# Only one of the two files needs to exist with non-zero size; give up
+# when both are missing or empty.
+if ( !(-s $gapOverlapBbi) && !(-s $tandemDupsBbi) ) {
+  printf STDERR "ERROR: can not find gapOverlap or tandemDups bbi files:\n\t'%s'\n\t%s\n", $gapOverlapBbi, $tandemDupsBbi;
+  exit 255;
+}
+
+# Markers interpolated immediately before and after the organism name in
+# the description text.
+# NOTE(review): both are empty strings here, so they currently add nothing;
+# the names suggest emphasis markup was intended - confirm.
+my $em = "";
+my $noEm = "";
+
+# Take the assembly date (column 9), NCBI assembly id (column 10) and
+# organism name (column 5) from the first non-comment, non-blank line of
+# the tab-separated names file.  The file is read directly rather than
+# through three 'grep | cut' shell pipelines, so the file name is never
+# interpolated into a shell command, an unreadable file is reported instead
+# of silently producing empty values, and additional data lines can not
+# leave embedded newlines in the values.  Missing columns yield "".
+my ($assemblyDate, $ncbiAssemblyId, $organism) = ("", "", "");
+{
+  my $namesFh;
+  if (! open($namesFh, "<", $namesFile)) {
+    printf STDERR "ERROR: can not read names file '%s': %s\n", $namesFile, $!;
+    exit 255;
+  }
+  while (my $line = <$namesFh>) {
+    next if ($line =~ m/^#/);   # skip comment lines
+    chomp $line;
+    next if ($line eq "");      # skip blank lines
+    # limit -1 keeps trailing empty columns so the indexes stay aligned
+    my @fields = split(/\t/, $line, -1);
+    ($organism, $assemblyDate, $ncbiAssemblyId) =
+      map { defined($_) ? $_ : "" } @fields[4, 8, 9];
+    last;
+  }
+  close($namesFh);
+}
+
+my $gapOverlapItemCount = "
+This track indicates any pair of exactly identical sequences
+for the $assemblyDate $em${organism}$noEm/$asmId genome assembly.
+
+There may be two tracks in this composite collection:
+Description
+
+
+The Gap Overlaps track is thus a subset of the full Tandem Dups track.
+
+This investigation began when an unusual number of paired sequences
+around gaps was noticed during the mouse strain sequencing project.
+This naturally raised the question: how common is this feature, and in what
+types of assemblies can it be found?
+
+ +
+The Gap Overlaps track indicates any pair of exactly identical sequences
+on each side of a gap, where a gap is any run of N's, including
+a single N. The end of the upstream sequence before the gap
+is duplicated exactly at the beginning of the downstream sequence
+following the gap in the assembly.
Data in track: $gapOverlapItemCount.
+
+The Tandem Dups track is a similar survey over the entire genome
+assembly. The separation gap between these paired sequences
+can range from 1 base up to 20,000 bases.
Data in track: $tandemDupsItemCount.
+
+The Gap Overlap duplicate sequences were found by
+extracting 1,000 bases before and after each gap and aligning them to
+each other with the blat command:
+
+ blat -q=dna -minIdentity=95 -repMatch=10 upstreamContig.fa downstreamContig.fa
+
+The PSL output is then filtered for perfect matches (no mis-matches,
+and therefore matching sequences of equal size)
+where the alignment ends exactly at the end of the upstream sequence,
+and begins exactly at the start of the downstream sequence.
+
+
+The Tandem Dups paired sequences were found with the following procedure: +
+The procedure starts with 29 base sized pairs and then selects
+results of at least 30 bases because this yields a reasonable
+number of 30 base pairs. If the procedure starts with 30 base
+sized pairs, it produces far too many 30 base kmer pairs for
+a reasonable count.
+
++
+Thank you to Joel Armstrong and Benedict Paten of the + +Computational Genomics Lab +at the + +U.C. Santa Cruz Genomics Institute +for identifying this characteristic of genome assemblies. +
+ +
The data and presentation of this track were prepared by +Hiram Clawson, + +U.C. Santa Cruz Genomics Institute +
+_EOF_ + ;