#!/usr/bin/env perl

# asmHubTanDups.pl - print to stdout the description page text for the
# Gap Overlaps / Tandem Dups tracks of one assembly.
#
# usage: asmHubTanDups.pl asmId asmId.names.tab .../trackData/
#
# Kent binaries are run by explicit /cluster/bin/ path names so this script
# can function in cron jobs (refs #31811).

use strict;
use warnings;
use FindBin qw($Bin);
use lib "$Bin";
use AsmHub;
use File::Basename;

# absolute path of the kent bigBedInfo command, see note above about cron
my $bigBedInfo = "/cluster/bin/x86_64/bigBedInfo";

# Return one string for each requested 1-based, tab-separated column of the
# names file.  Lines beginning with '#' are skipped.  The file is read once
# for all columns and no shell is involved, so unusual characters in the
# path are harmless.  Values are picked the way cut(1) does it: a line
# without any tab is used whole, a column past the end of a line is empty,
# and when there are several data lines their values are joined by newlines.
# An unreadable file is reported on STDERR and gives empty strings.
sub namesColumns {
  my ($file, @columns) = @_;
  my @dataLines;
  if (open(my $fh, '<', $file)) {
    while (my $line = <$fh>) {
      chomp $line;
      push @dataLines, $line if ($line !~ m/^#/);
    }
    close($fh);
  } else {
    printf STDERR "WARNING: can not read names file '%s': %s\n", $file, $!;
  }
  my @values;
  foreach my $column (@columns) {
    my @picked;
    foreach my $line (@dataLines) {
      my $field = $line;
      if ($line =~ m/\t/) {
        $field = (split(/\t/, $line, -1))[$column - 1];
        $field = "" if (! defined($field));
      }
      push @picked, $field;
    }
    push @values, join("\n", @picked);
  }
  return @values;
}

# Run bigBedInfo on the given bigBed file and return a one line summary
# built from its 'itemCount:' and 'basesCovered:' output lines, in the form:
#   Item count: N; Bases covered: M
# The command is started without a shell (list form pipe open).  When it
# can not be started, that is reported on STDERR and the summary is empty.
sub bbiSummary {
  my ($bbi) = @_;
  my $fh;
  if (! open($fh, '-|', $bigBedInfo, $bbi)) {
    printf STDERR "WARNING: can not run %s on '%s': %s\n", $bigBedInfo, $bbi, $!;
    return "";
  }
  # keep only the two lines of interest and flatten them into single
  # space separated words on one line
  my @words = map { split(' ', $_) } grep { m/itemCount:|basesCovered:/ } <$fh>;
  close($fh);
  my $summary = join(' ', @words);
  $summary =~ s/itemCount/Item count/;
  $summary =~ s/ basesCovered/; Bases covered/;
  return $summary;
}

my $argc = scalar(@ARGV);

if ($argc != 3) {
  printf STDERR "usage: asmHubTanDups.pl asmId asmId.names.tab .../trackData/\n";
  printf STDERR "where asmId is the assembly identifier,\n";
  printf STDERR "and .../trackData/ is the path to the /trackData/ directory.\n";
  exit 255;
}

my $asmId = shift;
my $namesFile = shift;
my $trackDataDir = shift;
my $gapOverlapBbi = "$trackDataDir/gapOverlap/$asmId.gapOverlap.bb";
my $tandemDupsBbi = "$trackDataDir/tandemDups/$asmId.tandemDups.bb";

# at least one of the two bigBed files must exist with non-zero size
if ( ! (-s $gapOverlapBbi || -s $tandemDupsBbi) ) {
  printf STDERR "ERROR: can not find gapOverlap or tandemDups bbi files:\n\t'%s'\n\t%s\n", $gapOverlapBbi, $tandemDupsBbi;
  exit 255;
}

# text placed before and after the organism name in the page text
my $em = "";
my $noEm = "";

# names file columns: 9 assembly date, 10 NCBI assembly id, 5 organism
my ($assemblyDate, $ncbiAssemblyId, $organism) = namesColumns($namesFile, 9, 10, 5);

# item count summaries stay empty for a bigBed file that is not present
my $gapOverlapItemCount = "";
my $tandemDupsItemCount = "";

if ( -s $gapOverlapBbi ) {
  $gapOverlapItemCount = bbiSummary($gapOverlapBbi);
}
if ( -s $tandemDupsBbi ) {
  $tandemDupsItemCount = bbiSummary($tandemDupsBbi);
}

print <<_EOF_

Description

This track indicates pairs of exactly identical sequences in the $assemblyDate $em${organism}$noEm/$asmId genome assembly.

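There may be two tracks in this composite collection: Gap Overlaps, paired exactly identical sequences on each side of a gap, and Tandem Dups, paired exactly identical sequences surveyed over the entire genome assembly.

The Gap Overlaps track is thus a subset of the full Tandem Dups track.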

This investigation began when an unusual number of paired sequences around gaps was noticed during the mouse strain sequencing project. This naturally raised the questions of how common this feature is and in what types of assemblies it can be found.

The Gap Overlaps track indicates pairs of exactly identical sequences on each side of a gap, where a gap is any run of N's, including a single N. The end of the upstream sequence before the gap is duplicated exactly at the beginning of the downstream sequence following the gap in the assembly.
Data in track: $gapOverlapItemCount.

The Tandem Dups track is a similar survey over the entire genome assembly. The separation gap between these paired sequences can range from 1 base up to 20,000 bases.
Data in track: $tandemDupsItemCount.

Methods

The Gap Overlap duplicate sequences were found by extracting 1,000 bases before and after each gap and aligning them to each other with the blat command:

   blat -q=dna -minIdentity=95 -repMatch=10 upstreamContig.fa downstreamContig.fa
 
The PSL output was then filtered for perfect matches: no mismatches, and therefore matching sequences of equal size, where the alignment ends exactly at the end of the upstream sequence and begins exactly at the start of the downstream sequence.

The Tandem Dups paired sequences were found with the following procedure:

The procedure starts with 29 base sized pairs and then selects only results of at least 30 bases because this yields a reasonable number of 30 base pairs. If the procedure starts with 30 base sized pairs, it produces far too many 30 base kmer pairs for a reasonable count.

See Also

Interactive tables of all results:

Credits

Thank you to Joel Armstrong and Benedict Paten of the Computational Genomics Lab at the U.C. Santa Cruz Genomics Institute for identifying this characteristic of genome assemblies.

The data and presentation of this track were prepared by Hiram Clawson, U.C. Santa Cruz Genomics Institute.

_EOF_ ;