4bd23630e7a8c9a909c69900c8b67b992a993e91 hiram Mon Jan 13 13:15:52 2020 -0800 two new categories of gap types to count, repeat and contamination refs #24748 diff --git src/hg/utils/automation/asmHubGap.pl src/hg/utils/automation/asmHubGap.pl index ae9ff12..1a12b96 100755 --- src/hg/utils/automation/asmHubGap.pl +++ src/hg/utils/automation/asmHubGap.pl @@ -1,114 +1,116 @@ #!/usr/bin/env perl use strict; use warnings; use FindBin qw($Bin); use lib "$Bin"; use AsmHub; use File::Basename; my $argc = scalar(@ARGV); if ($argc != 4) { printf STDERR "usage: asmHubGap.pl asmId asmId.names.tab asmId.agp.gz hubUrl > asmId.gap.html\n"; printf STDERR "where asmId is the assembly identifier,\n"; printf STDERR "and asmId.names.tab is naming file for this assembly,\n"; printf STDERR "and asmId.agp.gz is AGP file for this assembly,\n"; printf STDERR "and hubUrl is the URL to this assembly directory for the AGP file.\n"; exit 255; } my $asmId = shift; my $namesFile = shift; my $agpFile = shift; my $hubUrl = shift; my $agpFileBase = basename($agpFile); if ( ! -s $agpFile ) { printf STDERR "ERROR: can not find AGP file:\n\t'%s'\n", $agpFile; exit 255; } # definition of gap types in the AGP file my %gapTypes = ( 'clone' => 'gaps between clones in scaffolds', 'heterochromatin' => 'heterochromatin gaps', 'short_arm' => 'short arm gaps', 'telomere' => 'telomere gaps', 'centromere' => 'gaps for centromeres are included when they can be reasonably localized', 'scaffold' => 'gaps between scaffolds in chromosome assemblies', 'contig' => 'gaps between contigs in scaffolds', +'repeat' => 'an unresolvable repeat', +'contamination' => 'gap inserted in place of foreign sequence to maintain the coordinates', 'other' => 'gaps added at UCSC to annotate strings of <em>N</em>s that were not marked in the AGP file', 'fragment' => 'gaps between whole genome shotgun contigs' ); my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; chomp $ncbiAssemblyId; my $organism = `grep -v "^#" $namesFile | cut -f5`; chomp $organism; my $gapCount = `zcat $agpFile | grep -v "^#" | awk -F'\t' '\$5 == "N"' | wc -l`; chomp $gapCount; $gapCount = &AsmHub::commify($gapCount); print <<_EOF_ <h2>Description</h2> <p> This track shows the gaps in the $assemblyDate $em${organism}$noEm/$asmId genome assembly. </p> <p> Genome assembly procedures are covered in the NCBI <a href="https://www.ncbi.nlm.nih.gov/assembly/basics/" target=_blank>assembly documentation</a>.<BR> NCBI also provides <a href="https://www.ncbi.nlm.nih.gov/assembly/$ncbiAssemblyId" target="_blank">specific information about this assembly</a>. </p> <p> The definition of the gaps in this assembly is from the AGP file: <a href="$hubUrl/$agpFileBase" target=_blank>$asmId.agp.gz</a><br> The NCBI document <a href="https://www.ncbi.nlm.nih.gov/assembly/agp/AGP_Specification/" target=_blank>AGP Specification</a> describes the format of the AGP file. </p> <p> Gaps are represented as black boxes in this track. If the relative order and orientation of the contigs on either side of the gap is supported by read pair data, it is a <em>bridged</em> gap and a white line is drawn through the black box representing the gap. </p> <p> This assembly has $gapCount gaps, with the following principal types of gaps: <ul> _EOF_ ; open (GL, "zcat $agpFile | grep -v '^#' | awk -F'\t' '\$5 == \"N\"' | awk '{print \$7}' | sort | uniq -c | sort -n|") or die "can not read $asmId.agp.gz"; while (my $line = <GL>) { chomp $line; $line =~ s/^\s+//; my ($count, $type) = split('\s+', $line); my $minSize = `zcat $agpFile | grep -v "^#" | awk -F'\t' '\$5 == "N"' | awk -F'\t' '\$7 == "'$type'"' | sort -k6,6n | head -1 | awk -F'\t' '{print \$6}'`; chomp $minSize; my $maxSize = `zcat $agpFile | grep -v "^#" | awk -F'\t' '\$5 == "N"' | awk -F'\t' '\$7 == "'$type'"' | sort -k6,6nr | head -1 | awk -F'\t' '{print \$6}'`; chomp $maxSize; my $sizeMessage = ""; if ($minSize == $maxSize) { $sizeMessage = sprintf ("all of size %s bases", &AsmHub::commify($minSize)); } else { $sizeMessage = sprintf ("size range: %s - %s bases", &AsmHub::commify($minSize), &AsmHub::commify($maxSize)); } if (exists ($gapTypes{$type}) ) { printf "<li><B>%s</B> - %s (count: %s; %s)</li>\n", $type, $gapTypes{$type}, &AsmHub::commify($count), $sizeMessage; } else { die "asmHubGap.pl: missing AGP gap type definition: $type"; } } close (GL); printf "</ul></p>\n";