4bd23630e7a8c9a909c69900c8b67b992a993e91 hiram Mon Jan 13 13:15:52 2020 -0800 two new categories of gap types to count, repeat and contamination refs #24748 diff --git src/hg/utils/automation/asmHubGap.pl src/hg/utils/automation/asmHubGap.pl index ae9ff12..1a12b96 100755 --- src/hg/utils/automation/asmHubGap.pl +++ src/hg/utils/automation/asmHubGap.pl @@ -26,30 +26,32 @@ if ( ! -s $agpFile ) { printf STDERR "ERROR: can not find AGP file:\n\t'%s'\n", $agpFile; exit 255; } # definition of gap types in the AGP file my %gapTypes = ( 'clone' => 'gaps between clones in scaffolds', 'heterochromatin' => 'heterochromatin gaps', 'short_arm' => 'short arm gaps', 'telomere' => 'telomere gaps', 'centromere' => 'gaps for centromeres are included when they can be reasonably localized', 'scaffold' => 'gaps between scaffolds in chromosome assemblies', 'contig' => 'gaps between contigs in scaffolds', +'repeat' => 'an unresolvable repeat', +'contamination' => 'gap inserted in place of foreign sequence to maintain the coordinates', 'other' => 'gaps added at UCSC to annotate strings of <em>N</em>s that were not marked in the AGP file', 'fragment' => 'gaps between whole genome shotgun contigs' ); my $em = "<em>"; my $noEm = "</em>"; my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`; chomp $assemblyDate; my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`; chomp $ncbiAssemblyId; my $organism = `grep -v "^#" $namesFile | cut -f5`; chomp $organism; my $gapCount = `zcat $agpFile | grep -v "^#" | awk -F'\t' '\$5 == "N"' | wc -l`; chomp $gapCount; $gapCount = &AsmHub::commify($gapCount);