4bd23630e7a8c9a909c69900c8b67b992a993e91
hiram
Mon Jan 13 13:15:52 2020 -0800
two new categories of gap types to count: repeat and contamination; refs #24748
diff --git src/hg/utils/automation/asmHubGap.pl src/hg/utils/automation/asmHubGap.pl
index ae9ff12..1a12b96 100755
--- src/hg/utils/automation/asmHubGap.pl
+++ src/hg/utils/automation/asmHubGap.pl
@@ -26,30 +26,32 @@
if ( ! -s $agpFile ) {
printf STDERR "ERROR: can not find AGP file:\n\t'%s'\n", $agpFile;
exit 255;
}
# definition of gap types in the AGP file
my %gapTypes = (
'clone' => 'gaps between clones in scaffolds',
'heterochromatin' => 'heterochromatin gaps',
'short_arm' => 'short arm gaps',
'telomere' => 'telomere gaps',
'centromere' => 'gaps for centromeres are included when they can be reasonably localized',
'scaffold' => 'gaps between scaffolds in chromosome assemblies',
'contig' => 'gaps between contigs in scaffolds',
+'repeat' => 'an unresolvable repeat',
+'contamination' => 'gap inserted in place of foreign sequence to maintain the coordinates',
'other' => 'gaps added at UCSC to annotate strings of Ns that were not marked in the AGP file',
'fragment' => 'gaps between whole genome shotgun contigs'
);
my $em = "";
my $noEm = "";
my $assemblyDate = `grep -v "^#" $namesFile | cut -f9`;
chomp $assemblyDate;
my $ncbiAssemblyId = `grep -v "^#" $namesFile | cut -f10`;
chomp $ncbiAssemblyId;
my $organism = `grep -v "^#" $namesFile | cut -f5`;
chomp $organism;
my $gapCount = `zcat $agpFile | grep -v "^#" | awk -F'\t' '\$5 == "N"' | wc -l`;
chomp $gapCount;
$gapCount = &AsmHub::commify($gapCount);