c05cfd40266e98367a883cd3737959533cef01cd angie Fri Feb 5 09:57:03 2021 -0800 Initial work for mm10 patches: create db grcM38P6 with patch sequences and annotations. refs #25045 diff --git src/hg/makeDb/doc/mm10.scanAssemblyReport.pl src/hg/makeDb/doc/mm10.scanAssemblyReport.pl new file mode 100755 index 0000000..d3926d8 --- /dev/null +++ src/hg/makeDb/doc/mm10.scanAssemblyReport.pl @@ -0,0 +1,99 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +sub usage() { + printf STDERR "usage: scanAssemblyReport.pl <chrom.sizes> <faCount.GRCH38.p2.txt> <GCA_000001405.17_GRCh38.p2_assembly_report.txt>\n"; +} + +my $argc = scalar(@ARGV); +if ($argc != 3) { + usage; + exit 255; +} + +my $chromSizes = shift; +my $faCount = shift; +my $asmReport = shift; + +my %chrSize; + +open (FH, "<$chromSizes") or die "can not read $chromSizes"; +while (my $line = <FH>) { + chomp $line; + my ($chr, $size) = split('\t', $line); + $chrSize{$chr} = $size; +} +close (FH); + +my %patchSize; +open (FH, "<$faCount") or die "can not read $faCount"; +while (my $line = <FH>) { + next if ($line =~ m/^#seq|^total/); + chomp $line; + my ($chr, $size, $rest) = split('\s+', $line, 3); + $patchSize{$chr} = $size; +} +close (FH); + +# #seq len A C G T N cpg +# CM000663.2 248956422 67070277 48055043 48111528 67244164 18475410 2375159 +# KI270706.1 175055 50401 37708 35908 51038 0 1589 +# KI270707.1 32032 10439 6781 6926 7886 0 311 +# KI270932.1 215732 56283 52137 48354 58958 0 2720 +# KI270933.1 170537 44612 40866 38012 47047 0 2095 +# GL000209.2 177381 46045 42743 39757 48836 0 2193 +# J01415.2 16569 5124 5181 2169 4094 1 435 +# total 3221487035 901716923 626373718 628977988 904389966 160028440 31134771 + +open (FH, "<$asmReport") or die "can not read $asmReport"; +while (my $line = <FH>) { + next if ($line =~ m/^#/); + chomp $line; + my @a = split('\t', $line); + my $chrN = $a[2]; + $chrN = "M" if ($chrN eq "MT"); + my $ucscName = "chr${chrN}"; + my $accession = $a[4]; + my $patchSize = $patchSize{$accession}; + $accession =~ s/\.\d+//; + if ($a[1] =~ m/alt-scaffold/) { $ucscName = "chr${chrN}_${accession}_alt"; } + elsif ($a[1] =~ m/unlocalized-scaffold/) { $ucscName = "chr${chrN}_${accession}_random"; } + elsif ($a[1] =~ m/assembled-molecule/) { $ucscName = "chr${chrN}"; } + elsif ($a[1] =~ m/unplaced-scaffold/) { $ucscName = "chrUn_${accession}"; } + elsif ($a[1] =~ m/fix-patch/) { $ucscName = "chr${chrN}_${accession}_fix"; } + elsif ($a[1] =~ m/novel-patch/) { $ucscName = "chr${chrN}_${accession}_alt"; } + else { die "do not recognize type: '$a[1]'"; + } + my $warnings = ""; + if (length($ucscName) > 31) { $warnings .= "\t# warning size"; } + if (exists($chrSize{$ucscName})) { + if ($patchSize != $chrSize{$ucscName}) { $warnings .= "\t# bad size"; } + printf "%s\t%d\t%s%s\n", $ucscName, $chrSize{$ucscName}, $a[4], $warnings; + } else { + printf "%s\t%d\t%s\tnew%s\n", $ucscName, $patchSize, $a[4], $warnings; + } +} +close (FH); + +__END__ +HG986_PATCH fix-patch 1 Chromosome KN196472.1 = NW_009646194.1 PATCHES + +HSCHR11_CTG1_UNLOCALIZED unlocalized-scaffold 11 Chromosome KI270721.1 = NT_187376.1 Primary Assembly + + 4 novel-patch + 25 assembled-molecule + 27 fix-patch + 42 unlocalized-scaffold + 127 unplaced-scaffold + 261 alt-scaffold + + +HSCHRUN_RANDOM_170 unplaced-scaffold na na KI270335.1 = NT_187462.1 Primary Assembly + +LRC_KIR 19 54025634 55084318 alt-scaffold GL949753.2 NW_003571061.2 ALT_REF_LOCI_8 + +Y assembled-molecule Y Chromosome CM000686.2 = NC_000024.10 Primary Assembly +HSCHR1_CTG1_UNLOCALIZED unlocalized-scaffold 1 Chromosome KI270706.1 = NT_187361.1 Primary Assembly +HSCHR1_CTG2_UNLOCALIZED unlocalized-scaffold 1 Chromosome KI270707.1 = NT_187362.1 Primary Assembly