c05cfd40266e98367a883cd3737959533cef01cd
angie
  Fri Feb 5 09:57:03 2021 -0800
Initial work for mm10 patches: create db grcM38P6 with patch sequences and annotations.  refs #25045

diff --git src/hg/makeDb/doc/mm10.scanAssemblyReport.pl src/hg/makeDb/doc/mm10.scanAssemblyReport.pl
new file mode 100755
index 0000000..d3926d8
--- /dev/null
+++ src/hg/makeDb/doc/mm10.scanAssemblyReport.pl
@@ -0,0 +1,99 @@
+#!/usr/bin/env perl
+
+use strict;
+use warnings;
+
+sub usage() {
+  printf STDERR "usage: scanAssemblyReport.pl <chrom.sizes> <faCount.GRCH38.p2.txt> <GCA_000001405.17_GRCh38.p2_assembly_report.txt>\n";
+}
+
+my $argc = scalar(@ARGV);
+if ($argc != 3) {
+  usage;
+  exit 255;
+}
+
+my $chromSizes = shift;
+my $faCount = shift;
+my $asmReport = shift;
+
+my %chrSize;
+
+open (FH, "<$chromSizes") or die "can not read $chromSizes";
+while (my $line = <FH>) {
+  chomp $line;
+  my ($chr, $size) = split('\t', $line);
+  $chrSize{$chr} = $size;
+}
+close (FH);
+
+my %patchSize;
+open (FH, "<$faCount") or die "can not read $faCount";
+while (my $line = <FH>) {
+  next if ($line =~ m/^#seq|^total/);
+  chomp $line;
+  my ($chr, $size, $rest) = split('\s+', $line, 3);
+  $patchSize{$chr} = $size;
+}
+close (FH);
+
+# #seq	len	A	C	G	T	N	cpg
+# CM000663.2	248956422	67070277	48055043	48111528	67244164	18475410	2375159
+# KI270706.1	175055	50401	37708	35908	51038	0	1589
+# KI270707.1	32032	10439	6781	6926	7886	0	311
+# KI270932.1	215732	56283	52137	48354	58958	0	2720
+# KI270933.1	170537	44612	40866	38012	47047	0	2095
+# GL000209.2	177381	46045	42743	39757	48836	0	2193
+# J01415.2	16569	5124	5181	2169	4094	1	435
+# total	3221487035	901716923	626373718	628977988	904389966	160028440	31134771
+
+open (FH, "<$asmReport") or die "can not read $asmReport";
+while (my $line = <FH>) {
+   next if ($line =~ m/^#/);
+   chomp $line;
+   my @a = split('\t', $line);
+   my $chrN = $a[2];
+   $chrN = "M" if ($chrN eq "MT");
+   my $ucscName = "chr${chrN}";
+   my $accession = $a[4];
+   my $patchSize = $patchSize{$accession};
+   $accession =~ s/\.\d+//;
+   if ($a[1] =~ m/alt-scaffold/) { $ucscName = "chr${chrN}_${accession}_alt"; }
+   elsif ($a[1] =~ m/unlocalized-scaffold/) { $ucscName = "chr${chrN}_${accession}_random"; }
+   elsif ($a[1] =~ m/assembled-molecule/) { $ucscName = "chr${chrN}"; }
+   elsif ($a[1] =~ m/unplaced-scaffold/) { $ucscName = "chrUn_${accession}"; }
+   elsif ($a[1] =~ m/fix-patch/) { $ucscName = "chr${chrN}_${accession}_fix"; }
+   elsif ($a[1] =~ m/novel-patch/) { $ucscName = "chr${chrN}_${accession}_alt"; }
+   else { die "do not recognize type: '$a[1]'";
+   }
+   my $warnings = "";
+   if (length($ucscName) > 31) { $warnings .= "\t# warning size"; }
+   if (exists($chrSize{$ucscName})) {
+     if ($patchSize != $chrSize{$ucscName}) { $warnings .= "\t# bad size"; }
+     printf "%s\t%d\t%s%s\n", $ucscName, $chrSize{$ucscName}, $a[4], $warnings;
+   } else {
+     printf "%s\t%d\t%s\tnew%s\n", $ucscName, $patchSize, $a[4], $warnings;
+   }
+}
+close (FH);
+
+__END__
+HG986_PATCH	fix-patch	1	Chromosome	KN196472.1	=	NW_009646194.1	PATCHES
+
+HSCHR11_CTG1_UNLOCALIZED	unlocalized-scaffold	11	Chromosome	KI270721.1	=	NT_187376.1	Primary Assembly
+
+     4 novel-patch
+     25 assembled-molecule
+     27 fix-patch
+     42 unlocalized-scaffold
+    127 unplaced-scaffold
+    261 alt-scaffold
+
+
+HSCHRUN_RANDOM_170       unplaced-scaffold       na      na      KI270335.1      =       NT_187462.1     Primary Assembly
+
+LRC_KIR 19      54025634       55084318 alt-scaffold    GL949753.2      NW_003571061.2  ALT_REF_LOCI_8
+
+Y	assembled-molecule	Y	Chromosome	CM000686.2	=	NC_000024.10	Primary Assembly
+HSCHR1_CTG1_UNLOCALIZED	unlocalized-scaffold	1	Chromosome	KI270706.1	=	NT_187361.1	Primary Assembly
+HSCHR1_CTG2_UNLOCALIZED	unlocalized-scaffold	1	Chromosome	KI270707.1	=	NT_187362.1	Primary Assembly