633cc56e1b1043b0ab6e1fd0e9b90154e2c03c0c hiram Mon Apr 20 13:00:51 2026 -0700 document procedure for import of VGP 577-way maf result refs #34370 diff --git src/hg/makeDb/doc/vgp577way/mafCoverage.pl src/hg/makeDb/doc/vgp577way/mafCoverage.pl new file mode 100755 index 00000000000..cdd98ef3778 --- /dev/null +++ src/hg/makeDb/doc/vgp577way/mafCoverage.pl @@ -0,0 +1,185 @@ +#!/usr/bin/env perl + +use strict; +use warnings; + +my $argc = scalar(@ARGV); + +if ($argc != 2) { + printf STDERR "usage: mafCoverage.pl referenceDb file.maf\n"; + printf STDERR "measures the amount of the reference genome sequence\n"; + printf STDERR "that is 'covered', or matched to query genome sequence\n"; + printf STDERR "For each block in the input maf file, count the amount\n"; + printf STDERR "of sequence for the reference sequence in this block\n"; + printf STDERR "that has any type of corresponding sequence in each\n"; + printf STDERR "query sequence. A 'coverage' measurement is thereby\n"; + printf STDERR "calculated for each query sequence.\n"; + exit 255; +} + +sub vmPeak() { + my $pid = $$; + my $vmPeak = `grep -m 1 -i vmpeak /proc/$pid/status`; + chomp $vmPeak; + printf STDERR "# %s\n", $vmPeak; +} + +my $refDb = shift; +my $mafFile = shift; +my $blockCount = 0; +my $totalBlockSize = 0; +my $totalInsertSize = 0; + +if ($mafFile =~ m/.gz$/) { + open (MF, "zcat $mafFile|") or die "can not zcat $mafFile"; +} else { + open (MF, "<$mafFile") or die "can not read $mafFile"; +} + +my %refSummary; # key is chrom name for the reference sequence + # value is a hash pointer with keys: + # 'refSize' value is size of this reference chromosome + # 'query assembly name' value is a hash pointer with key: + # 'query sequence name' value is a hash pointer with keys: + # 'querySize' value is size of this query chromosome + # 'matched' value is amount of reference matched by this query + # 'misMatched' value is amount of mis-matches, query or target +my @refNTs; +my $srcChr = ""; + +while (my $line = <MF>) { + chomp $line; + if ($line =~ m/^s ${refDb}./) { + my (undef, $refSrc, $refStart, $refSize, $refStrand, $refChrSize, $refText) = split('\s+', $line); + if ($refSrc =~ m/^GC/) { + my @a = split(/\./, $refSrc); + $srcChr = join(".", @a[2..$#a]); + } else { + (undef, $srcChr) = split(/\./, $refSrc, 2); + } +# printf STDERR "# %s -> %s\n", $refSrc, $srcChr; + if (!defined($refSummary{$srcChr})) { + my %h; + $h{'refSize'} = $refChrSize; + $refSummary{$srcChr} = \%h; + } + my $refPtr = $refSummary{$srcChr}; + if ($refPtr->{'refSize'} != $refChrSize) { + printf "ERROR: inconsistent reference chr size, was $refPtr->{'refSize'} vs. now $refChrSize\n"; + exit 255; + } + ++$blockCount; +# printf STDERR "# %s.%s\t%d\n", $refDb, $srcChr, $refPtr->{'refSize'}; + my $totalSize = length($refText); + my $noInserts = $refText; + $noInserts =~ s/-//g; + my $noInsertSize = length($noInserts); + @refNTs = (); + @refNTs = split(//, uc($refText)); + my $NTsCount = scalar(@refNTs); + my $insertCount = $totalSize - $noInsertSize; + $refPtr->{'inserts'} += $insertCount; + $refPtr->{'considered'} += $noInsertSize; + $totalBlockSize += $totalSize; + $totalInsertSize += $insertCount; + my @sampleRef = @refNTs[0..6]; +# printf "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\n", $refSrc, $refStart, $refSize, $refChrSize, $totalSize, $noInsertSize, $insertCount, $NTsCount, join('', @sampleRef); + } elsif ($line =~ m/^s /) { + my (undef, $qSrc, $qStart, $qSize, $qStrand, $qChrSize, $qText) = split('\s+', $line); + my $qAsmName = "n/a"; + my $qChrName = "n/a"; + if ($qSrc =~ m/^GC/) { + my @a = split(/\./, $qSrc); + $qChrName = join(".", @a[2..$#a]); + $qAsmName = join(".", @a[0..1]); + } else { + ($qAsmName, $qChrName) = split(/\./, $qSrc, 2); + } +# printf STDERR "# %s -> %s %s\n", $qSrc, $qAsmName, $qChrName; + my $refPtr = $refSummary{$srcChr}; + if (!defined($refPtr->{$qAsmName})) { + my %h; + $refPtr->{$qAsmName} = \%h; + } + my $qPtr = $refPtr->{$qAsmName}; + if (!defined($qPtr->{$qChrName})) { + my %h; + $h{'querySize'} = $qChrSize; + $qPtr->{$qChrName} = \%h; + } + my $qChrPtr = $qPtr->{$qChrName}; + if ($qChrPtr->{'querySize'} != $qChrSize) { + printf "ERROR: inconsistent query chr size, was $qChrPtr->{'querySize'} vs. now $qChrSize\n"; + exit 255; + } +# printf STDERR "# %s.%s\t%d\n", $qAsmName, $qChrName, $qChrPtr->{'querySize'}; + my $totalSize = length($qText); + my $noInserts = $qText; + $noInserts =~ s/-//g; + my $noInsertSize = length($noInserts); + my $insertCount = $totalSize - $noInsertSize; + my @qNTs = (); + @qNTs = split(//, uc($qText)); + die "unequal text sizes for $qSrc vs $srcChr" if (scalar(@qNTs) != scalar(@refNTs)); + my @sampleQ = @qNTs[0..6]; + my $matchedSequence = 0; + my $misMatch = 0; + my @matchRef = (); # reference sequence matching + my @matchQ = (); # query sequence matching + for (my $i = 0; $i < scalar(@qNTs); ++$i) { + next if ( ($refNTs[$i] eq "-") || ($qNTs[$i] eq "-") ); + push (@matchRef, $refNTs[$i]); + if ($refNTs[$i] eq $qNTs[$i]) { + push (@matchQ, "."); + ++$matchedSequence; + } else { + ++$misMatch; + push (@matchQ, $qNTs[$i]); + } + } + $qChrPtr->{'matched'} += $matchedSequence; + $qChrPtr->{'misMatched'} += $misMatch; + +# printf "%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\t%s\t%d\n", $qSrc, $qStart, $qSize, $qChrSize, $totalSize, $noInsertSize, $insertCount, scalar(@qNTs), join('', @sampleQ), $matchedSequence; +# printf "%s\n", join('', @matchRef); +# printf "%s\n", join('', @matchQ); + } +} + +close (MF); + +# my %refSummary; # key is chrom name for the reference sequence + # value is a hash pointer with keys: + # 'refSize' value is size of this reference chromosome + # 'query assembly name' value is a hash pointer with key: + # 'query sequence name' value is a hash pointer with keys: + # 'querySize' value is size of this query chromosome + # 'matched' value is amount of reference matched by this query + # 'misMatched' value is amount of mis-matches, query or target + +foreach my $refChr (sort keys %refSummary) { + my $refPtr = $refSummary{$refChr}; + foreach my $qAsmName (sort keys %$refPtr ) { + next if ($qAsmName eq "refSize"); + next if ($qAsmName eq "inserts"); + next if ($qAsmName eq "considered"); + my $qPtr = $refPtr->{$qAsmName}; + foreach my $qChrName (sort keys %$qPtr) { + my $qChrPtr = $qPtr->{$qChrName}; + my $refCoverPerCent = 100.0 * $refPtr->{'considered'} / $refPtr->{'refSize'}; + my $qCoveredRef = 100.0 * $qChrPtr->{'matched'} / $refPtr->{'considered'}; + my $qMisMatch = 100.0 * $qChrPtr->{'misMatched'} / $refPtr->{'considered'}; + printf "%s.%s\t%d %d %d %%%.2f\t%s.%s %d %d %d %%%.2f %%%.2f\n", $refDb, $refChr, $refPtr->{'refSize'}, $refPtr->{'inserts'}, $refPtr->{'considered'}, $refCoverPerCent, $qAsmName, $qChrName, $qChrPtr->{'querySize'}, $qChrPtr->{'matched'}, $qChrPtr->{'misMatched'}, $qCoveredRef, $qMisMatch; + } + } +} + +printf "# total blocks: %d, noInsertSize %d = total size: %d - inserts %d\n# average block size: %d = %d / %d\n", + $blockCount, $totalBlockSize - $totalInsertSize, $totalBlockSize, $totalInsertSize, $totalBlockSize / $blockCount, $totalBlockSize, $blockCount; + +printf "#refDb.chr size inserts considered refPerCent qAsmName.qChr qSize matched misMatched qCoveredRef%% qMisMatch%%\n"; + +vmPeak(); + +__END__ +# printf "%s.%s\t%d\t%s.%s\t%d\n", $refDb, $refChr, $refPtr->{'refSize'}, $qAsmName, $qChrName, $qChrPtr->{'querySize'};