ab911e338fecf626957f119d86bea1075ec0bcfd angie Fri Nov 1 14:11:47 2019 -0700 Oops, forgot to commit categorizeBigDbSnp.pl along with doBigDbSnp.pl when adding subset tracks. refs #23283 diff --git src/hg/utils/automation/categorizeBigDbSnp.pl src/hg/utils/automation/categorizeBigDbSnp.pl new file mode 100755 index 0000000..0ce2c45 --- /dev/null +++ src/hg/utils/automation/categorizeBigDbSnp.pl @@ -0,0 +1,80 @@ +#!/usr/bin/env perl + +# DO NOT EDIT the /cluster/bin/scripts copy of this file -- +# edit ~/kent/src/hg/utils/automation/categorizeBigDbSnp.pl instead. + +use warnings; +use strict; + +# Use 1000Genomes frequencies for determining whether a variant is "Common" or not, as dbSNP does. +# By convention, 1000Genomes is always first in freqSourceOrder. +my $freqSourceIndex = 0; + +my %rc = ( 'A' => 'T', + 'C' => 'G', + 'G' => 'C', + 'T' => 'A', + 'N' => 'N', + ); + +sub revComp($) { + # Reverse-complement a nucleotide sequence, possible containing N's. + my ($seq) = @_; + die "How do I revComp '$seq'?" unless ($seq =~ /^[ACGTN]+$/); + my $rcSeq = ""; + while ($seq =~ s/(.)$//) { + $rcSeq .= $rc{$1}; + } + return $rcSeq; +} # revComp + + +# Main: open output files and assign each input file to one of them. + +my $prefix = shift @ARGV; + +die "Expecting at least one argument: prefix [input.bigDbSnp]\n" if (! $prefix); + +my ($multCount, $comCount, $clinCount, $clinComCount, $miscCount) = (0,0,0,0); +open(my $mult, "> $prefix.Mult.bigDbSnp") || die; +open(my $common, "> $prefix.Common.bigDbSnp") || die; +open(my $clinVar, "> $prefix.ClinVar.bigDbSnp") || die; +while (<>) { + my @w = split("\t"); + my $ucscNotes = $w[14]; + if ($ucscNotes =~ /multiMap/) { + print $mult $_; + $multCount++; + } else { + my $isClinVar = ($ucscNotes =~ /clinvar/); + my $isCommon = 0; + my @freqs = split(",", $w[9]); + my $freq = $freqs[$freqSourceIndex]; + if ($freq && $freq !~ /nan/ && $freq !~ /inf/) { + if ($freq > 0.01 || + ($freq == 0.0 && $ucscNotes =~ /refIsSingleton/)) { + $isCommon = 1; + } + } + if ($isCommon) { + print $common $_; + $comCount++; + } + if ($isClinVar) { + print $clinVar $_; + $clinCount++; + } + if ($isCommon && $isClinVar) { + $clinComCount++; + } + if (! ($isCommon || $isClinVar)) { + $miscCount++; + } + } +} +close($mult); close($common); close($clinVar); +print "Mult: $multCount\n" + . "Common: $comCount\n" + . "ClinVar: $clinCount\n" + . "leftover: $miscCount\n" + . "Common and ClinVar: $clinComCount\n";