4e32052e618df259f0c0d77434641db7793e41d1
hiram
  Sun Nov 29 23:24:26 2020 -0800
correctly remove duplicate sequences from NCBI rm.out refs #24396

diff --git src/hg/utils/automation/doRepeatMasker.pl src/hg/utils/automation/doRepeatMasker.pl
index b823e36..35bf619 100755
--- src/hg/utils/automation/doRepeatMasker.pl
+++ src/hg/utils/automation/doRepeatMasker.pl
@@ -16,82 +16,86 @@
 use HgStepManager;
 
 # Hardcoded command path:
 my $RepeatMaskerPath = "/hive/data/staging/data/RepeatMasker";
 my $RepeatMasker = "$RepeatMaskerPath/RepeatMasker";
 my $RepeatMaskerEngine = "-engine crossmatch -s";
 # Let parasol pick defaults
 my $parasolRAM = "";
 
 # Option variable names, both common and peculiar to this script:
 use vars @HgAutomate::commonOptionVars;
 use vars @HgStepManager::optionVars;
 use vars qw/
     $opt_buildDir
     $opt_ncbiRmsk
+    $opt_dupList
     $opt_liftSpec
     $opt_species
     $opt_unmaskedSeq
     $opt_customLib
     $opt_useHMMER
     $opt_useRMBlastn
     $opt_splitTables
     $opt_noSplit
     $opt_updateTable
     /;
 
 # Specify the steps supported with -continue / -stop:
 my $stepper = new HgStepManager(
     [ { name => 'cluster', func => \&doCluster },
       { name => 'cat',     func => \&doCat },
       { name => 'mask',    func => \&doMask },
       { name => 'install', func => \&doInstall },
       { name => 'cleanup', func => \&doCleanup },
     ]
 				);
 
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $unmaskedSeq = "\$db.unmasked.2bit";
 my $defaultSpecies = 'scientificName from dbDb';
 my $ncbiRmsk = "";	# path to NCBI *_rm.out.gz file to use NCBI result
                         # for repeat masking result
                         # in the assembly as calculated by NCBI
+my $dupList = "";	# path to download/asmId.remove.dups.list for NCBI
 my $liftSpec = "";	# lift file from NCBI coordinates to UCSC coordinates
                         # usually required with ncbiRmsk
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base db
 options:
 ";
   print STDERR $stepper->getOptionHelp();
   print STDERR <<_EOF_
     -buildDir dir         Use dir instead of default
                           $HgAutomate::clusterData/\$db/$HgAutomate::trackBuild/RepeatMasker.\$date
                           (necessary when continuing at a later date).
     -species sp           Use sp (which can be quoted genus and species, or
                           a common name that RepeatMasker recognizes.
                           Default: $defaultSpecies.
     -ncbiRmsk path/file_rm.out.gz - Use the repeat masker result supplied in
                           the assembly as calculated by NCBI
+    -dupList .../download/asmId.remove.dups.list - to remove duplicates from
+			  NCBI repeat masker file
     -liftSpec path/file.lift - Use this lift file to lift the NCBI coordinates
                           to UCSC coordinates, usually used with ncbiRmsk
     -unmaskedSeq seq.2bit Use seq.2bit as the unmasked input sequence instead
                           of default ($unmaskedSeq).
     -customLib lib.fa     Use custom repeat library instead of RepeatMaskers\'s.
     -useRMBlastn          Use NCBI rmblastn instead of crossmatch
     -useHMMER             Use hmmer instead of crossmatch ( currently for human only )
     -updateTable          load into table name rmskUpdate (default: rmsk)
     -splitTables          split the _rmsk tables (default is not split)
     -noSplit              default behavior, this option no longer required.
 _EOF_
   ;
   print STDERR &HgAutomate::getCommonOptionHelp('dbHost' => $dbHost,
 						'workhorse' => '',
 						'bigClusterHub' => '');
@@ -119,30 +123,31 @@
 }
 
 
 # Globals:
 # Command line args: db
 my ($db);
 # Other:
 my ($buildDir, $chromBased, $updateTable, $secondsStart, $secondsEnd);
 
 sub checkOptions {
   # Make sure command line options are valid/supported.
   my $ok = GetOptions(@HgStepManager::optionSpec,
 		      'buildDir=s',
 		      'species=s',
 		      'ncbiRmsk=s',
+		      'dupList=s',
 		      'liftSpec=s',
 		      'unmaskedSeq=s',
 		      'customLib=s',
                       'useRMBlastn',
                       'useHMMER',
 		      'splitTables',
 		      'noSplit',
 		      'updateTable',
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
@@ -454,40 +459,56 @@
     if (! -s "${liftSpec}" ) {
     printf STDERR "# WARNING: no liftSpec given with ncbiRmsk file, probably will need one ?\n";
     $bossScript->add(<<_EOF_
 #########
 # WARNING: no liftSpec given with ncbiRmsk file, probably will need one ?
 #########
 
 printf "   SW  perc perc perc  query      position in query           matching       repeat              position in  repeat
 score  div. del. ins.  sequence    begin     end    (left)    repeat         class/family         begin  end (left)   ID
 
 " >> "\${db}.sorted.fa.out"
 _EOF_
     );
     }
     if ($zippedSource) {
+      if ( -s "${dupList}" ) {
+        $bossScript->add(<<_EOF_
+zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts \\
+   | grep -v -f ${dupList} >> "\${db}.sorted.fa.out"
+_EOF_
+        );
+      } else {
         $bossScript->add(<<_EOF_
 zcat "\${ncbiRmOutFile}" | tail -n +4 | sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
 _EOF_
         );
+      }
+    } else {
+      if ( -s "${dupList}" ) {
+        $bossScript->add(<<_EOF_
+tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts \\
+    | grep -v -f ${dupList} >> "\${db}.sorted.fa.out"
+_EOF_
+        );
       } else {
         $bossScript->add(<<_EOF_
 tail -n +4 "\${ncbiRmOutFile}" sort -k5,5 -k6,6n $liftOpts >> "\${db}.sorted.fa.out"
 _EOF_
         );
       }
+    }
   } else {	# not using the NCBI supplied file, use local cluster run result
   my $partDir = "$buildDir/RMPart";
   my $levels = $opt_debug ? 1 : `cat $partDir/levels`;
   chomp $levels;
   # Use symbolic link created in cluster step:
   my $indent = "";
   my $path = "";
   # Make the appropriate number of nested foreach's to match the number
   # of levels from the partitioning.  If $levels is 1, no foreach required.
   for (my $l = $levels - 2;  $l >= 0;  $l--) {
     my $dir = ($l == ($levels - 2)) ? $partDir : "\$d" . ($l + 1);
     $bossScript->add(<<_EOF_
 ${indent}for d$l in $dir/???
 do
 _EOF_
@@ -716,30 +737,31 @@
 $secondsStart = `date "+%s"`;
 chomp $secondsStart;
 ($db) = @ARGV;
 
 # Now that we know the $db, figure out our paths:
 my $date = `date +%Y-%m-%d`;
 chomp $date;
 $buildDir = $opt_buildDir ? $opt_buildDir :
   "$HgAutomate::clusterData/$db/$HgAutomate::trackBuild/RepeatMasker.$date";
 $unmaskedSeq = $opt_unmaskedSeq ? $opt_unmaskedSeq :
   "$HgAutomate::clusterData/$db/$db.unmasked.2bit";
 my $seqCount = `twoBitInfo $unmaskedSeq stdout | wc -l`;
 $chromBased = ($seqCount <= $HgAutomate::splitThreshold) && $opt_splitTables;
 $updateTable = $opt_updateTable ? "Update" : "";
 $ncbiRmsk = $opt_ncbiRmsk ? $opt_ncbiRmsk : "";
+$dupList = $opt_dupList ? $opt_dupList : "";
 $liftSpec = $opt_liftSpec ? $opt_liftSpec : "";
 
 # Do everything.
 $stepper->execute();
 
 # Tell the user anything they should know.
 my $stopStep = $stepper->getStopStep();
 my $upThrough = ($stopStep eq 'cleanup') ? "" :
   "  (through the '$stopStep' step)";
 
 $secondsEnd = `date "+%s"`;
 chomp $secondsEnd;
 my $elapsedSeconds = $secondsEnd - $secondsStart;
 my $elapsedMinutes = int($elapsedSeconds/60);
 $elapsedSeconds -= $elapsedMinutes * 60;