src/hg/utils/automation/doAssemblyHub.pl 06d156f363b5dc4c4aa59b90084724822ee7781a

06d156f363b5dc4c4aa59b90084724822ee7781a
hiram
  Mon Jan 6 15:22:09 2020 -0800
correctly handle duplicates and faster refs #23891

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index c3764fb..b39de80 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -60,30 +60,33 @@
 # Option defaults:
 my $dbHost = 'hgwdev';
 my $sourceDir = "/hive/data/outside/ncbi/genomes";
 my $augustusSpecies = "human";
 my $xenoRefSeq = "/hive/data/genomes/asmHubs/VGP/xenoRefSeq";
 my $ucscNames = 0;  # default 'FALSE' (== 0)
 my $asmHubName = "n/a";  # directory name in: /gbdb/hubs/asmHubName
 my $workhorse = "hgwdev";  # default workhorse when none chosen
 my $fileServer = "hgwdev";  # default when none chosen
 my $bigClusterHub = "ku";  # default when none chosen
 my $smallClusterHub = "ku";  # default when none chosen
 
 my $base = $0;
 $base =~ s/^(.*\/)?//;
 
+# key is original accession name from the remove.dups.list, value is 1
+my %dupAccessionList = {};
+
 sub usage {
   # Usage / help / self-documentation:
   my ($status, $detailed) = @_;
   # Basic help (for incorrect usage):
   print STDERR "
 usage: $base [options] genbank|refseq subGroup species asmId
 required arguments:
     genbank|refseq - specify either genbank or refseq hierarchy source
     subGroup       - specify subGroup at NCBI FTP site, examples:
                    - vertebrate_mammalian vertebrate_other plant etc...
     species        - species directory at NCBI FTP site, examples:
                    - Homo_sapiens Mus_musculus etc...
     asmId          - assembly identifier at NCBI FTP site, examples:
                    - GCF_000001405.32_GRCh38.p6 GCF_000001635.24_GRCm38.p4 etc..
 
@@ -180,30 +183,45 @@
 		      @HgAutomate::commonOptionSpec,
 		      );
   &usage(1) if (!$ok);
   &usage(0, 1) if ($opt_help);
   &HgAutomate::processCommonOptions();
   my $err = $stepper->processOptions();
   usage(1) if ($err);
   $dbHost = $opt_dbHost if ($opt_dbHost);
 }
 
 #########################################################################
 #########################################################################
 #  assistant subroutines here.  The 'do' steps follow this section
 #########################################################################
 #########################################################################
+# if there are duplicates in original sequence, read in that list for use here
+sub readDupsList() {
+  my $downloadDir = "$buildDir/download";
+
+  if ( -s "${downloadDir}/${asmId}.remove.dups.list" ) {
+    open (FH, "<${downloadDir}/${asmId}.remove.dups.list") or die "can not read ${downloadDir}/${asmId}.remove.dups.list";
+    while (my $accession = <FH>) {
+      chomp $accession;
+      $dupAccessionList{$accession} = 1;
+    }
+    close (FH);
+  }
+}
+
+#########################################################################
 #  return true if result does not exist or it is older than source
 #  else return false
 sub needsUpdate($$) {
   my ($source, $result) = @_;
   if (-s $result) {
      if (stat($source)->mtime > stat($result)->mtime ) {
        return 1;
      } else {
        return 0;
      }
   }
   return 1;
 }
 
 #########################################################################
@@ -254,32 +272,34 @@
 #########################################################################
 # process NCBI fasta sequence, use UCSC chrom names if requested
 sub compositeFasta($$$) {
   my ($chr2acc, $twoBitFile, $fastaOut) = @_;
 
   my %accToChr;
   readChr2Acc($chr2acc, \%accToChr);
   if (! $ucscNames) {
     foreach my $acc (keys %accToChr) {
        $accToChr{$acc} = $acc;
     }
   }
 
   my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1);
   foreach my $acc (sort keys %accToChr) {
+     if (! exists($dupAccessionList{$acc})) {
        printf $fh "%s\n", $acc;
      }
+  }
   close $fh;
 
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile";
   while (my $line = <FH>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($accToChr{$line}));
       if (! $ucscNames) {
          printf FA ">%s\n", $accToChr{$line};
       } else {
         if ( $accToChr{$line} eq "MT" ) {
           printf FA ">chrM\n";
@@ -314,30 +334,31 @@
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
   my %chrNDone;	# key is chrom name, value is 1 when done
   foreach my $acc (keys %accToChr) {
     my $chrN = $accToChr{$acc};
     next if (exists($chrNDone{$chrN}));
     $chrNDone{$chrN} = 1;
     my $agpFile =  "$agpSource/chr$chrN.unlocalized.scaf.agp.gz";
     open (FH, "zcat $agpFile|") or die "can not read $agpFile";
     while (my $line = <FH>) {
       if ($line !~ m/^#/) {
          chomp $line;
          my (@a) = split('\t', $line);
          my $ncbiAcc = $a[0];
+         next if (exists($dupAccessionList{$ncbiAcc}));
          my $acc = $ncbiAcc;
          $acc =~ s/\./v/ if ($ucscNames);
          die "ERROR: chrN $chrN not correct for $acc"
              if ($accToChr{$acc} ne $chrN);
          my $ucscName = "${acc}";
          $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames);
          printf NAMES "%s\t%s\n", $ucscName, $ncbiAcc;
          printf AGP "%s", $ucscName;    # begin AGP line with accession name
          for (my $i = 1; $i < scalar(@a); ++$i) {   # the rest of the AGP line
              printf AGP "\t%s", $a[$i];
          }
          printf AGP "\n";
       }
     }
     close (FH);
@@ -349,32 +370,34 @@
 #########################################################################
 # process unlocalized NCBI fasta sequence, use UCSC chrom names if requested
 sub unlocalizedFasta($$$) {
   my ($chr2acc, $twoBitFile, $fastaOut) = @_;
 
   my %accToChr;
   readChr2Acc($chr2acc, \%accToChr);
   if (! $ucscNames) {
     foreach my $acc (keys %accToChr) {
        $accToChr{$acc} = $acc;
     }
   }
 
   my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1);
   foreach my $acc (sort keys %accToChr) {
+     if (! exists($dupAccessionList{$acc})) {
        printf $fh "%s\n", $acc;
      }
+  }
   close $fh;
 
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile";
   while (my $line = <FH>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($accToChr{$line}));
       my $chrN = $accToChr{$line};
       my $acc = $line;
       $acc =~ s/\./v/ if ($ucscNames);
       my $ucscName = "${acc}";
       $ucscName = "chr${chrN}_${acc}_random" if ($ucscNames);
@@ -412,30 +435,31 @@
   }
   close (AP);
 }	#	sub readAltPlacement($$)
 
 #########################################################################
 ### process one of the alternate AGP files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 sub processAltAgp($$$) {
   my ($agpFile, $nameHash, $fh) = @_;
   open (AG, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <AG>) {
     next if ($line =~ m/^#/);
     chomp $line;
     my (@a) = split('\t', $line);
     my $ncbiAcc = $a[0];
+    next if (exists($dupAccessionList{$ncbiAcc}));
     my $ucscName = $nameHash->{$ncbiAcc};
     printf $fh "%s", $ucscName;  # begin AGP line with accession nam
     for (my $i = 1; $i < scalar(@a); ++$i) {  # the reset of the AGP line
       printf $fh "\t%s", $a[$i];
     }
     printf $fh "\n";
   }
   close (AG);
 }	#	sub processAltAgp($$$)
 
 #########################################################################
 ### process one of the alternate FASTA files, changing names via the nameHash
 ### and writing to the given fileHandle (fh)
 
 sub processAltFasta($$$) {
@@ -455,31 +479,30 @@
   }
   close (FF);
 }	#	sub processAltFasta($$$)
 
 #########################################################################
 # there are alternate sequences, process their multiple AGP and FASTA files
 # into single AGP and FASTA files
 sub altSequences($) {
   my $runDir = "$buildDir/sequence";
   my ($assemblyStructure) = @_;
   # construct the name correspondence hash #################################
   my %accToChr;	# hash for accession name to browser chromosome name
   open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/alt_scaffold_placement.txt'|") or die "can not find alt_scaffolds in ${assemblyStructure}";;
   while (my $line = <FH>) {
    chomp $line;
-###   printf "%s\n", $line;
    readAltPlacement($line, \%accToChr);
   }
   close (FH);
   # and write to alt.names
   open (AN, ">$runDir/$asmId.alt.names");
   foreach my $acc (sort keys %accToChr) {
     printf AN "%s\t%s\n", $accToChr{$acc}, $acc;
   }
   close (AN);
   # process the AGP files, writing the agpOutput file ######################
   my $agpOutput = "$runDir/$asmId.alt.agp.gz";
   open (AGP, "|gzip -c > $agpOutput") or die "can not write to $agpOutput";
   open (FH, "find -L '${assemblyStructure}' -type f | grep '/alt_scaffolds/AGP/alt.scaf.agp.gz'|") or die "can not find alt.scaf.agp.gz in ${assemblyStructure}";;
   while (my $agpFile = <FH>) {
     chomp $agpFile;
@@ -503,32 +526,33 @@
 
 #########################################################################
 # process NCBI unplaced AGP file, perhaps translate into UCSC naming scheme
 #   the agpNames result file is a naming correspondence file for later use
 #   optional chrPrefix can be "chrUn_" for assemblies that have other chr names
 sub unplacedAgp($$$$) {
   my ($agpFile, $agpOutput, $agpNames, $chrPrefix) = @_;
 
   open (AGP, "|gzip -c >$agpOutput") or die "can not write to $agpOutput";
   open (NAMES, "|sort -u >$agpNames") or die "can not write to $agpNames";
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <FH>) {
     if ($line =~ m/^#/) {
         print AGP $line;
     } else {
-        if ($ucscNames) {
         my ($ncbiAcc, undef) = split('\s+', $line, 2);
+        next if (exists($dupAccessionList{$ncbiAcc}));
+        if ($ucscNames) {
           my $ucscAcc = $ncbiAcc;
           $ucscAcc =~ s/\./v/;
           printf NAMES "%s%s\t%s\n", $chrPrefix, $ucscAcc, $ncbiAcc;
           $line =~ s/\./v/;
         }
         printf AGP "%s%s", $chrPrefix, $line;
     }
   }
   close (FH);
   close (NAMES);
   close (AGP);
 }	# sub unplacedAgp($$$$)
 
 #########################################################################
 # process NCBI unplaced FASTA file, perhaps translate into UCSC naming scheme
@@ -538,32 +562,34 @@
 
   my %contigName;  # key is NCBI contig name, value is UCSC contig name
   open (FH, "zcat $agpFile|") or die "can not read $agpFile";
   while (my $line = <FH>) {
     if ($line !~ m/^#/) {
         my ($ncbiAcc, undef) = split('\s+', $line, 2);
         my $ucscAcc = $ncbiAcc;
         $ucscAcc =~ s/\./v/ if ($ucscNames);
         $contigName{$ncbiAcc} = $ucscAcc;
     }
   }
   close (FH);
 
   my ($fh, $tmpFile) = tempfile("seqList_XXXX", DIR=>'/dev/shm', SUFFIX=>'.txt', UNLINK=>1);
   foreach my $ctg (sort keys %contigName) {
+     if (! exists($dupAccessionList{$ctg})) {
        printf $fh "%s\n", $ctg;
      }
+  }
   close $fh;
 
   open (FA, "|gzip -c > $fastaOut") or die "can not write to $fastaOut";
   open (FH, "twoBitToFa -seqList=$tmpFile $twoBitFile stdout|") or die "can not twoBitToFa $twoBitFile";
   while (my $line = <FH>) {
     if ($line =~ m/^>/) {
       chomp $line;
       $line =~ s/^>//;
       $line =~ s/ .*//;
       die "ERROR: twoBitToFa $twoBitFile returns unknown acc $line" if (! exists($contigName{$line}));
       printf FA ">%s%s\n", $chrPrefix, $contigName{$line};
     } else {
       print FA $line;
     }
   }
@@ -576,66 +602,68 @@
 # do Steps section
 #########################################################################
 #########################################################################
 # * step: download [workhorse]
 sub doDownload {
   my $runDir = "$buildDir/download";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "setup work hierarchy of sym links to source files in\n\t$runDir/";
   my $bossScript = newBash HgRemoteScript("$runDir/doDownload.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   $bossScript->add(<<_EOF_
 export asmId=$asmId
 
-if [ ! -L \${asmId}_genomic.fna.gz -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then
+if [ ! -s \${asmId}.2bit -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then
   rm -f \${asmId}_genomic.fna.gz \\
+    \${asmId}_genomic.fna.dups.gz \\
     \${asmId}_assembly_report.txt \\
     \${asmId}_rm.out.gz \\
     \${asmId}_assembly_structure \\
     \$asmId.2bit
 
   ln -s $assemblySource/\${asmId}_genomic.fna.gz .
   ln -s $assemblySource/\${asmId}_assembly_report.txt .
   ln -s $assemblySource/\${asmId}_rm.out.gz .
   if [ -d $assemblySource/\${asmId}_assembly_structure ]; then
     ln -s $assemblySource/\${asmId}_assembly_structure .
   fi
   faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
-  twoBitDup -keyList=stdout \$asmId.2bit > \$asmId.dupCheck.txt
-  (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
+  twoBitDup \$asmId.2bit > \$asmId.dups.txt
   if [ -s "\$asmId.dups.txt" ]; then
     printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2
-    grep "are identical" \$asmId.dupCheck.txt 1>&2
-    awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list
+    cat \$asmId.dups.txt 1>&2
+    awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
     mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz
     faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
       \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
     rm -f \$asmId.2bit
     faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
   fi
-  rm -f \$asmId.dups.txt
-  gzip -f \$asmId.dupCheck.txt
+  gzip -f \$asmId.dups.txt
   touch -r \${asmId}_genomic.fna.gz \$asmId.2bit
 else
   printf "# download step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
+
+  readDupsList();
+
 } # doDownload
 
 
 #########################################################################
 # * step: sequence [workhorse]
 sub doSequence {
   my $runDir = "$buildDir/sequence";
   &HgAutomate::mustMkdir($runDir);
 
   my $whatItDoes = "process source files into 2bit sequence and agp";
   my $bossScript = newBash HgRemoteScript("$runDir/doSequence.bash", $workhorse,
 				      $runDir, $whatItDoes);
 
   my $twoBitFile = "$buildDir/download/$asmId.2bit";
   my $otherChrParts = 0;  # to see if this is unplaced scaffolds only
@@ -763,45 +791,43 @@
 printf STDERR "creating fake AGP\n";
     $bossScript->add(<<_EOF_
 twoBitToFa ../download/\$asmId.2bit stdout | sed -e "s/\\.\\([0-9]\\+\\)/v\\1/;" | gzip -c > \$asmId.fa.gz
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
     );
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
-twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt
-(grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
+twoBitDup ../\$asmId.unmasked.2bit > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
   printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
-  grep "are identical" \$asmId.dupCheck.txt 1>&2
-  awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list
-  mv \$asmId.unmasked.2bit \$asmId.unmasked.dups.2bit
-  twoBitToFa \$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
+  cat \$asmId.dups.txt 1>&2
+  awk '{print \$1}' \$asmId.dups.txt > \$asmId.remove.dups.list
+  mv ../\$asmId.unmasked.2bit ../\$asmId.unmasked.dups.2bit
+  twoBitToFa ../\$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
     stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz
   rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit
   faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit
   faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit
 fi
-rm -f \$asmId.dups.txt
-gzip -f \$asmId.dupCheck.txt
+gzip -f \$asmId.dups.txt
 touch -r ../download/\$asmId.2bit ../\$asmId.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz
 twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes
 touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
 export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
   printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2
   printf "# \$newTotal != \$oldTotal\\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes