4ca6162650c6e0f213cf674642b7a2b6bcea9bc1
hiram
  Fri Dec 27 00:50:12 2019 -0800
now automatically eliminating duplicate sequences refs #24354

diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl
index 7ca3a40..3e69f49 100755
--- src/hg/utils/automation/doAssemblyHub.pl
+++ src/hg/utils/automation/doAssemblyHub.pl
@@ -586,30 +586,44 @@
 
 if [ ! -L \${asmId}_genomic.fna.gz -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then
   rm -f \${asmId}_genomic.fna.gz \\
     \${asmId}_assembly_report.txt \\
     \${asmId}_rm.out.gz \\
     \${asmId}_assembly_structure \\
     \$asmId.2bit
 
   ln -s $assemblySource/\${asmId}_genomic.fna.gz .
   ln -s $assemblySource/\${asmId}_assembly_report.txt .
   ln -s $assemblySource/\${asmId}_rm.out.gz .
   if [ -d $assemblySource/\${asmId}_assembly_structure ]; then
     ln -s $assemblySource/\${asmId}_assembly_structure .
   fi
   faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
+  twoBitDup -keyList=stdout \$asmId.2bit > \$asmId.dupCheck.txt
+  (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
+  if [ -s "\$asmId.dups.txt" ]; then
+    printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2
+    grep "are identical" \$asmId.dupCheck.txt 1>&2
+    awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list
+    mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz
+    faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\
+      \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz
+    rm -f \$asmId.2bit
+    faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit
+  fi
+  rm -f \$asmId.dups.txt
+  gzip -f \$asmId.dupCheck.txt
   touch -r \${asmId}_genomic.fna.gz \$asmId.2bit
 else
   printf "# download step previously completed\\n" 1>&2
   exit 0
 fi
 _EOF_
   );
   $bossScript->execute();
 } # doDownload
 
 
 #########################################################################
 # * step: sequence [workhorse]
 sub doSequence {
   my $runDir = "$buildDir/sequence";
@@ -748,67 +762,73 @@
 hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz
 zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names
 _EOF_
     );
 } else {
 printf STDERR "partsDone: %d\n", $partsDone;
   }
 
   $bossScript->add(<<_EOF_
 zcat *.agp.gz | gzip > ../\$asmId.agp.gz
 faToTwoBit *.fa.gz ../\$asmId.2bit
 faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit
 twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt
 (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt
 if [ -s "\$asmId.dups.txt" ]; then
-  printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2
+  printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2
   grep "are identical" \$asmId.dupCheck.txt 1>&2
-  exit 255
-else
-  rm -f \$asmId.dups.txt
+  awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list
+  mv \$asmId.unmasked.2bit \$asmId.unmasked.dups.2bit
+  twoBitToFa \$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\
+    stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz
+  rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit
+  faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit
+  faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit
 fi
+rm -f \$asmId.dups.txt
+gzip -f \$asmId.dupCheck.txt
 touch -r ../download/\$asmId.2bit ../\$asmId.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit
 touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz
 twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes
 touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes
 # verify everything is there
 twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes
 export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"`
 export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"`
 if [ "\$newTotal" != "\$oldTotal" ]; then
-  printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2
-  printf "# \$newTotal != \$oldTotal\n" 1>&2
+  printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2
+  printf "# \$newTotal != \$oldTotal\\n" 1>&2
   exit 255
 fi
 rm source.\$asmId.chrom.sizes
 export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1`
 if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then
-  printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2
+  printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2
   exit 255
 fi
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift
 join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift
 export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l`
 export c1=`cat \$asmId.ucscToNcbi.lift | wc -l`
 export c2=`cat ../\$asmId.chrom.sizes | wc -l`
 # verify all names are accounted for
-if [ "\$c0" -ne "\$c2 ]; then
+if [ "\$c0" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2
   exit 255
 fi
-if [ "\$c1" -ne "\$c2 ]; then
+if [ "\$c1" -ne "\$c2" ]; then
   printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2
   exit 255
 fi
 twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz
 touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz
 zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt
 touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt
 _EOF_
   );
   $bossScript->execute();
 } # doSequence
 
 #########################################################################
 # * step: assemblyGap [workhorse]
 sub doAssemblyGap {