4ca6162650c6e0f213cf674642b7a2b6bcea9bc1 hiram Fri Dec 27 00:50:12 2019 -0800 now automatically eliminating duplicate sequences refs #24354 diff --git src/hg/utils/automation/doAssemblyHub.pl src/hg/utils/automation/doAssemblyHub.pl index 7ca3a40..3e69f49 100755 --- src/hg/utils/automation/doAssemblyHub.pl +++ src/hg/utils/automation/doAssemblyHub.pl @@ -586,30 +586,44 @@ if [ ! -L \${asmId}_genomic.fna.gz -o \${asmId}_genomic.fna.gz -nt \$asmId.2bit ]; then rm -f \${asmId}_genomic.fna.gz \\ \${asmId}_assembly_report.txt \\ \${asmId}_rm.out.gz \\ \${asmId}_assembly_structure \\ \$asmId.2bit ln -s $assemblySource/\${asmId}_genomic.fna.gz . ln -s $assemblySource/\${asmId}_assembly_report.txt . ln -s $assemblySource/\${asmId}_rm.out.gz . if [ -d $assemblySource/\${asmId}_assembly_structure ]; then ln -s $assemblySource/\${asmId}_assembly_structure . fi faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit + twoBitDup -keyList=stdout \$asmId.2bit > \$asmId.dupCheck.txt + (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt + if [ -s "\$asmId.dups.txt" ]; then + printf "WARNING duplicate sequences found in \$asmId.2bit\\n" 1>&2 + grep "are identical" \$asmId.dupCheck.txt 1>&2 + awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list + mv \${asmId}_genomic.fna.gz \${asmId}_genomic.fna.dups.gz + faSomeRecords -exclude \${asmId}_genomic.fna.dups.gz \\ + \$asmId.remove.dups.list stdout | gzip -c > \${asmId}_genomic.fna.gz + rm -f \$asmId.2bit + faToTwoBit \${asmId}_genomic.fna.gz \$asmId.2bit + fi + rm -f \$asmId.dups.txt + gzip -f \$asmId.dupCheck.txt touch -r \${asmId}_genomic.fna.gz \$asmId.2bit else printf "# download step previously completed\\n" 1>&2 exit 0 fi _EOF_ ); $bossScript->execute(); } # doDownload ######################################################################### # * step: sequence [workhorse] sub doSequence { my $runDir = "$buildDir/sequence"; @@ -748,67 +762,73 @@ hgFakeAgp -minContigGap=1 -minScaffoldGap=50000 \$asmId.fa.gz stdout | gzip -c > \$asmId.fake.agp.gz zgrep "^>" \$asmId.fa.gz | sed -e 's/>//;' | sed -e 's/\\(.*\\)/\\1 \\1/;' | sed -e 's/v\\([0-9]\\+\\)/.\\1/;' | awk '{printf "%s\\t%s\\n", \$2, \$1}' > \$asmId.fake.names _EOF_ ); } else { printf STDERR "partsDone: %d\n", $partsDone; } $bossScript->add(<<_EOF_ zcat *.agp.gz | gzip > ../\$asmId.agp.gz faToTwoBit *.fa.gz ../\$asmId.2bit faToTwoBit -noMask *.fa.gz ../\$asmId.unmasked.2bit twoBitDup -keyList=stdout ../\$asmId.unmasked.2bit > \$asmId.dupCheck.txt (grep "are identical" \$asmId.dupCheck.txt || true) > \$asmId.dups.txt if [ -s "\$asmId.dups.txt" ]; then - printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\n" 1>&2 + printf "ERROR: duplicate sequences found in ../\$asmId.unmasked.2bit\\n" 1>&2 grep "are identical" \$asmId.dupCheck.txt 1>&2 - exit 255 -else - rm -f \$asmId.dups.txt + awk '{print \$3}' \$asmId.dups.txt > \$asmId.remove.dups.list + mv \$asmId.unmasked.2bit \$asmId.unmasked.dups.2bit + twoBitToFa \$asmId.unmasked.dups.2bit stdout | faSomeRecords -exclude \\ + stdin \$asmId.remove.dups.list stdout | gzip -c > \$asmId.noDups.fasta.gz + rm -f ../\$asmId.2bit ../\$asmId.unmasked.2bit + faToTwoBit \$asmId.noDups.fasta.gz ../\$asmId.2bit + faToTwoBit -noMask \$asmId.noDups.fasta.gz ../\$asmId.unmasked.2bit fi +rm -f \$asmId.dups.txt +gzip -f \$asmId.dupCheck.txt touch -r ../download/\$asmId.2bit ../\$asmId.2bit touch -r ../download/\$asmId.2bit ../\$asmId.unmasked.2bit touch -r ../download/\$asmId.2bit ../\$asmId.agp.gz twoBitInfo ../\$asmId.2bit stdout | sort -k2nr > ../\$asmId.chrom.sizes touch -r ../\$asmId.2bit ../\$asmId.chrom.sizes # verify everything is there twoBitInfo ../download/\$asmId.2bit stdout | sort -k2nr > source.\$asmId.chrom.sizes export newTotal=`ave -col=2 ../\$asmId.chrom.sizes | grep "^total"` export oldTotal=`ave -col=2 source.\$asmId.chrom.sizes | grep "^total"` if [ "\$newTotal" != "\$oldTotal" ]; then - printf "# ERROR: sequence construction error: not same totals source vs. new:\n" 1>&2 - printf "# \$newTotal != \$oldTotal\n" 1>&2 + printf "# ERROR: sequence construction error: not same totals source vs. new:\\n" 1>&2 + printf "# \$newTotal != \$oldTotal\\n" 1>&2 exit 255 fi rm source.\$asmId.chrom.sizes export checkAgp=`checkAgpAndFa ../\$asmId.agp.gz ../\$asmId.2bit 2>&1 | tail -1` if [ "\$checkAgp" != "All AGP and FASTA entries agree - both files are valid" ]; then - printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\n" 1>&2 + printf "# ERROR: checkAgpAndFa \$asmId.agp.gz \$asmId.2bit failing\\n" 1>&2 exit 255 fi join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$3,\$2,\$1,\$2}' > \$asmId.ncbiToUcsc.lift join -t\$'\\t' <(sort ../\$asmId.chrom.sizes) <(sort \${asmId}*.names) | awk '{printf "0\\t%s\\t%d\\t%s\\t%d\\n", \$1,\$2,\$3,\$2}' > \$asmId.ucscToNcbi.lift export c0=`cat \$asmId.ncbiToUcsc.lift | wc -l` export c1=`cat \$asmId.ucscToNcbi.lift | wc -l` export c2=`cat ../\$asmId.chrom.sizes | wc -l` # verify all names are accounted for -if [ "\$c0" -ne "\$c2 ]; then +if [ "\$c0" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ncbiToUcsc.lift" 1>&2 exit 255 fi -if [ "\$c1" -ne "\$c2 ]; then +if [ "\$c1" -ne "\$c2" ]; then printf "# ERROR: not all names accounted for in \$asmId.ucscToNcbi.lift" 1>&2 exit 255 fi twoBitToFa ../\$asmId.2bit stdout | faCount stdin | gzip -c > \$asmId.faCount.txt.gz touch -r ../\$asmId.2bit \$asmId.faCount.txt.gz zgrep -P "^total\t" \$asmId.faCount.txt.gz > \$asmId.faCount.signature.txt touch -r ../\$asmId.2bit \$asmId.faCount.signature.txt _EOF_ ); $bossScript->execute(); } # doSequence ######################################################################### # * step: assemblyGap [workhorse] sub doAssemblyGap {