src/hg/utils/otto/sarscov2phylo/getCncb.sh d48d0f53ee67f93d7193241cd226c8aa039cde42

d48d0f53ee67f93d7193241cd226c8aa039cde42
angie
  Fri Dec 20 15:37:20 2024 -0800
Uniquify to guard against occasional duplicates.

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
index 6af25fc..cd3cd7f 100755
--- src/hg/utils/otto/sarscov2phylo/getCncb.sh
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -43,54 +43,57 @@
 curlRetry $metadataUrl \
 | cut -f 1-4,6- \
 | perl -wpe 's/[^[:print:]^\t^\n]+/?/g;' \
     > cncb.metadata.tsv
 
 colCount=$(head -1 cncb.metadata.tsv | tawk '{print NF;}')
 if [[ $colCount != 16 ]]; then
     echo "Metadata format error: expected 16 columns, got $colCount"
     exit 1
 fi
 
 # Make a cncbToDate file for ID mapping.
 tail -n+2 cncb.metadata.tsv \
 | cut -f 1,10 \
 | cleanCncb \
+| sort -u \
 | sed -re 's/ //;' \
     > cncbToDate
 
 # Make a renaming file to translate from accessions to the 'name | acc' headers expected by
 # Chris's ID-matching pipeline.  Exclude sequences that are also in GenBank.  If for some reason
 # the EPI_ ID is listed as primary and CNCB as secondary, swap them.
 tail -n+2 cncb.metadata.tsv \
 | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $1, $2; }' \
 | cleanCncb \
+| sort -u \
 | sed -re 's/ //;' \
 | tawk '{print $2, $1 " | " $2;}' \
 | sort \
     > accToNameBarAcc.tsv
 
 # See what sequences are in metadata but that we haven't already downloaded.
-fastaNames ../cncb.latest/cncb.nonGenBank.acc.fasta.xz | sort > prevAccs
-comm -23 <(cut -f 1 accToNameBarAcc.tsv) prevAccs | sort > missingIDs
+fastaNames ../cncb.latest/cncb.nonGenBank.acc.fasta.xz | sort -u > prevAccs
+comm -23 <(cut -f 1 accToNameBarAcc.tsv) prevAccs | sort -u > missingIDs
 
 # Use the GenBase API to download missing GenBase sequences, unfortunately only one sequence
 # at a time.
 tail -n+2 cncb.metadata.tsv \
 | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \
+| sort -u \
     > nonGenBankIds
 set +o pipefail
 grep ^C_ missingIDs \
 | grep -Fwf - nonGenBankIds \
 | cat \
     > missingGenBaseIds
 set -o pipefail
 for acc in $(cat missingGenBaseIds); do
     curlRetry "$genBaseFileApiBase$acc" \
     | sed -re '/^>/  s/ .*//;'
     sleep 5
 done > new.accs.fa
 
 cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \
 | faUniqify stdin stdout \