ac8238545ef1708bfafa03e6b9c7daa0352b1b76
angie
  Sat Dec 23 19:18:09 2023 -0800
Run faUniqify on the combined fasta to avoid duplicate records.  Take the clade from column 7 instead of column 2 of the nextclade TSV for more complete Nextstrain clade labels.

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
index c7c187f..c06af03 100755
--- src/hg/utils/otto/sarscov2phylo/getCncb.sh
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -75,48 +75,49 @@
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \
     > nonGenBankIds
 set +o pipefail
 grep ^C_ missingIDs \
 | grep -Fwf - nonGenBankIds \
 | cat \
     > missingGenBaseIds
 set -o pipefail
 for acc in $(cat missingGenBaseIds); do
     curlRetry "$genBaseFileApiBase$acc" \
     | sed -re '/^>/  s/ .*//;'
     sleep 5
 done > new.accs.fa
 
 cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \
+| faUniqify stdin stdout \
 | xz -T 20 > cncb.nonGenBank.acc.fasta.new.xz
 mv cncb.nonGenBank.acc.fasta.new.xz cncb.nonGenBank.acc.fasta.xz
 
 xzcat cncb.nonGenBank.acc.fasta.xz \
 | faSomeRecords stdin <(cut -f 1 accToNameBarAcc.tsv) stdout \
 | faRenameRecords stdin accToNameBarAcc.tsv stdout \
 | xz -T 20 > cncb.nonGenBank.fasta.xz
 
 # Run nextclade
 cp ../cncb.latest/nextclade.full.tsv.gz .
 cp ../cncb.latest/nextclade.tsv .
 if [ -s new.accs.fa ]; then
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     time nextclade run -j 20 new.accs.fa \
         --input-dataset $nDataDir \
         --output-fasta nextalign.new.fa.xz \
         --output-tsv nextclade.new.full.tsv.gz  >& nextclade.log
-    zcat nextclade.new.full.tsv.gz | cut -f 1,2 | tail -n+2 >> nextclade.tsv
+    zcat nextclade.new.full.tsv.gz | cut -f 1,7 | tail -n+2 >> nextclade.tsv
     sort -u nextclade.tsv > tmp
     mv tmp nextclade.tsv
     cat nextclade.new.full.tsv.gz >> nextclade.full.tsv.gz
 fi
 
 # Run pangolin
 cp ../cncb.latest/pangolin.tsv .
 if [ -s new.accs.fa ]; then
     set +x
     . ~/.bashrc
     conda activate pangolin
     set -x
     set -e
     time pangolin -t 20 new.accs.fa --skip-scorpio --outfile lineages.csv \
         >& pangolin.log