9fb2b35720b02cca3a9cece5cc8cdb782f86ed34
angie
  Tue Oct 31 09:56:35 2023 -0700
Compress some previously uncompressed files so we don't waste quite so much disk space.  Also exempt sequences listed in recombinants.tsv (from the sars-cov-2-variants/lineage-proposals repo) from the nextclade-based filters.

diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh
index 56689ec..c7c187f 100755
--- src/hg/utils/otto/sarscov2phylo/getCncb.sh
+++ src/hg/utils/otto/sarscov2phylo/getCncb.sh
@@ -71,40 +71,41 @@
 # Use the GenBase API to download missing GenBase sequences, unfortunately only one sequence
 # at a time.
 tail -n+2 cncb.metadata.tsv \
 | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \
 | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \
     > nonGenBankIds
 set +o pipefail
 grep ^C_ missingIDs \
 | grep -Fwf - nonGenBankIds \
 | cat \
     > missingGenBaseIds
 set -o pipefail
 for acc in $(cat missingGenBaseIds); do
     curlRetry "$genBaseFileApiBase$acc" \
     | sed -re '/^>/  s/ .*//;'
-    sleep 10
+    sleep 5
 done > new.accs.fa
 
 cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \
 | xz -T 20 > cncb.nonGenBank.acc.fasta.new.xz
 mv cncb.nonGenBank.acc.fasta.new.xz cncb.nonGenBank.acc.fasta.xz
 
 xzcat cncb.nonGenBank.acc.fasta.xz \
 | faSomeRecords stdin <(cut -f 1 accToNameBarAcc.tsv) stdout \
-| faRenameRecords stdin accToNameBarAcc.tsv cncb.nonGenBank.fasta
+| faRenameRecords stdin accToNameBarAcc.tsv stdout \
+| xz -T 20 > cncb.nonGenBank.fasta.xz
 
 # Run nextclade
 cp ../cncb.latest/nextclade.full.tsv.gz .
 cp ../cncb.latest/nextclade.tsv .
 if [ -s new.accs.fa ]; then
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     time nextclade run -j 20 new.accs.fa \
         --input-dataset $nDataDir \
         --output-fasta nextalign.new.fa.xz \
         --output-tsv nextclade.new.full.tsv.gz  >& nextclade.log
     zcat nextclade.new.full.tsv.gz | cut -f 1,2 | tail -n+2 >> nextclade.tsv
     sort -u nextclade.tsv > tmp
     mv tmp nextclade.tsv
     cat nextclade.new.full.tsv.gz >> nextclade.full.tsv.gz
 fi