9fb2b35720b02cca3a9cece5cc8cdb782f86ed34
angie
  Tue Oct 31 09:56:35 2023 -0700
Compress some previously uncompressed files so we don't waste quite so much disk space.  Also exempt sequences listed in recombinants.tsv (from the sars-cov-2-variants/lineage-proposals repo) from the nextclade-based filters.

diff --git src/hg/utils/otto/sarscov2phylo/getCogUk.sh src/hg/utils/otto/sarscov2phylo/getCogUk.sh
index 8f1636e..be1c1af 100755
--- src/hg/utils/otto/sarscov2phylo/getCogUk.sh
+++ src/hg/utils/otto/sarscov2phylo/getCogUk.sh
@@ -30,37 +30,37 @@
         if curl -S -s -C - -O $url; then
             break
         else
             echo "FAILED; will try again after $retryDelay seconds"
             sleep $retryDelay
         fi
     done
     if [[ $attempt -gt $maxAttempts ]]; then
         echo "curl failed $maxAttempts times; quitting."
         exit 1
     fi
 }
 
 curlRetry $cogUrlBase/cog_all.fasta.gz
 curlRetry $cogUrlBase/cog_metadata.csv.gz
-gunzip -f cog_metadata.csv.gz
 curlRetry $cogUrlBase/cog_global_tree.newick
 
 zcat cog_all.fasta.gz | xz -T 20 > cog_all.fasta.xz
 rm cog_all.fasta.gz
 
-tail -n +2 cog_metadata.csv \
+zcat cog_metadata.csv.gz \
+| tail -n +2 \
 | awk -F, '{print $1 "\t" $5;}' | sort > cogUkToDate
 
 # Reuse nextclade assignments for older sequences; compute nextclade assignments for new seqs.
 zcat $ottoDir/cogUk.latest/nextclade.full.tsv.gz > nextclade.full.tsv
 cp $ottoDir/cogUk.latest/nextalign.fa.xz .
 comm -13 <(cut -f 1 nextclade.full.tsv | sort) <(fastaNames cog_all.fasta.xz | sort) \
     > seqsForNextclade
 if [ -s seqsForNextclade ]; then
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     outTsv=$(mktemp)
     outFa=$(mktemp)
     faSomeRecords <(xzcat cog_all.fasta.xz) seqsForNextclade stdout \
     | nextclade run -j 30 \
         --input-dataset $nDataDir \
         --output-fasta $outFa \