9fb2b35720b02cca3a9cece5cc8cdb782f86ed34 angie Tue Oct 31 09:56:35 2023 -0700 Compress some previously uncompressed files so we don't waste quite so much disk space. Also spare sequences listed in sars-cov-2-variants/lineage-proposals repo file recombinants.tsv from the nextclade-based filters. diff --git src/hg/utils/otto/sarscov2phylo/getCogUk.sh src/hg/utils/otto/sarscov2phylo/getCogUk.sh index 8f1636e..be1c1af 100755 --- src/hg/utils/otto/sarscov2phylo/getCogUk.sh +++ src/hg/utils/otto/sarscov2phylo/getCogUk.sh @@ -30,37 +30,37 @@ if curl -S -s -C - -O $url; then break else echo "FAILED; will try again after $retryDelay seconds" sleep $retryDelay fi done if [[ $attempt -gt $maxAttempts ]]; then echo "curl failed $maxAttempts times; quitting." exit 1 fi } curlRetry $cogUrlBase/cog_all.fasta.gz curlRetry $cogUrlBase/cog_metadata.csv.gz -gunzip -f cog_metadata.csv.gz curlRetry $cogUrlBase/cog_global_tree.newick zcat cog_all.fasta.gz | xz -T 20 > cog_all.fasta.xz rm cog_all.fasta.gz -tail -n +2 cog_metadata.csv \ +zcat cog_metadata.csv.gz \ +| tail -n +2 \ | awk -F, '{print $1 "\t" $5;}' | sort > cogUkToDate # Reuse nextclade assignments for older sequences; compute nextclade assignments for new seqs. zcat $ottoDir/cogUk.latest/nextclade.full.tsv.gz > nextclade.full.tsv cp $ottoDir/cogUk.latest/nextalign.fa.xz . comm -13 <(cut -f 1 nextclade.full.tsv | sort) <(fastaNames cog_all.fasta.xz | sort) \ > seqsForNextclade if [ -s seqsForNextclade ]; then nDataDir=~angie/github/nextclade/data/sars-cov-2 outTsv=$(mktemp) outFa=$(mktemp) faSomeRecords <(xzcat cog_all.fasta.xz) seqsForNextclade stdout \ | nextclade run -j 30 \ --input-dataset $nDataDir \ --output-fasta $outFa \