9fb2b35720b02cca3a9cece5cc8cdb782f86ed34 angie Tue Oct 31 09:56:35 2023 -0700 Compress some previously uncompressed files so we don't waste quite so much disk space. Also spare sequences listed in sars-cov-2-variants/lineage-proposals repo file recombinants.tsv from the nextclade-based filters. diff --git src/hg/utils/otto/sarscov2phylo/getCncb.sh src/hg/utils/otto/sarscov2phylo/getCncb.sh index 56689ec..c7c187f 100755 --- src/hg/utils/otto/sarscov2phylo/getCncb.sh +++ src/hg/utils/otto/sarscov2phylo/getCncb.sh @@ -71,40 +71,41 @@ # Use the GenBase API to download missing GenBase sequences, unfortunately only one sequence # at a time. tail -n+2 cncb.metadata.tsv \ | grep -vE '[,'$'\t''][A-Z]{2}[0-9]{6}\.[0-9]+['$'\t'',]' \ | tawk '{ if ($4 == "null") { $4 = ""; } if ($2 ~ /^EPI_ISL/) { tmp = $2; $2 = $4; $4 = tmp; } print $2; }' \ > nonGenBankIds set +o pipefail grep ^C_ missingIDs \ | grep -Fwf - nonGenBankIds \ | cat \ > missingGenBaseIds set -o pipefail for acc in $(cat missingGenBaseIds); do curlRetry "$genBaseFileApiBase$acc" \ | sed -re '/^>/ s/ .*//;' - sleep 10 + sleep 5 done > new.accs.fa cat <(xzcat ../cncb.latest/cncb.nonGenBank.acc.fasta.xz) new.accs.fa \ | xz -T 20 > cncb.nonGenBank.acc.fasta.new.xz mv cncb.nonGenBank.acc.fasta.new.xz cncb.nonGenBank.acc.fasta.xz xzcat cncb.nonGenBank.acc.fasta.xz \ | faSomeRecords stdin <(cut -f 1 accToNameBarAcc.tsv) stdout \ -| faRenameRecords stdin accToNameBarAcc.tsv cncb.nonGenBank.fasta +| faRenameRecords stdin accToNameBarAcc.tsv stdout \ +| xz -T 20 > cncb.nonGenBank.fasta.xz # Run nextclade cp ../cncb.latest/nextclade.full.tsv.gz . cp ../cncb.latest/nextclade.tsv . 
# Run nextclade only on the sequences fetched in this run.  When new.accs.fa is
# missing or empty, nothing below executes and nextclade.tsv /
# nextclade.full.tsv.gz are left exactly as they were.
if [[ -s new.accs.fa ]]; then
    nDataDir=~angie/github/nextclade/data/sars-cov-2
    # Both stdout and stderr of nextclade are captured in nextclade.log.
    time nextclade run -j 20 new.accs.fa \
        --input-dataset "$nDataDir" \
        --output-fasta nextalign.new.fa.xz \
        --output-tsv nextclade.new.full.tsv.gz \
        > nextclade.log 2>&1
    # Append the new results to the two-column table: skip the first line
    # (presumably the header row) and keep only columns 1 and 2.
    zcat nextclade.new.full.tsv.gz \
    | tail -n+2 \
    | cut -f 1,2 \
        >> nextclade.tsv
    # De-duplicate via a scratch file so nextclade.tsv is only replaced once
    # the sorted output has been written.
    sort -u nextclade.tsv > tmp
    mv tmp nextclade.tsv
    # Append the new gzip file to the accumulated full results as-is.
    # NOTE(review): the appended file is not stripped of its first line, so a
    # second header row presumably ends up mid-stream -- confirm that readers of
    # nextclade.full.tsv.gz tolerate that.
    cat nextclade.new.full.tsv.gz >> nextclade.full.tsv.gz
fi