f9ee0c7ea6540636411b7ff4a6ed54ab574701e2 angie Tue Jun 3 11:04:31 2025 -0700 Add Clade I. diff --git src/hg/utils/otto/mpxv/getNcbiMpxv.sh src/hg/utils/otto/mpxv/getNcbiMpxv.sh index 73708693cec..410d6768e93 100755 --- src/hg/utils/otto/mpxv/getNcbiMpxv.sh +++ src/hg/utils/otto/mpxv/getNcbiMpxv.sh @@ -49,31 +49,31 @@ echo "metadata query failed $maxAttempts times; quitting." exit 1 fi wc -l metadata.tsv if [[ ! -s metadata.tsv ]]; then echo "metadata query appeared to succeed but gave 0-length output" exit 1 fi attempt=0 maxAttempts=5 retryDelay=300 while [[ $((++attempt)) -le $maxAttempts ]]; do echo "fasta attempt $attempt" - if datasets download virus genome taxon $taxId --include genome,biosample; then + if datasets download virus genome taxon $taxId --include genome,biosample --no-progressbar; then break; else echo "FAILED fasta; will try again after $retryDelay seconds" rm -f ncbi_dataset.zip sleep $retryDelay # Double the delay to give NCBI progressively more time retryDelay=$(($retryDelay * 2)) fi done if [[ ! -s ncbi_dataset.zip ]]; then echo "fasta query failed $maxAttempts times; quitting." exit 1 fi unzip ncbi_dataset.zip faFilter -minSize=$minSize ncbi_dataset/data/genomic.fna stdout \ @@ -90,26 +90,56 @@ else if ($1 ~ /^MT903342/) { $5 = "2019-04-30"; } else if ($1 ~ /^MT903342/) { $5 = "2019-05"; } else if ($1 ~ /^MT903343/) { $5 = "2018-09"; } else if ($1 ~ /^MT903344/) { $5 = "2018-09"; } else if ($1 ~ /^MT903345/) { $5 = "2018-09"; } print; }' metadata.tsv \ | tawk '$5 ~ /^20(1[7-9]|2)/ && $4 != "Central African Republic" && $4 != "'"Cote d'Ivoire"'" && ($8 == "Homo sapiens" || $8 == "")' \ > metadata.2017outbreak.tsv wc -l metadata.2017outbreak.tsv # Run nextclade on the lot (takes only a few seconds as of 2022-07-25) to get alignments, # QC, clade assignments. +# Clade II 2017 outbreak: nextclade dataset get --name hMPXV --output-zip hMPXV.zip time nextclade run \ -D hMPXV.zip \ -j 30 \ - --output-tsv nextclade.tsv \ - --output-fasta nextalign.fa.xz \ + --output-tsv nextclade.II.tsv.gz \ + --output-fasta nextalign.II.fa.xz \ --retry-reverse-complement true \ genbank.fa.xz +# Clade I: Nextstrain uses a different reference (DQ011155.1 Zaire_1979-005 instead of +# RefSeq NC_003310.1 AF380138.1 Zaire-96-I-16); use the Nextstrain dataset for clade assignments, +# but run nextclade separately with the RefSeq sequence for sequence alignments. +nextclade dataset get --name nextstrain/mpox/clade-i --output-zip mpox_clade-i.zip +nextclade run \ + -D mpox_clade-i.zip \ + -j 30 \ + --output-tsv nextclade.I.tsv.gz \ + --retry-reverse-complement true \ + genbank.fa.xz + +# Use alignment parameters from pathogen.json in mpox_clade-i.zip +nextclade run \ + --input-ref $mpxvDir/GCF_000857045.1.NC_003310.1.fa \ + --excess-bandwidth 100 \ + --terminal-bandwidth 300 \ + --allowed-mismatches 8 \ + --window-size 40 \ + --min-seed-cover 0.1 \ + --output-fasta nextalign.I.fa.xz \ + genbank.fa.xz + +# Extract metadata for only clade I sequences +zcat nextclade.I.tsv.gz \ +| tawk '$3 ~ /^I[^I]/ {print $2;}' \ +| grep -Fwf - metadata.tsv \ + > metadata.cladeI.tsv + + rm -f $mpxvDir/ncbi/ncbi.latest ln -s ncbi.$today $mpxvDir/ncbi/ncbi.latest