9875f6c8af94b0f0fcdc5e496515438e6af478b5 angie Mon May 24 09:59:55 2021 -0700 Add back -x since there have been so many cron failures. Parallelize re-running of pangolin on all sequences (necessary when a new version of pangolin is released). diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index b7186aa..1e755ce 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -1,18 +1,18 @@ #!/bin/bash source ~/.bashrc -set -beEu -o pipefail +set -beEu -x -o pipefail # Download SARS-CoV-2 GenBank FASTA and metadata using NCBI Datasets API. # Use E-Utils to get SARS-CoV-2 metadata from BioSample. # Use BioSample metadata to fill in gaps in GenBank metadata and report conflicting dates. # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases. scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) prevDate=$(date -d yesterday +%F) ottoDir=/hive/data/outside/otto/sarscov2phylo mkdir -p $ottoDir/ncbi.$today @@ -97,37 +97,56 @@ if [ -e ../ncbi.$prevDate/nextclade.tsv ]; then cp ../ncbi.$prevDate/nextclade.tsv . 
cut -f 1 ../ncbi.$prevDate/nextclade.tsv | sort > nextclade.$prevDate.names comm -23 gb.names nextclade.$prevDate.names > nextclade.names faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa faSplit about nextclade.fa 30000000 $splitDir/chunk else cp /dev/null nextclade.tsv faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk fi for chunkFa in $splitDir/chunk*.fa; do nextclade -j 50 -i $chunkFa -t >(cut -f 1,2 | tail -n+2 >> nextclade.tsv) >& nextclade.log done wc -l nextclade.tsv rm -rf $splitDir nextclade.fa + conda activate pangolin +runPangolin() { + fa=$1 + out=$fa.pangolin.csv + logfile=$(mktemp) + pangolin $fa --outfile $out > $logfile 2>&1 + rm $logfile +} +export -f runPangolin if [ -e ../ncbi.$prevDate/lineage_report.csv ]; then cp ../ncbi.$prevDate/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.$prevDate.names comm -23 gb.names pangolin.$prevDate.names > pangolin.names faSomeRecords <(xzcat genbank.fa.xz) pangolin.names pangolin.fa pangolin pangolin.fa >& pangolin.log tail -n+2 lineage_report.csv >> linRepYesterday mv linRepYesterday lineage_report.csv + rm -f pangolin.fa else - pangolin <(xzcat genbank.fa.xz) >& pangolin.log + splitDir=splitForPangolin + rm -rf $splitDir + mkdir $splitDir + faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk + find $splitDir -name chunk\*.fa \ + | parallel -j 10 "runPangolin {}" + head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv + for f in $splitDir/chunk*.csv; do + tail -n+2 $f >> lineage_report.csv + done + rm -rf $splitDir fi wc -l lineage_report.csv -rm -f pangolin.fa rm -f $ottoDir/ncbi.latest ln -s ncbi.$today $ottoDir/ncbi.latest # Clean up rm -r ncbi_dataset nice xz all.bioSample.* &