2c4255b30f6c2aa11ce8c86396eaed8975d52a36 angie Thu Jul 1 10:55:43 2021 -0700 Don't run nextclade when there are no new sequences; it will fail & cause script to halt. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 385deb7..5897b6c 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -82,59 +82,62 @@ # sample collection dates: $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \ > ncbi_dataset.plusBioSample.tsv # Make a file for joining collection date with ID: tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \ | sort > gbToDate # Replace FASTA headers with reconstructed names from enhanced metadata. time cleanGenbank < ncbi_dataset/data/genomic.fna \ | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \ | xz -T 50 \ > genbank.fa.xz # Run pangolin and nextclade on sequences that are new since yesterday +export TMPDIR=/dev/shm fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | sort > gb.names splitDir=splitForNextclade rm -rf $splitDir mkdir $splitDir if [ -e ../ncbi.latest/nextclade.tsv ]; then cp ../ncbi.latest/nextclade.tsv . cut -f 1 ../ncbi.latest/nextclade.tsv | sort > nextclade.prev.names comm -23 gb.names nextclade.prev.names > nextclade.names faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa faSplit about nextclade.fa 30000000 $splitDir/chunk else cp /dev/null nextclade.tsv faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk fi +if (( $(ls -1 splitForNextclade | wc -l) > 0 )); then nDataDir=~angie/github/nextclade/data/sars-cov-2 outDir=$(mktemp -d) outTsv=$(mktemp) for chunkFa in $splitDir/chunk*.fa; do nextclade -j 50 -i $chunkFa \ --input-root-seq $nDataDir/reference.fasta \ --input-tree $nDataDir/tree.json \ --input-qc-config $nDataDir/qc.json \ --output-dir $outDir \ --output-tsv $outTsv >& nextclade.log cut -f 1,2 $outTsv | tail -n+2 >> nextclade.tsv rm $outTsv done -wc -l nextclade.tsv rm -rf $outDir +fi +wc -l nextclade.tsv rm -rf $splitDir nextclade.fa conda activate pangolin runPangolin() { fa=$1 out=$fa.pangolin.csv logfile=$(mktemp) pangolin $fa --outfile $out > $logfile 2>&1 rm $logfile } export -f runPangolin if [ -e ../ncbi.latest/lineage_report.csv ]; then cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names