b038619ca5e7c1bf6ab5810bfb004dc4c702266b angie Tue Feb 22 11:02:56 2022 -0800 Use latest version of nextclade and keep full output. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 79e9964..542f85f 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -7,30 +7,32 @@ # Use enhanced metadata to rewrite FASTA headers for matching up with sequences in other databases. scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh today=$(date +%F) ottoDir=/hive/data/outside/otto/sarscov2phylo mkdir -p $ottoDir/ncbi.$today cd $ottoDir/ncbi.$today attempt=0 maxAttempts=5 retryDelay=300 +#*** From Eric Cox 1/25/22 when download failed and they were debugging firewall issues: +# --proxy https://www.st-va.ncbi.nlm.nih.gov/datasets/v1 \ while [[ $((++attempt)) -le $maxAttempts ]]; do echo "datasets attempt $attempt" if datasets download virus genome taxon 2697049 \ --exclude-cds \ --exclude-protein \ --exclude-gpff \ --exclude-pdb \ --filename ncbi_dataset.zip \ |& tail -50 \ > datasets.log.$attempt; then break; else echo "FAILED; will try again after $retryDelay seconds" rm -f ncbi_dataset.zip sleep $retryDelay @@ -55,67 +57,63 @@ time $scriptDir/bioSampleJsonToTab.py ncbi_dataset/data/biosample.jsonl > gb.bioSample.tab # Use BioSample metadata to fill in missing pieces of GenBank metadata and report conflicting # sample collection dates: $scriptDir/gbMetadataAddBioSample.pl gb.bioSample.tab ncbi_dataset.tsv \ > ncbi_dataset.plusBioSample.tsv 2>gbMetadataAddBioSample.log # Make a file for joining collection date with ID: tawk '$3 != "" {print $1, $3;}' ncbi_dataset.plusBioSample.tsv \ | sort > gbToDate # Replace FASTA headers with reconstructed names from enhanced metadata. time cleanGenbank < ncbi_dataset/data/genomic.fna \ | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \ -| xz -T 8 \ +| xz -T 20 \ > genbank.fa.xz # Run pangolin and nextclade on sequences that are new since yesterday export TMPDIR=/dev/shm fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names splitDir=splitForNextclade rm -rf $splitDir mkdir $splitDir -if [ -e ../ncbi.latest/nextclade.tsv ]; then - cp ../ncbi.latest/nextclade.tsv . - cut -f 1 nextclade.tsv | sort -u > nextclade.prev.names - comm -23 gb.names nextclade.prev.names > nextclade.names - faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa - faSplit about nextclade.fa 30000000 $splitDir/chunk -else - cp /dev/null nextclade.tsv - faSplit about <(xzcat genbank.fa.xz) 300000000 $splitDir/chunk -fi -if (( $(ls -1 splitForNextclade | wc -l) > 0 )); then +zcat ../ncbi.latest/nextclade.full.tsv.gz > nextclade.full.tsv +cp ../ncbi.latest/nextalign.fa.xz . +comm -23 gb.names <(cut -f 1 nextclade.full.tsv | sort -u) > nextclade.names +if [ -s nextclade.names ]; then + faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \ + | faSplit about stdin 30000000 $splitDir/chunk nDataDir=~angie/github/nextclade/data/sars-cov-2 outDir=$(mktemp -d) outTsv=$(mktemp) for chunkFa in $splitDir/chunk*.fa; do nextclade -j 50 -i $chunkFa \ - --input-root-seq $nDataDir/reference.fasta \ - --input-tree $nDataDir/tree.json \ - --input-qc-config $nDataDir/qc.json \ + --input-dataset $nDataDir \ --output-dir $outDir \ + --output-basename out \ --output-tsv $outTsv >& nextclade.log - cut -f 1,2 $outTsv | tail -n+2 | sed -re 's/"//g;' >> nextclade.tsv + tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv + xz -T 20 < $outDir/out.aligned.fasta >> nextalign.fa.xz rm $outTsv done rm -rf $outDir fi -wc -l nextclade.tsv -rm -rf $splitDir nextclade.fa +wc -l nextclade.full.tsv +pigz -f -p 8 nextclade.full.tsv +rm -rf $splitDir conda activate pangolin runPangolin() { fa=$1 out=$fa.pangolin.csv logfile=$(mktemp) pangolin $fa --outfile $out > $logfile 2>&1 rm $logfile } export -f runPangolin if [ -e ../ncbi.latest/lineage_report.csv ]; then cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names comm -23 gb.names pangolin.prev.names > pangolin.names