71a5bf4d42948792ba96729c01bec373eed86918 angie Sun Feb 4 09:16:38 2024 -0800 Use nextclade v3 output column indices. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index 6339a09..9cbdbea 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -81,31 +81,31 @@ # Replace FASTA headers with reconstructed names from enhanced metadata. time cleanGenbank < ncbi_dataset/data/genomic.fna \ | $scriptDir/fixNcbiFastaNames.pl ncbi_dataset.plusBioSample.tsv \ > genbank.maybeDups.fa time fastaNames genbank.maybeDups.fa | awk '{print $1 "\t" $0;}' > gb.rename time faUniqify genbank.maybeDups.fa stdout \ | faRenameRecords stdin gb.rename stdout \ | xz -T 20 \ > genbank.fa.xz # Run pangolin and nextclade on sequences that are new since yesterday export TMPDIR=/dev/shm fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names zcat ../ncbi.latest/nextclade.full.tsv.gz > nextclade.full.tsv cp ../ncbi.latest/nextalign.fa.xz . -comm -23 gb.names <(cut -f 1 nextclade.full.tsv | sort -u) > nextclade.names +comm -23 gb.names <(cut -f 2 nextclade.full.tsv | sort -u) > nextclade.names if [ -s nextclade.names ]; then nDataDir=~angie/github/nextclade/data/sars-cov-2 outTsv=$(mktemp) outFa=$(mktemp) faSomeRecords <(xzcat genbank.fa.xz) nextclade.names stdout \ | sed -re 's/^>([A-Z]{2}[0-9]{6}\.[0-9])[ |].*/>\1/;' \ | nextclade run -j 50 \ --input-dataset $nDataDir \ --output-fasta $outFa \ --output-tsv $outTsv >& nextclade.log tail -n+2 $outTsv | sed -re 's/"//g;' >> nextclade.full.tsv xz -T 20 < $outFa >> nextalign.fa.xz rm $outTsv $outFa fi wc -l nextclade.full.tsv @@ -118,31 +118,31 @@ fa=$1 out=$fa.pangolin.csv logfile=$(mktemp) pangolin -t 4 $fa --skip-scorpio --outfile $out > $logfile 2>&1 rm $logfile } export -f runPangolin if [ -e ../ncbi.latest/lineage_report.csv ]; then cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names comm -23 gb.names pangolin.prev.names > pangolin.names faSomeRecords <(xzcat genbank.fa.xz) pangolin.names stdout \ | sed -re '/^>/ s/^>([A-Z]{2}[0-9]{6}\.[0-9]+) \|[^,]+/>\1/;' \ > pangolin.fa - pangolin --skip-scorpio pangolin.fa >& pangolin.log + pangolin -t 20 --skip-scorpio pangolin.fa >& pangolin.log tail -n+2 lineage_report.csv >> linRepYesterday mv linRepYesterday lineage_report.csv rm -f pangolin.fa else splitDir=splitForPangolin rm -rf $splitDir mkdir $splitDir faSplit about <(xzcat genbank.fa.xz | sed -re '/^>/ s/ .*//;') 30000000 $splitDir/chunk find $splitDir -name chunk\*.fa \ | parallel -j 10 "runPangolin {}" head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv for f in $splitDir/chunk*.csv; do tail -n+2 $f >> lineage_report.csv done rm -rf $splitDir