72739f248f43d3685eb2ca5fd464f73102fb77a4
angie
Thu Jan 13 11:36:29 2022 -0800
Increase split fasta size for nextclade run - more efficient.

diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh
index 17b617b..79e9964 100755
--- src/hg/utils/otto/sarscov2phylo/getNcbi.sh
+++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh
@@ -72,31 +72,31 @@
 # Run pangolin and nextclade on sequences that are new since yesterday
 export TMPDIR=/dev/shm
 fastaNames genbank.fa.xz | awk '{print $1;}' | sed -re 's/\|.*//' | grep -vx pdb | sort -u > gb.names
 splitDir=splitForNextclade
 rm -rf $splitDir
 mkdir $splitDir
 if [ -e ../ncbi.latest/nextclade.tsv ]; then
     cp ../ncbi.latest/nextclade.tsv .
     cut -f 1 nextclade.tsv | sort -u > nextclade.prev.names
     comm -23 gb.names nextclade.prev.names > nextclade.names
     faSomeRecords <(xzcat genbank.fa.xz) nextclade.names nextclade.fa
     faSplit about nextclade.fa 30000000 $splitDir/chunk
 else
     cp /dev/null nextclade.tsv
-    faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk
+    faSplit about <(xzcat genbank.fa.xz) 300000000 $splitDir/chunk
 fi
 if (( $(ls -1 splitForNextclade | wc -l) > 0 )); then
     nDataDir=~angie/github/nextclade/data/sars-cov-2
     outDir=$(mktemp -d)
     outTsv=$(mktemp)
     for chunkFa in $splitDir/chunk*.fa; do
         nextclade -j 50 -i $chunkFa \
             --input-root-seq $nDataDir/reference.fasta \
             --input-tree $nDataDir/tree.json \
             --input-qc-config $nDataDir/qc.json \
             --output-dir $outDir \
             --output-tsv $outTsv >& nextclade.log
         cut -f 1,2 $outTsv | tail -n+2 | sed -re 's/"//g;' >> nextclade.tsv
         rm $outTsv
     done