afedd6d0dae5767b8ee9041119d72a65360f77e0 angie Wed Oct 4 17:46:28 2023 -0700 Strip stuff past the accession in headers when making fasta for pangolin. diff --git src/hg/utils/otto/sarscov2phylo/getNcbi.sh src/hg/utils/otto/sarscov2phylo/getNcbi.sh index bf672dd..f96a26b 100755 --- src/hg/utils/otto/sarscov2phylo/getNcbi.sh +++ src/hg/utils/otto/sarscov2phylo/getNcbi.sh @@ -122,31 +122,33 @@ conda activate pangolin set -x runPangolin() { fa=$1 out=$fa.pangolin.csv logfile=$(mktemp) pangolin -t 4 $fa --skip-scorpio --outfile $out > $logfile 2>&1 rm $logfile } export -f runPangolin if [ -e ../ncbi.latest/lineage_report.csv ]; then cp ../ncbi.latest/lineage_report.csv linRepYesterday tail -n+2 linRepYesterday | sed -re 's/^([A-Z]+[0-9]+\.[0-9]+).*/\1/' | sort \ > pangolin.prev.names comm -23 gb.names pangolin.prev.names > pangolin.names - faSomeRecords <(xzcat genbank.fa.xz) pangolin.names pangolin.fa + faSomeRecords <(xzcat genbank.fa.xz) pangolin.names stdout \ + | sed -re '/^>/ s/^>([A-Z]{2}[0-9]{6}\.[0-9]+) \|[^,]+/>\1/;' \ + > pangolin.fa pangolin --analysis-mode pangolearn pangolin.fa >& pangolin.log tail -n+2 lineage_report.csv >> linRepYesterday mv linRepYesterday lineage_report.csv rm -f pangolin.fa else splitDir=splitForPangolin rm -rf $splitDir mkdir $splitDir faSplit about <(xzcat genbank.fa.xz) 30000000 $splitDir/chunk find $splitDir -name chunk\*.fa \ | parallel -j 10 "runPangolin {}" head -1 $(ls -1 $splitDir/chunk*.csv | head -1) > lineage_report.csv for f in $splitDir/chunk*.csv; do tail -n+2 $f >> lineage_report.csv done