1e6e36834dceedd5c2cb79834bcc84ee0bac4be5
angie
Mon Jun 28 08:44:36 2021 -0700
Use make -j for pangolin (no multithreading at this point).

diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
index 1bd7e1a..ffe1eb9 100755
--- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
+++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
@@ -1,31 +1,32 @@
 #!/bin/bash
 source ~/.bashrc
 set -beEu -o pipefail
 
 # Do not modify this script, modify the source tree copy:
 # kent/src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
 
 # Make nextfasta and nextmeta substitute files from chunks of downloaded GISAID sequences
 
 lastRealNextmeta=metadata_2020-12-08_20-35.tsv.gz
 
 today=$(date +%F)
 
 # Run pangolin and nextclade on any chunks that need it
 cd /hive/users/angie/gisaid/chunks
-make
+make nextclade.tsv
+make -j10
 cd /hive/users/angie/gisaid
 
 # Glom all the chunks together.
 # Remove initial "hCoV-19/" and remove spaces a la nextmeta (e.g. "Hong Kong" -> "HongKong").
 # Strip single quotes (e.g. "Cote d'Ivoire" --> "CotedIvoire").
 # Also remove a stray comma in a name that caused Newick parsing error ("Hungary/US-32533w,/2020").
 # Keep the strain|epiId|date "full names".
 time xzcat chunks/gisaid_epi_isl_*.fa.xz \
 | sed -re 's@^>hCo[Vv]-19/+@>@; s/[ '"'"',()]//g; s/\r$//;' \
 | xz -T 50 \
 > gisaid_fullNames_$today.fa.xz
 
 # Make tmp files with a fullName key and various columns that we'll join together.
 fastaNames gisaid_fullNames_$today.fa.xz \
 | awk -F\| -vOFS="\t" '{print $0, $1, $2, $3;}' \