3c3aea0b7973d9497b04647d7c8ed4ebb0f64f5c angie Fri Dec 20 15:40:01 2024 -0800 Tweak to run one more command in parallel. diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh index b4f3e58..7c0369c 100755 --- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh +++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh @@ -39,38 +39,38 @@ faSize -detailed <(xzcat gisaid_fullNames_$today.fa.xz) | sort -u > tmp.lengths & # Lineage & clade assignments sort -u chunks/pangolin.tsv \ > tmp.lineage & sed -re 's/"//g;' chunks/nextclade.tsv \ | sort -u \ > tmp.clade & # Countries -- go back to unstripped sequence names: xzcat chunks/gisaid_epi_isl_*.fa.xz \ | grep ^\> \ | sed -re 's@^>hCo[Vv]-19/+@@;' \ | $scriptDir/gisaidNameToCountry.pl \ | sort -u \ > tmp.country & -wait - # Make fasta with strain-name headers a la nextfasta. xzcat gisaid_fullNames_$today.fa.xz \ | sed -re '/^>/ s/\|.*//' \ | xz -T 20 \ > sequences_batch_$today.fa.xz & +wait + # Join locally computed fields and sort by EPI ID for joining with latest real nextmeta join -t$'\t' -a 1 tmp.first3 tmp.lengths \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,2.2 - tmp.clade \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2 - tmp.lineage \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - tmp.country \ | tawk '{print $3, $2, $4, $5, $6, $7, $8;}' \ | sort -u \ > tmp.epiToLocalMeta # Join with latest real nextmeta and put locally computed fields in nextmeta column positions. # Last real nextmeta has 27 columns. These are the columns we can fill in: #1 strain #3 gisaid_epi_isl #4 genbank_accession # fold in later, after updating mapping #5 date #7 country