77d911dafe3d0fefac4aa3f9e3b5982df129815f angie Tue Jul 13 09:43:39 2021 -0700 join -a doesn't include placeholder columns when there is no matching line, so make output columns explicit; otherwise country can end up in clade column. diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh index 93c5982..60267d1 100755 --- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh +++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh @@ -40,33 +40,33 @@ # Lineage & clade assignments sort chunks/pangolin.tsv \ > tmp.lineage sort chunks/nextclade.tsv \ > tmp.clade # Countries -- go back to unstripped sequence names: xzcat chunks/gisaid_epi_isl_*.fa.xz \ | grep ^\> \ | sed -re 's@^>hCo[Vv]-19/+@@;' \ | $scriptDir/gisaidNameToCountry.pl \ | sort \ > tmp.country # Join locally computed fields and sort by EPI ID for joining with latest real nextmeta join -t$'\t' -a 1 tmp.first3 tmp.lengths \ -| join -t$'\t' -a 1 - tmp.clade \ -| join -t$'\t' -a 1 - tmp.lineage \ -| join -t$'\t' -a 1 - tmp.country \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,2.2 - tmp.clade \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2 - tmp.lineage \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - tmp.country \ | tawk '{print $3, $2, $4, $5, $6, $7, $8;}' \ | sort \ > tmp.epiToLocalMeta # Join with latest real nextmeta and put locally computed fields in nextmeta column positions. # Last real nextmeta has 27 columns. These are the columns we can fill in: #1 strain #3 gisaid_epi_isl #4 genbank_accession # fold in later, after updating mapping #5 date #7 country #14 length #18 Nextstrain_clade #19 pangolin_lineage # Fill in other columns from nextmeta when possible (join on EPI ID since names change over time) set +o pipefail