5f1c3399aaa5c1bcd341470a156cb528073546d3 angie Thu Jan 13 11:39:32 2022 -0800 Uniqueify to prevent dups from several possible sources. diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh index b29043f..235aa79 100755 --- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh +++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh @@ -21,54 +21,54 @@ cd /hive/users/angie/gisaid # Glom all the chunks together. # Remove initial "hCoV-19/" and remove spaces a la nextmeta (e.g. "Hong Kong" -> "HongKong"). # Strip single quotes (e.g. "Cote d'Ivoire" --> "CotedIvoire"). # Also remove a stray comma in a name that caused Newick parsing error ("Hungary/US-32533w,/2020"). # Keep the strain|epiId|date "full names". time xzcat chunks/gisaid_epi_isl_*.fa.xz \ | sed -re 's@^>hCo[Vv]-19/+@>@; s/[ '"'"',()]//g; s/\r$//;' \ | xz -T 8 \ > gisaid_fullNames_$today.fa.xz # Make tmp files with a fullName key and various columns that we'll join together. fastaNames gisaid_fullNames_$today.fa.xz \ | awk -F\| -vOFS="\t" '{print $0, $1, $2, $3;}' \ -| sort \ +| sort -u \ > tmp.first3 # Sequence length -faSize -detailed <(xzcat gisaid_fullNames_$today.fa.xz) | sort > tmp.lengths +faSize -detailed <(xzcat gisaid_fullNames_$today.fa.xz) | sort -u > tmp.lengths # Lineage & clade assignments -sort chunks/pangolin.tsv \ +sort -u chunks/pangolin.tsv \ > tmp.lineage -sort chunks/nextclade.tsv \ +sort -u chunks/nextclade.tsv \ > tmp.clade # Countries -- go back to unstripped sequence names: xzcat chunks/gisaid_epi_isl_*.fa.xz \ | grep ^\> \ | sed -re 's@^>hCo[Vv]-19/+@@;' \ | $scriptDir/gisaidNameToCountry.pl \ -| sort \ +| sort -u \ > tmp.country # Join locally computed fields and sort by EPI ID for joining with latest real nextmeta join -t$'\t' -a 1 tmp.first3 tmp.lengths \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,2.2 - tmp.clade \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,2.2 - tmp.lineage \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - tmp.country \ | tawk '{print $3, $2, $4, $5, $6, $7, $8;}' \ -| sort \ +| sort -u \ > tmp.epiToLocalMeta # Join with latest real nextmeta and put locally computed fields in nextmeta column positions. # Last real nextmeta has 27 columns. These are the columns we can fill in: #1 strain #3 gisaid_epi_isl #4 genbank_accession # fold in later, after updating mapping #5 date #7 country #14 length #18 Nextstrain_clade #19 pangolin_lineage # Fill in other columns from nextmeta when possible (join on EPI ID since names change over time) set +o pipefail zcat $lastRealNextmeta | head -1 \ > metadata_batch_$today.tsv