abdea7a81b3d511adaee6300277721a804f5581f angie Tue Mar 30 11:57:47 2021 -0700 Ongoing fixes for odd characters in sequence names and other pipeline tweaks. diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh index 0e382d1..58060e4 100755 --- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh +++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh @@ -9,37 +9,36 @@ lastRealNextmeta=metadata_2020-12-08_20-35.tsv.gz today=$(date +%F) # Run pangolin and nextclade on any chunks that need it cd /hive/users/angie/gisaid/chunks make cd /hive/users/angie/gisaid # Glom all the chunks together. # Remove initial "hCoV-19/" and remove spaces a la nextmeta (e.g. "Hong Kong" -> "HongKong"). # Also remove a stray comma in a name that caused Newick parsing error ("Hungary/US-32533w,/2020"). # Keep the strain|epiId|date "full names". time xzcat chunks/gisaid_epi_isl_*.fa.xz \ -| sed -re 's@^>hCo[Vv]-19/@>@; s/ //g; s/,//;' \ +| sed -re 's@^>hCo[Vv]-19/+@>@; s/ //g; s/,//; s/\r$//;' \ | xz -T 50 \ > gisaid_fullNames_$today.fa.xz # Make tmp files with a fullName key and various columns that we'll join together. fastaNames gisaid_fullNames_$today.fa.xz \ -| sed -e 's/\r$//' \ | awk -F\| -vOFS="\t" '{print $0, $1, $2, $3;}' \ | sort \ > tmp.first3 # Sequence length faSize -detailed <(xzcat gisaid_fullNames_$today.fa.xz) | sort > tmp.lengths # Lineage & clade assignments sort chunks/pangolin.tsv \ > tmp.lineage sort chunks/nextclade.tsv \ > tmp.clade # Join locally computed fields and sort by EPI ID for joining with latest real nextmeta join -t$'\t' -a 1 tmp.first3 tmp.lengths \ | join -t$'\t' -a 1 - tmp.clade \ | join -t$'\t' -a 1 - tmp.lineage \ | tawk '{print $3, $2, $4, $5, $6, $7;}' \