src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh abdea7a81b3d511adaee6300277721a804f5581f

abdea7a81b3d511adaee6300277721a804f5581f
angie
  Tue Mar 30 11:57:47 2021 -0700
Ongoing fixes for odd characters in sequence names and other pipeline tweaks.

diff --git src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
index 0e382d1..58060e4 100755
--- src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
+++ src/hg/utils/otto/sarscov2phylo/gisaidFromChunks.sh
@@ -9,37 +9,36 @@
 
 lastRealNextmeta=metadata_2020-12-08_20-35.tsv.gz
 
 today=$(date +%F)
 
 # Run pangolin and nextclade on any chunks that need it
 cd /hive/users/angie/gisaid/chunks
 make
 
 cd /hive/users/angie/gisaid
 # Glom all the chunks together.
 # Remove initial "hCoV-19/" (or "hCov-19/", plus any repeated slashes) and remove spaces a la nextmeta (e.g. "Hong Kong" -> "HongKong").
 # Also remove a stray comma in a name that caused a Newick parsing error ("Hungary/US-32533w,/2020") and strip trailing carriage returns.
 # Keep the strain|epiId|date "full names".
 time xzcat chunks/gisaid_epi_isl_*.fa.xz \
-| sed -re 's@^>hCo[Vv]-19/@>@; s/ //g; s/,//;' \
+| sed -re 's@^>hCo[Vv]-19/+@>@; s/ //g; s/,//;  s/\r$//;' \
 | xz -T 50 \
     > gisaid_fullNames_$today.fa.xz
 
 # Make tmp files with a fullName key and various columns that we'll join together.
 fastaNames gisaid_fullNames_$today.fa.xz \
-| sed -e 's/\r$//' \
 | awk -F\| -vOFS="\t" '{print $0, $1, $2, $3;}' \
 | sort \
     > tmp.first3
 # Sequence length
 faSize -detailed  <(xzcat gisaid_fullNames_$today.fa.xz) | sort > tmp.lengths
 # Lineage & clade assignments
 sort chunks/pangolin.tsv \
     > tmp.lineage
 sort chunks/nextclade.tsv \
     > tmp.clade
 # Join locally computed fields and sort by EPI ID for joining with latest real nextmeta
 join -t$'\t' -a 1 tmp.first3 tmp.lengths \
 | join -t$'\t' -a 1 - tmp.clade \
 | join -t$'\t' -a 1 - tmp.lineage \
 | tawk '{print $3, $2, $4, $5, $6, $7;}' \