f2690b5f40e7e33dc355782853e812c76f4736ea
angie
  Fri Dec 4 09:11:41 2020 -0800
Add minimal metadata file for public sequences for use by hgPhyloPlace.

diff --git src/hg/utils/otto/sarscov2phylo/mapPublic.sh src/hg/utils/otto/sarscov2phylo/mapPublic.sh
index aa43d05..67def73 100755
--- src/hg/utils/otto/sarscov2phylo/mapPublic.sh
+++ src/hg/utils/otto/sarscov2phylo/mapPublic.sh
@@ -89,15 +89,37 @@
     > public-$releaseLabel.minAf.01.vcf
 wc -l public-$releaseLabel.minAf.01.vcf
 bgzip -f public-$releaseLabel.minAf.01.vcf
 tabix -p vcf public-$releaseLabel.minAf.01.vcf.gz
 
 # And the lineage colors:
 # TODO: add Nextstrain clade colors
 for source in gisaid lineage nextstrain; do
     Source=${source^}
     zcat ${source}Colors.gz \
     | subColumn -miss=/dev/null 1 stdin treeToPublic stdout \
     | grep -v EPI_ISL > public${Source}Colors
     wc -l public${Source}Colors
     gzip -f public${Source}Colors
 done
+
+# Metadata file consumed by hgPhyloPlace.
+# The column names match nextmeta's (strain comes first) so hgPhyloPlace recognizes them:
+printf '%b\n' "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tpangolin_lineage" \
+    > public-$releaseLabel.metadata.tsv
+# NCBI metadata, rows with column 8 >= 29000 only (drop any ":location" text that follows the country):
+tawk '$8 >= 29000 { print $6, $1, $3, $4, $5, "", $8, ""; }' \
+    /hive/data/outside/otto/sarscov2phylo/ncbi.latest/ncbi_dataset.plusBioSample.tsv \
+| sed -r -e 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \
+         -e 's@SARS-Co[Vv]-2/human/@@; s@SARS-Co[Vv]-2/@@; s@hCo[Vv]-19/@@;' \
+    >> public-$releaseLabel.metadata.tsv
+# COG-UK metadata: drop the first line (presumably the CSV header); CSV fields 1, 4, 2, 6 fill the strain, date, country, pangolin_lineage columns, the rest stay empty:
+tail -n+2 /hive/data/outside/otto/sarscov2phylo/cogUk.latest/cog_metadata.csv \
+| awk -F, -v 'OFS=\t' '{print $1, "", $4, $2, "", "", "", $6; }' \
+    >> public-$releaseLabel.metadata.tsv
+# CNCB metadata: skip rows whose column 3 is GISAID/GenBank/Genbank; cut " / region" suffixes (hyphens and apostrophes allowed) after the country:
+tail -n+2 /hive/data/outside/otto/sarscov2phylo/cncb.latest/cncb.metadata.tsv \
+| tawk '$3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank" { print $1, "", $10, $11, $9, $5, $6, ""; }' \
+| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z '"'"'-]+)+\t@\t\1\t@;  s@BetaCo[Vv]/@@;' \
+| sed -re 's@SARS-Co[Vv]-2/human/@@; s@SARS-Co[Vv]-2/@@; s@hCo[Vv]-19/@@;' \
+    >> public-$releaseLabel.metadata.tsv
+gzip -f public-$releaseLabel.metadata.tsv