f2690b5f40e7e33dc355782853e812c76f4736ea angie Fri Dec 4 09:11:41 2020 -0800 Add minimal metadata file for public sequences for use by hgPhyloPlace. diff --git src/hg/utils/otto/sarscov2phylo/mapPublic.sh src/hg/utils/otto/sarscov2phylo/mapPublic.sh index aa43d05..67def73 100755 --- src/hg/utils/otto/sarscov2phylo/mapPublic.sh +++ src/hg/utils/otto/sarscov2phylo/mapPublic.sh @@ -89,15 +89,37 @@ > public-$releaseLabel.minAf.01.vcf wc -l public-$releaseLabel.minAf.01.vcf bgzip -f public-$releaseLabel.minAf.01.vcf tabix -p vcf public-$releaseLabel.minAf.01.vcf.gz # And the lineage colors: # TODO: add Nextstrain clade colors for source in gisaid lineage nextstrain; do Source=${source^} zcat ${source}Colors.gz \ | subColumn -miss=/dev/null 1 stdin treeToPublic stdout \ | grep -v EPI_ISL > public${Source}Colors wc -l public${Source}Colors gzip -f public${Source}Colors done + +# Metadata for hgPhyloPlace: +# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: +echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tpangolin_lineage" \ + > public-$releaseLabel.metadata.tsv +# NCBI metadata (strip colon-separated location after country if present): +tawk '$8 >= 29000 { print $6, $1, $3, $4, $5, "", $8, ""; }' \ + /hive/data/outside/otto/sarscov2phylo/ncbi.latest/ncbi_dataset.plusBioSample.tsv \ +| sed -re 's/\t([A-Za-z -]+):[A-Za-z0-9 ,()_-]+\t/\t\1\t/;' \ +| sed -re 's@SARS-Co[Vv]-2/human/@@; s@SARS-Co[Vv]-2/@@; s@hCo[Vv]-19/@@;' \ + >> public-$releaseLabel.metadata.tsv +# COG-UK metadata: +tail -n+2 /hive/data/outside/otto/sarscov2phylo/cogUk.latest/cog_metadata.csv \ +| awk -F, -v 'OFS=\t' '{print $1, "", $4, $2, "", "", "", $6; }' \ + >> public-$releaseLabel.metadata.tsv +# CNCB metadata: +tail -n+2 /hive/data/outside/otto/sarscov2phylo/cncb.latest/cncb.metadata.tsv \ +| tawk '$3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank" { print $1, "", $10, $11, $9, $5, $6, ""; }' \ +| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s@BetaCo[Vv]/@@;' \ +| sed -re 's@SARS-Co[Vv]-2/human/@@; s@SARS-Co[Vv]-2/@@; s@hCo[Vv]-19/@@;' \ + >> public-$releaseLabel.metadata.tsv +gzip -f public-$releaseLabel.metadata.tsv