6068fc9cbd3f8aa5f70f0535a5086423cf0c8d4c angie Wed Sep 22 20:42:55 2021 -0700 Move the construction of new.masked.vcf.gz out of updateCombinedTree.sh into makeNewMaskedVcf.sh and move out the metadata stuff into combineMetadata.sh. diff --git src/hg/utils/otto/sarscov2phylo/combineMetadata.sh src/hg/utils/otto/sarscov2phylo/combineMetadata.sh new file mode 100755 index 0000000..7983371 --- /dev/null +++ src/hg/utils/otto/sarscov2phylo/combineMetadata.sh @@ -0,0 +1,113 @@ +#!/bin/bash +set -beEu -x -o pipefail + +# Do not modify this script, modify the source tree copy: +# kent/src/hg/utils/otto/sarscov2phylo/updateCombinedTree.sh + +usage() { + echo "usage: $0 prevDate today" +} + +if [ $# != 2 ]; then + usage + exit 1 +fi + +prevDate=$1 +today=$2 + +ottoDir=/hive/data/outside/otto/sarscov2phylo +ncbiDir=$ottoDir/ncbi.latest +cogUkDir=$ottoDir/cogUk.latest +cncbDir=$ottoDir/cncb.latest +gisaidDir=/hive/users/angie/gisaid +minReal=20000 +scriptDir=$(dirname "${BASH_SOURCE[0]}") +source $scriptDir/util.sh + +mkdir -p $ottoDir/$today +cd $ottoDir/$today + +prevMeta=$ottoDir/$prevDate/public-$prevDate.metadata.tsv.gz + +# Metadata for hgPhyloPlace: +# Header names same as nextmeta (with strain first) so hgPhyloPlace recognizes them: +echo -e "strain\tgenbank_accession\tdate\tcountry\thost\tcompleteness\tlength\tNextstrain_clade\tpangolin_lineage" \ + > gisaidAndPublic.$today.metadata.tsv +# It's not always possible to recreate both old and new names correctly from metadata, +# so make a file to translate accession or COG-UK to the name used in VCF, tree and protobufs. +awk -F\| '{ if ($3 == "") { print $1 "\t" $0; } else { print $2 "\t" $0; } }' samples.$today \ +| sort \ + > idToName +# NCBI metadata for COG-UK: strip COG-UK/ & United Kingdom:, add country & year to name +grep COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '$8 >= '$minReal' {print $1, $3, $4, $5, $4 "/" $6 "/" $3 "|" $1 "|" $3, $8;}' \ +| sed -re 's@COG-UK/@@g; s/United Kingdom://g; s/(\/[0-9]{4})(-[0-9]+)*/\1/; + s@Northern Ireland/@NorthernIreland/@;' \ + > tmp +# NCBI metadata for non-COG-UK (strip colon-separated location after country if present): +grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ +| tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \ +| sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \ +| perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \ +| cleanGenbank \ +| sort tmp - > gb.metadata +if [ -e $ncbiDir/lineage_report.csv ]; then + echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv + tail -n+2 $ncbiDir/lineage_report.csv \ + | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \ + | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \ + | sort \ + > gbToLineage +else + echo Getting GenBank Pangolin lineages from $prevMeta + zcat $prevMeta \ + | tail -n+2 \ + | tawk '$2 != "" && $8 != "" { print $2, $8; }' \ + | sort \ + > gbToLineage +fi +wc -l gbToLineage +if [ -e $ncbiDir/nextclade.tsv ]; then + sort $ncbiDir/nextclade.tsv > gbToNextclade +else + touch gbToNextclade +fi +wc -l gbToNextclade +join -t$'\t' -a 1 gb.metadata gbToNextclade \ +| join -t$'\t' -a 1 - gbToLineage \ +| tawk '{ if ($2 == "") { $2 = "?"; } + print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +# COG-UK metadata: +if [ -e $cogUkDir/nextclade.tsv ]; then + sort $cogUkDir/nextclade.tsv > cogUkToNextclade +else + touch cogUkToNextclade +fi +#*** Could also add sequence length to metadata from faSizes output... +tail -n+2 $cogUkDir/cog_metadata.csv \ +| awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \ +| sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \ +| sort \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +# CNCB metadata: +tail -n+2 $cncbDir/cncb.metadata.tsv \ +| tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") { + print $2, "", $10, $11, $9, $5, $6} }' \ +| sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s/Sapiens/sapiens/;' \ +| sort \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \ +| join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \ +| join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ + >> gisaidAndPublic.$today.metadata.tsv +wc -l gisaidAndPublic.$today.metadata.tsv +zcat $gisaidDir/metadata_batch_$today.tsv.gz \ +| grep -Fwf <(grep EPI_ISL samples.$today | cut -d\| -f 2) \ +| tawk '{print $1 "|" $3 "|" $5, "", $5, $7, $15, $13, $14, $18, $19;}' \ + >> gisaidAndPublic.$today.metadata.tsv +wc -l gisaidAndPublic.$today.metadata.tsv +gzip -f gisaidAndPublic.$today.metadata.tsv