9fb2b35720b02cca3a9cece5cc8cdb782f86ed34 angie Tue Oct 31 09:56:35 2023 -0700 Compress some previously uncompressed files so we don't waste quite so much disk space. Also spare sequences listed in sars-cov-2-variants/lineage-proposals repo file recombinants.tsv from the nextclade-based filters. diff --git src/hg/utils/otto/sarscov2phylo/combineMetadata.sh src/hg/utils/otto/sarscov2phylo/combineMetadata.sh index 56f5862..1d35774 100755 --- src/hg/utils/otto/sarscov2phylo/combineMetadata.sh +++ src/hg/utils/otto/sarscov2phylo/combineMetadata.sh @@ -76,31 +76,32 @@ wc -l gbToNextclade join -t$'\t' -a 1 gb.metadata gbToNextclade \ | join -t$'\t' -a 1 - gbToLineage \ | tawk '{ if ($2 == "") { $2 = "?"; } print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ | uniq \ >> gisaidAndPublic.$today.metadata.tsv # COG-UK metadata: if [ -e $cogUkDir/nextclade.full.tsv.gz ]; then zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,2 | sed -re 's/"//g' | sort -u > cogUkToNextclade else touch cogUkToNextclade fi #*** Could also add sequence length to metadata from faSizes output... -tail -n+2 $cogUkDir/cog_metadata.csv \ +zcat $cogUkDir/cog_metadata.csv.gz \ +| tail -n+2 \ | awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \ | sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv # CNCB metadata: tail -n+2 $cncbDir/cncb.metadata.tsv \ | tawk '{ if ($3 != "GISAID" && $3 != "GenBank" && $3 != "Genbank") { print $2, "", $10, $11, $9, $5, $6} }' \ | sed -re 's@\t([A-Za-z -]+)( / [A-Za-z -'"'"']+)+\t@\t\1\t@; s/Sapiens/sapiens/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 - $cncbDir/nextclade.tsv \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,1.8,2.2 - $cncbDir/pangolin.tsv \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \