2c3a1d65d38fbd4b75eb55b3f8e854a452254459 angie Tue Nov 2 12:58:07 2021 -0700 Crank down on duplicate lines of metadata (reported by Theo Sanderson IIRC). diff --git src/hg/utils/otto/sarscov2phylo/combineMetadata.sh src/hg/utils/otto/sarscov2phylo/combineMetadata.sh index 7983371..87265c3 100755 --- src/hg/utils/otto/sarscov2phylo/combineMetadata.sh +++ src/hg/utils/otto/sarscov2phylo/combineMetadata.sh @@ -45,52 +45,53 @@ | sed -re 's@COG-UK/@@g; s/United Kingdom://g; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s@Northern Ireland/@NorthernIreland/@;' \ > tmp # NCBI metadata for non-COG-UK (strip colon-separated location after country if present): grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \ | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \ | sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \ | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \ | cleanGenbank \ | sort tmp - > gb.metadata if [ -e $ncbiDir/lineage_report.csv ]; then echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv tail -n+2 $ncbiDir/lineage_report.csv \ | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \ | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \ - | sort \ + | sort -u \ > gbToLineage else echo Getting GenBank Pangolin lineages from $prevMeta zcat $prevMeta \ | tail -n+2 \ | tawk '$2 != "" && $8 != "" { print $2, $8; }' \ - | sort \ + | sort -u \ > gbToLineage fi wc -l gbToLineage if [ -e $ncbiDir/nextclade.tsv ]; then - sort $ncbiDir/nextclade.tsv > gbToNextclade + sort -u $ncbiDir/nextclade.tsv > gbToNextclade else touch gbToNextclade fi wc -l gbToNextclade join -t$'\t' -a 1 gb.metadata gbToNextclade \ | join -t$'\t' -a 1 - gbToLineage \ | tawk '{ if ($2 == "") { $2 = "?"; } print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ +| uniq \ >> gisaidAndPublic.$today.metadata.tsv # COG-UK metadata: if [ -e $cogUkDir/nextclade.tsv ]; then sort $cogUkDir/nextclade.tsv > cogUkToNextclade else touch cogUkToNextclade fi #*** Could also add sequence length to metadata from faSizes output... tail -n+2 $cogUkDir/cog_metadata.csv \ | awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \ | sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \ | sort \ | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \ | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \ >> gisaidAndPublic.$today.metadata.tsv