src/hg/utils/otto/sarscov2phylo/combineMetadata.sh 2c3a1d65d38fbd4b75eb55b3f8e854a452254459

2c3a1d65d38fbd4b75eb55b3f8e854a452254459
angie
  Tue Nov 2 12:58:07 2021 -0700
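Crack down on duplicate lines of metadata (reported by Theo Sanderson, IIRC): de-duplicate the lineage and nextclade inputs with sort -u, and pass the joined GenBank metadata through uniq.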

diff --git src/hg/utils/otto/sarscov2phylo/combineMetadata.sh src/hg/utils/otto/sarscov2phylo/combineMetadata.sh
index 7983371..87265c3 100755
--- src/hg/utils/otto/sarscov2phylo/combineMetadata.sh
+++ src/hg/utils/otto/sarscov2phylo/combineMetadata.sh
@@ -45,52 +45,53 @@
 | sed -re 's@COG-UK/@@g; s/United Kingdom://g;  s/(\/[0-9]{4})(-[0-9]+)*/\1/;
            s@Northern Ireland/@NorthernIreland/@;' \
     > tmp
 # NCBI metadata for non-COG-UK (strip colon-separated location after country if present):
 grep -v COG-UK/ $ncbiDir/ncbi_dataset.plusBioSample.tsv \
 | tawk '$8 >= '$minReal' { print $1, $3, $4, $5, $6, $8; }' \
 | sed -re 's@\t([A-Za-z -]+):[^\t]+\t@\t\1\t@;' \
 | perl -wpe '@w = split("\t"); $w[4] =~ s/ /_/g; $_ = join("\t", @w);' \
 | cleanGenbank \
 | sort tmp - > gb.metadata
 if [ -e $ncbiDir/lineage_report.csv ]; then
     echo Getting GenBank Pangolin lineages from $ncbiDir/lineage_report.csv
     tail -n+2  $ncbiDir/lineage_report.csv \
     | sed -re 's/^([A-Z][A-Z][0-9]{6}\.[0-9]+)[^,]*/\1/;' \
     | awk -F, '$2 != "" && $2 != "None" {print $1 "\t" $2;}' \
-    | sort \
+    | sort -u \
         > gbToLineage
 else
     echo Getting GenBank Pangolin lineages from $prevMeta
     zcat $prevMeta \
     | tail -n+2 \
     | tawk '$2 != "" && $8 != "" { print $2, $8; }' \
-    | sort \
+    | sort -u \
         > gbToLineage
 fi
 wc -l gbToLineage
 if [ -e $ncbiDir/nextclade.tsv ]; then
-    sort $ncbiDir/nextclade.tsv > gbToNextclade
+    sort -u $ncbiDir/nextclade.tsv > gbToNextclade
 else
     touch gbToNextclade
 fi
 wc -l gbToNextclade
 join -t$'\t' -a 1 gb.metadata gbToNextclade \
 | join -t$'\t' -a 1 - gbToLineage \
 | tawk '{ if ($2 == "") { $2 = "?"; }
           print $1, $1, $2, $3, $4, "", $6, $7, $8; }' \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
+| uniq \
     >> gisaidAndPublic.$today.metadata.tsv
 # COG-UK metadata:
 if [ -e $cogUkDir/nextclade.tsv ]; then
     sort $cogUkDir/nextclade.tsv > cogUkToNextclade
 else
     touch cogUkToNextclade
 fi
 #*** Could also add sequence length to metadata from faSizes output...
 tail -n+2 $cogUkDir/cog_metadata.csv \
 | awk -F, -v 'OFS=\t' '{print $1, "", $5, $3, "", "", "", $7; }' \
 | sed -re 's/UK-ENG/England/; s/UK-NIR/Northern Ireland/; s/UK-SCT/Scotland/; s/UK-WLS/Wales/;' \
 | sort \
 | join -t$'\t' -a 1 -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2,1.8 - cogUkToNextclade \
 | join -t$'\t' -o 1.2,2.2,2.3,2.4,2.5,2.6,2.7,2.8,2.9 idToName - \
     >> gisaidAndPublic.$today.metadata.tsv