47541d92430fc9e07cd0bab4ba386faaa1f3d76c
angie
  Sat Dec 23 18:27:34 2023 -0800
Remove duplicates between COG-UK and INSDC.  Fix name format that omitted INSDC accession.  Get INSDC accessions for COG-UK sequences in lineageProposalsRecombinants file.

diff --git src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
index d4c8ed0..39cd74e 100755
--- src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
+++ src/hg/utils/otto/sarscov2phylo/makeNewMaskedVcf.sh
@@ -86,47 +86,59 @@
 cut -f 1 prevIdToNameDeDup | grep -E '^(England|Northern|Scotland|Wales)' > cog.acc
 comm -13 <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1 | sort) cog.acc > cog.removed.acc
 if [ -s cog.removed.acc ]; then
     grep -Fwf cog.removed.acc prevIdToName \
     | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_)' \
     | cut -f 2 >> prevNameToRemove
 fi
 cut -f 1 prevIdToNameDeDup \
 | grep -vE '^([A-Z]{2}[0-9]{6}\.[0-9]+|EPI_ISL_|England|Northern|Scotland|Wales)' \
     > cncb.acc
 comm -13 <(cut -f 2 $cncbDir/cncb.metadata.tsv | sort) cncb.acc > cncb.removed.acc
 if [ -s cncb.removed.acc ]; then
     grep -Fwf cncb.removed.acc prevIdToName | cut -f 2 >> prevNameToRemove
 fi
 
+# If a sequence is in both INSDC and COG-UK, keep the INSDC and remove the COG-UK as dup.
+set +o pipefail
+cut -f 6 $ncbiDir/ncbi_dataset.plusBioSample.tsv \
+| grep ^COG-UK/ \
+| sed -re 's@COG-UK/@@' \
+| grep -Fwf - prevNames \
+| grep -vE '\|[A-Z]{2}[0-9]{6}\.[0-9]+\|' \
+| grep -vE '\|EPI_ISL_[0-9]+\|' \
+| cat \
+    >> prevNameToRemove
+set -o pipefail
+
 # Remove duplicates and withdrawn sequences
 if [ -s prevNameToRemove ]; then
     rm -f prevDedup.pb
     $matUtils extract -i $baseProtobuf \
         -p -s <(sort -u prevNameToRemove) \
         -u prevDedupNames \
         -o prevDedup.pb
 else
     ln -sf $baseProtobuf prevDedup.pb
     $matUtils extract -i prevDedup.pb -u prevDedupNames
 fi
 
 function gbAccCogRenaming {
     # pipeline: one INSDC accession per line of stdin, acc to full name if COG-UK on stdout
     grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | grep COG-UK/ \
-    | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6  "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|?"; } } }' \
+    | tawk '{ if ($4 != "") { print $1, $4 "/" $6 "/" $3 "|" $1 "|" $3; } else { if ($3 != "") { print $1, $6  "/" $3 "|" $1 "|" $3; } else { print $1, $6 "|" $1 "|?"; } } }' \
     | sed -re 's@COG-UK/@@g; s/United Kingdom://; s/(\/[0-9]{4})(-[0-9]+)*/\1/; s/ //g;'
 }
 
 function gbAccNonCogRenaming {
     # pipeline: one INSDC accession per line of stdin, acc to full name if non-COG-UK on stdout
     grep -Fwf - $ncbiDir/ncbi_dataset.plusBioSample.tsv \
     | grep -v COG-UK/ \
     | cleanGenbank \
     | tawk '{ if ($3 == "") { $3 = "?"; }
               if ($6 != "") { print $1 "\t" $6 "|" $1 "|" $3; }
               else { print $1 "\t" $1 "|" $3; } }' \
     | sed -re 's/ /_/g'
 }
 
 # To update names that have changed and simplify detection of new sequences to add, relate to acc.
@@ -194,35 +206,40 @@
     > dropoutContam.ids
 # Also exclude sequences with unbelievably low numbers of mutations given sampling dates.
 zcat $gisaidDir/chunks/nextclade.full.tsv.gz | cut -f 1,10 \
 | awk -F\| '{ if ($3 == "") { print $1 "\t" $2; } else { print $2 "\t" $3; } }' \
 | $scriptDir/findRefBackfill.pl > gisaid.refBackfill
 zcat $ncbiDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
 | join -t $'\t' <(cut -f 1,3 $ncbiDir/ncbi_dataset.plusBioSample.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > gb.refBackfill
 zcat $cogUkDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
 | join -t $'\t' <(zcat $cogUkDir/cog_metadata.csv.gz | cut -d, -f 1,5 | tr , $'\t' | sort) - \
 | $scriptDir/findRefBackfill.pl > cog.refBackfill
 zcat $cncbDir/nextclade.full.tsv.gz | cut -f 1,10 | sort \
 | join -t$'\t' <(cut -f 2,10 $cncbDir/cncb.metadata.tsv | sort) - \
 | $scriptDir/findRefBackfill.pl > cncb.refBackfill
 cut -f 1 *.refBackfill > refBackfill.ids
+curl -sS $lineageProposalsRecombinants  | tail -n+2 | cut -f 1 \
+| sed -re 's@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@;
+           s/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;' \
+    > tmp
+grep -Fwf tmp $epiToPublic | cut -f 2 | grep -E '^[A-Z]{2}[0-9]{6}' > tmp2
+sort -u tmp tmp2 > lpRecombinantIds
+rm tmp tmp2
 sort -u ../tooManyEpps.ids ../badBranchSeed.ids dropoutContam.ids refBackfill.ids \
 | grep -vFwf <(tail -n+2 $scriptDir/includeRecombinants.tsv | cut -f 1) \
-| grep -vFwf <(curl -sS $lineageProposalsRecombinants  | tail -n+2 | cut -f 1 \
-               | sed -re 's/.*(EPI_ISL_[0-9]+|[A-Z]{2}[0-9]+{6}(\.[0-9]+)?).*/\1/;
-                          s@(England|Northern[ _]?Ireland|Scotland|Wales)/([A-Z0-9_-]+).*@\2@;') \
+| grep -vFwf lpRecombinantIds \
     > exclude.ids
 
 # Get new GenBank sequences with at least $minReal non-N bases.
 xzcat $ncbiDir/genbank.fa.xz \
 | faSomeRecords -exclude stdin <(cat prevGbAcc exclude.ids) newGenBank.fa
 faSize -veryDetailed newGenBank.fa \
 | tawk '$4 < '$minReal' {print $1;}' \
     > gbTooSmall
 # NCBI also includes NC_045512 in the download, but that's our reference, so... exclude that too.
 echo NC_045512.2 >> gbTooSmall
 faSomeRecords -exclude newGenBank.fa gbTooSmall newGenBank.filtered.fa
 faSize newGenBank.filtered.fa
 
 # Get new COG-UK sequences with at least $minReal non-N bases.
 # Also exclude cog_all.fasta sequences not found in cog_metadata.csv.