src/hg/utils/otto/sarscov2phylo/updateIdMapping.sh 7a2d50434d3fac99adc005ed3a68f9f00f92f9bf

7a2d50434d3fac99adc005ed3a68f9f00f92f9bf
angie
  Wed Mar 3 18:43:53 2021 -0800
Let some other script call getCogUk.sh and getNcbi.sh.  Preserve ncov-ingest mappings in case of duplicates.  Update local nextmeta-ish file with ID mapping when done.

diff --git src/hg/utils/otto/sarscov2phylo/updateIdMapping.sh src/hg/utils/otto/sarscov2phylo/updateIdMapping.sh
index 00ffc31..ecf96e6 100755
--- src/hg/utils/otto/sarscov2phylo/updateIdMapping.sh
+++ src/hg/utils/otto/sarscov2phylo/updateIdMapping.sh
@@ -14,53 +14,79 @@
 fi
 
 nextmeta=$1
 nextfasta=$2
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 
 today=$(date +%F)
 ottoDir=/hive/data/outside/otto/sarscov2phylo
 mapScriptDir=~/chris_ncov
 # Should use a better location than this...
 installDir=/hive/users/angie/gisaid
 
 ncbiDir=$ottoDir/ncbi.$today
-mkdir -p $ncbiDir
-cd $ncbiDir
-$scriptDir/getNcbi.sh >& getNcbi.log
-
 cogUkDir=$ottoDir/cogUk.$today
-mkdir -p $cogUkDir
-cd $cogUkDir
-$scriptDir/getCogUk.sh >& getCogUk.log
-
 # Last time I checked, CNCB had not updated since September, just keep using what we have
 cncbDir=$ottoDir/cncb.latest
 
 # Set up input files for Chris's scripts to map GISAID <--> public sequences
 cd $mapScriptDir
+rm -rf input/$today
 mkdir input/$today
 cd input/$today
 ln -sf $cncbDir/cncb.nonGenBank.fasta .
 ln -sf $ncbiDir/genbank.fa.xz .
 ln -sf $cogUkDir/cog_all.fasta.xz .
 ln -sf $nextfasta .
 xcat $nextmeta | tail -n+2 | cut -f1,3 | uniq > seqToEpi
 
 cd $mapScriptDir
-./build.sh -t $today
+time ./build.sh -t $today
 
 cd $installDir
 
 gbToDate=$ncbiDir/gbToDate
 cogUkToDate=$cogUkDir/cogUkToDate
 cncbToDate=$cncbDir/cncbToDate
 
 join -t$'\t' -a 1 -1 2 -o 1.1,1.2,1.3,2.2 \
     <(sort -k2,2 ~/chris_ncov/epiToPublicIdName.$today.txt) \
     <(sort $gbToDate $cncbToDate $cogUkToDate) \
-| sort \
+| sort -u adders - \
     > epiToPublicAndDate.$today
 
+# Look for duplicate ID swizzles relative to existing ncov-ingest mappings
+cut -f 1,2 epiToPublicAndDate.$today \
+| egrep $'\t''[A-Z][A-Z][0-9]+\.[0-9]+' \
+| sort > latestEpiToGb
+tail -n+2 ~/github/ncov-ingest/source-data/accessions.tsv \
+| tawk '{print $2, $1;}' \
+| sort > ncovEpiToGb
+wc -l latestEpiToGb ncovEpiToGb
+# Replace disagreeing IDs with ncov-ingest IDs.
+join -t$'\t' ncovEpiToGb latestEpiToGb \
+| tawk '$2 != $3 {print "s/"$3"/"$2"/;";}' \
+| sed -re 's/\./\\./;' \
+    > fix.sed
+sed -f fix.sed epiToPublicAndDate.$today > tmp
+mv tmp epiToPublicAndDate.$today
+
 ln -sf epiToPublicAndDate.$today epiToPublicAndDate.latest
+
+# Now that mapping is updated, add GenBank accessions to 4th column of metadata
+zcat $nextmeta | cut -f 4 | grep ... | wc -l
+set +o pipefail
+zcat $nextmeta | head -1 > tmp
+set -o pipefail
+zcat $nextmeta \
+| tail -n+2 \
+| sort -t $'\t' -k3,3 \
+| join -t$'\t' -a 1 -1 3 - <(tawk '$2 ~ /^[A-Z][A-Z][0-9]+/' epiToPublicAndDate.latest) \
+    -o 1.1,1.2,1.3,2.2,1.5,1.6,1.7,1.8,1.9,1.10,1.11,1.12,1.13,1.14,1.15,1.16,1.17,1.18,1.19,1.20,1.21,1.22,1.23,1.24,1.25,1.26,1.27 \
+| sort \
+    >> tmp
+wc -l tmp
+gzip -c tmp > $nextmeta
+rm tmp
+zcat $nextmeta | cut -f 4 | grep ... | wc -l