src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh 70e001e0f7a71f53c7a70b45feabac07d9d3751b

70e001e0f7a71f53c7a70b45feabac07d9d3751b
angie
  Mon Dec 14 16:35:16 2020 -0800
Update sarscov2phylo process to use the tree and multi-sequence alignments downloaded from GISAID instead of the tree from GitHub and aligning nextfasta locally.

diff --git src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
index e7a49fb..bace556 100755
--- src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
+++ src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
@@ -35,31 +35,31 @@
 renaming=unmapped.renaming
 
 # Directory containing this script, so util.sh is located relative to the script
 # rather than to the caller's working directory.
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 # Quote the path so a script directory containing spaces or glob characters
 # is still passed to 'source' as a single word.
 source "$scriptDir/util.sh"
 
 # faFilter discards fasta header info after the first word, so before running it,
 # make files that associate IDs and strain names.
 # NOTE(review): fastaNames, cleanGenbank, cleanCncb and tawk are not defined in the
 # visible part of this script; presumably they come from util.sh -- confirm.
 # gbAccName: sorted, cleaned header names from the GenBank fasta.
 fastaNames $genbankFa | cleanGenbank | sort > gbAccName
 # COG-UK has only names, so no need to associate anything.
 # CNCB puts name first, then ID, so swap the order:
 # (the tawk step prints field 2 before field 1; the result is sorted into cncbAccName)
 fastaNames $cncbFa | cleanCncb \
 | tawk '{print $2, $1;}' | sort > cncbAccName
 
 # Filter minSize and exclude sequences that were mapped to GISAID IDs
 # mappedIds = third tab-separated column of joining the renaming file with $epiToPublic;
 # it is the exclusion list given to faSomeRecords -exclude in each pipeline below.
-join -t$'\t' tree.renaming $epiToPublic | cut -f 3 > mappedIds
+join -t$'\t' epi.renaming $epiToPublic | cut -f 3 > mappedIds
 # GenBank: apply the $minSize filter, then drop records listed in mappedIds;
 # the remaining sequences are written to genbank.unmapped.fa.
 xcat $genbankFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa
 # NOTE(review): fastaSeqCount is defined outside this view; presumably it reports
 # how many sequences the file contains -- confirm.
 fastaSeqCount genbank.unmapped.fa
 #*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names)
 # COG-UK: same minSize filter and mappedIds exclusion as the GenBank and CNCB inputs.
 # Each continuation line must begin with '|': without it, the faFilter words are
 # passed to xcat as extra arguments instead of running as the next pipeline stage.
 xcat $cogUkFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa
 fastaSeqCount cogUk.unmapped.fa
 #*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names)
 # Tweak CNCB's fasta headers around to use acc not name:
 # the sed expression replaces everything from '>' through the last '|' (plus any
 # following spaces) with a bare '>', leaving only the text after the final '|'.
 # The stream then gets the $minSize filter and the mappedIds exclusion, and the
 # result is written to cncb.unmapped.fa.
 xcat $cncbFa \
 | sed -re 's/^>.*\| */>/;' \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa