src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh be4311c07e14feb728abc6425ee606ffaa611a58

be4311c07e14feb728abc6425ee606ffaa611a58
markd
  Fri Jan 22 06:46:58 2021 -0800
merge with master

diff --git src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
index e7a49fb..bace556 100755
--- src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
+++ src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
@@ -35,31 +35,31 @@
 renaming=unmapped.renaming
 
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 
 source $scriptDir/util.sh
 
 # faFilter discards fasta header info after the first word, so before running it,
 # make files that associate IDs and strain names.
 fastaNames $genbankFa | cleanGenbank | sort > gbAccName
 # COG-UK has only names, so no need to associate anything.
 # CNCB puts name first, then ID, so swap the order:
 fastaNames $cncbFa | cleanCncb \
 | tawk '{print $2, $1;}' | sort > cncbAccName
 
 # Filter minSize and exclude sequences that were mapped to GISAID IDs
-join -t$'\t' tree.renaming $epiToPublic | cut -f 3 > mappedIds
+join -t$'\t' epi.renaming $epiToPublic | cut -f 3 > mappedIds
 xcat $genbankFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa
 fastaSeqCount genbank.unmapped.fa
 #*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names)
 xcat $cogUkFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa
 fastaSeqCount cogUk.unmapped.fa
 #*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names)
 # Tweak CNCB's fasta headers around to use acc not name:
 xcat $cncbFa \
 | sed -re 's/^>.*\| */>/;' \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa