be4311c07e14feb728abc6425ee606ffaa611a58 markd Fri Jan 22 06:46:58 2021 -0800 merge with master
diff --git src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
index e7a49fb..bace556 100755
--- src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
+++ src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh
@@ -35,31 +35,31 @@ renaming=unmapped.renaming
 scriptDir=$(dirname "${BASH_SOURCE[0]}")
 source $scriptDir/util.sh
 # faFilter discards fasta header info after the first word, so before running it,
 # make files that associate IDs and strain names.
 fastaNames $genbankFa | cleanGenbank | sort > gbAccName
 # COG-UK has only names, so no need to associate anything.
 # CNCB puts name first, then ID, so swap the order:
 fastaNames $cncbFa | cleanCncb \
 | tawk '{print $2, $1;}' | sort > cncbAccName
 # Filter minSize and exclude sequences that were mapped to GISAID IDs
-join -t$'\t' tree.renaming $epiToPublic | cut -f 3 > mappedIds
+join -t$'\t' epi.renaming $epiToPublic | cut -f 3 > mappedIds
 xcat $genbankFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa
 fastaSeqCount genbank.unmapped.fa
 #*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names)
 xcat $cogUkFa \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa
 fastaSeqCount cogUk.unmapped.fa
 #*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names)
 # Tweak CNCB's fasta headers around to use acc not name:
 xcat $cncbFa \
 | sed -re 's/^>.*\| */>/;' \
 | faFilter -minSize=$minSize stdin stdout \
 | faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa