70e001e0f7a71f53c7a70b45feabac07d9d3751b angie Mon Dec 14 16:35:16 2020 -0800 Update sarscov2phylo process to use tree and multi-seq ali's downloaded from GISAID instead of tree from github and aligning nextfasta locally. diff --git src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh index e7a49fb..bace556 100755 --- src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh +++ src/hg/utils/otto/sarscov2phylo/extractUnmappedPublic.sh @@ -35,31 +35,31 @@ renaming=unmapped.renaming scriptDir=$(dirname "${BASH_SOURCE[0]}") source $scriptDir/util.sh # faFilter discards fasta header info after the first word, so before running it, # make files that associate IDs and strain names. fastaNames $genbankFa | cleanGenbank | sort > gbAccName # COG-UK has only names, so no need to associate anything. # CNCB puts name first, then ID, so swap the order: fastaNames $cncbFa | cleanCncb \ | tawk '{print $2, $1;}' | sort > cncbAccName # Filter minSize and exclude sequences that were mapped to GISAID IDs -join -t$'\t' tree.renaming $epiToPublic | cut -f 3 > mappedIds +join -t$'\t' epi.renaming $epiToPublic | cut -f 3 > mappedIds xcat $genbankFa \ | faFilter -minSize=$minSize stdin stdout \ | faSomeRecords -exclude stdin mappedIds genbank.unmapped.fa fastaSeqCount genbank.unmapped.fa #*** TODO: also exclude COG-UK sequences that are in GenBank (some with incomplete names) xcat $cogUkFa \ | faFilter -minSize=$minSize stdin stdout \ | faSomeRecords -exclude stdin mappedIds cogUk.unmapped.fa fastaSeqCount cogUk.unmapped.fa #*** TODO: also exclude CNCB sequences that are in GenBank (some with incomplete names) # Tweak CNCB's fasta headers around to use acc not name: xcat $cncbFa \ | sed -re 's/^>.*\| */>/;' \ | faFilter -minSize=$minSize stdin stdout \ | faSomeRecords -exclude stdin mappedIds cncb.unmapped.fa