abdea7a81b3d511adaee6300277721a804f5581f
angie
Tue Mar 30 11:57:47 2021 -0700
Ongoing fixes for odd characters in sequence names and other pipeline tweaks.

diff --git src/hg/utils/otto/sarscov2phylo/util.sh src/hg/utils/otto/sarscov2phylo/util.sh
index e64b80d..1373dbb 100755
--- src/hg/utils/otto/sarscov2phylo/util.sh
+++ src/hg/utils/otto/sarscov2phylo/util.sh
@@ -34,30 +34,30 @@
 | sed -re '/^[A-Z]+$/bx; s@[A-Za-z0-9]+ [a-z]*protein.*@@; :x;' \
 | sed -re 's@(( genomic)? RNA)?, ((nearly )?complete|partial) genome$@@;' \
 | sed -re 's@genome assembly(, complete genome)?: monopartite$@@;' \
 | sed -re 's@ (1 |nasopharyngeal )?genome assembly, chromosome: .*$@@;' \
 | sed -re 's@, complete sequence@@;' \
 | sed -re 's@humans, [A-Za-z]+,( [0-9]+ Years old)?( Adult)?/@@' \
 | sed -re 's@hCo[vV]-19/@@;' \
 | sed -re 's@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;' \
 | sed -re 's@SARS-CoV-2/(environment|ENV)/@env/@;' \
 | sed -re 's@SARS-CoV-2/Felis catus/@cat/@;' \
 | sed -re 's@SARS-CoV-2/Panthera leo/@lion/@;' \
 | sed -re 's@SARS-CoV-2/Panthera tigris/@tiger/@;' \
 | sed -re 's@SARS-CoV-2/@@;' \
 | sed -re 's@BetaCoV/@@;' \
 | sed -re 's@Homo sapines/@@;' \
- | sed -re 's@ \| @ \|@; s@ $@@; s@ @ @;' \
+ | sed -re 's@ \| @ \|@; s@ $@@; s@[:,]@ @g; s@ @ @g; s@[()]@@g;' \
 | sed -re 's@ \|@\t@;'
 # Got rid of this: s/ ([^|])/_\1/g;
 }
 export -f cleanGenbank
 cleanCncb () {
 sed -re "s@^BetaCoV/@@; s@^hCoV-19/@@; s@^SARS-CoV-2/@@; s@^human/@@; s@ *\| *@\t@;"
 }
 export -f cleanCncb