c218231e30bb4cbc6e6536738585b3df1b3f673c angie Wed Mar 17 11:51:18 2021 -0700 Make cleanGenbank ~10x more efficient by splitting out sed commands to take advantage of many cores, and adding a test and branch to the slowest command. diff --git src/hg/utils/otto/sarscov2phylo/util.sh src/hg/utils/otto/sarscov2phylo/util.sh index 416a96d..e64b80d 100755 --- src/hg/utils/otto/sarscov2phylo/util.sh +++ src/hg/utils/otto/sarscov2phylo/util.sh @@ -15,48 +15,49 @@ export -f xcat fastaNames () { xcat $1 \ | grep ^\> | sed -re 's/^>//;' } export -f fastaNames fastaSeqCount () { xcat $1 \ | grep ^\> | wc -l } export -f fastaSeqCount cleanGenbank () { - sed -re "s@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@; - s@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@; - s@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@; - s@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@; - s@[A-Za-z0-9]+ [a-z]*protein.*@@; - s@(( genomic)? RNA)?, ((nearly )?complete|partial) genome\$@@; - s@genome assembly(, complete genome)?: monopartite\$@@; - s@ (1 |nasopharyngeal )?genome assembly, chromosome: .*\$@@; - s@, complete sequence@@; - s@hCo[vV]-19/@@; - s@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@; - s@SARS-CoV-2/(environment|ENV)/@env/@; - s@SARS-CoV-2/Felis catus/@cat/@; - s@SARS-CoV-2/Panthera leo/@lion/@; - s@SARS-CoV-2/Panthera tigris/@tiger/@; - s@SARS-CoV-2/@@; - s@BetaCoV/@@; - s@Homo sapines/@@; - s@ \| @ \|@; s@ \$@@; s@ @ @; - s@ \|@\t@;" + sed -re 's@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@;' $* \ + | sed -re 's@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@;' \ + | sed -re 's@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@;' \ + | sed -re 's@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@;' \ + | sed -re '/^[A-Z]+$/bx; s@[A-Za-z0-9]+ [a-z]*protein.*@@; :x;' \ + | sed -re 's@(( genomic)? RNA)?, ((nearly )?complete|partial) genome$@@;' \ + | sed -re 's@genome assembly(, complete genome)?: monopartite$@@;' \ + | sed -re 's@ (1 |nasopharyngeal )?genome assembly, chromosome: .*$@@;' \ + | sed -re 's@, complete sequence@@;' \ + | sed -re 's@humans, [A-Za-z]+,( [0-9]+ Years old)?( Adult)?/@@' \ + | sed -re 's@hCo[vV]-19/@@;' \ + | sed -re 's@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;' \ + | sed -re 's@SARS-CoV-2/(environment|ENV)/@env/@;' \ + | sed -re 's@SARS-CoV-2/Felis catus/@cat/@;' \ + | sed -re 's@SARS-CoV-2/Panthera leo/@lion/@;' \ + | sed -re 's@SARS-CoV-2/Panthera tigris/@tiger/@;' \ + | sed -re 's@SARS-CoV-2/@@;' \ + | sed -re 's@BetaCoV/@@;' \ + | sed -re 's@Homo sapines/@@;' \ + | sed -re 's@ \| @ \|@; s@ $@@; s@ @ @;' \ + | sed -re 's@ \|@\t@;' # Got rid of this: s/ ([^|])/_\1/g; } export -f cleanGenbank cleanCncb () { sed -re "s@^BetaCoV/@@; s@^hCoV-19/@@; s@^SARS-CoV-2/@@; s@^human/@@; s@ *\| *@\t@;" } export -f cleanCncb