c218231e30bb4cbc6e6536738585b3df1b3f673c
angie
  Wed Mar 17 11:51:18 2021 -0700
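Make cleanGenbank ~10x more efficient by splitting the single sed script into a pipeline of separate sed commands, to take advantage of many cores, and by adding a test and branch to the slowest command so that it is skipped for lines made up entirely of capital letters.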

diff --git src/hg/utils/otto/sarscov2phylo/util.sh src/hg/utils/otto/sarscov2phylo/util.sh
index 416a96d..e64b80d 100755
--- src/hg/utils/otto/sarscov2phylo/util.sh
+++ src/hg/utils/otto/sarscov2phylo/util.sh
@@ -15,48 +15,49 @@
 export -f xcat
 
 fastaNames () {
     xcat $1 \
     | grep ^\> | sed -re 's/^>//;'
 }
 export -f fastaNames
 
 fastaSeqCount () {
     xcat $1 \
     | grep ^\> | wc -l
 }
 export -f fastaSeqCount
 
 cleanGenbank () {
-    sed -re "s@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@;
-             s@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@;
-             s@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@;
-             s@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@;
-             s@[A-Za-z0-9]+ [a-z]*protein.*@@;
-             s@(( genomic)? RNA)?, ((nearly )?complete|partial) genome\$@@;
-             s@genome assembly(, complete genome)?: monopartite\$@@;
-             s@ (1 |nasopharyngeal )?genome assembly, chromosome: .*\$@@;
-             s@, complete sequence@@;
-             s@hCo[vV]-19/@@;
-             s@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;
-             s@SARS-CoV-2/(environment|ENV)/@env/@;
-             s@SARS-CoV-2/Felis catus/@cat/@;
-             s@SARS-CoV-2/Panthera leo/@lion/@;
-             s@SARS-CoV-2/Panthera tigris/@tiger/@;
-             s@SARS-CoV-2/@@;
-             s@BetaCoV/@@;
-             s@Homo sapines/@@;
-             s@ \| @ \|@; s@ \$@@; s@  @ @;
-             s@ \|@\t@;"
+    sed -re 's@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@;' $* \
+    | sed -re 's@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@;' \
+    | sed -re 's@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@;' \
+    | sed -re 's@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@;' \
+    | sed -re '/^[A-Z]+$/bx; s@[A-Za-z0-9]+ [a-z]*protein.*@@; :x;' \
+    | sed -re 's@(( genomic)? RNA)?, ((nearly )?complete|partial) genome$@@;' \
+    | sed -re 's@genome assembly(, complete genome)?: monopartite$@@;' \
+    | sed -re 's@ (1 |nasopharyngeal )?genome assembly, chromosome: .*$@@;' \
+    | sed -re 's@, complete sequence@@;' \
+    | sed -re 's@humans, [A-Za-z]+,( [0-9]+ Years old)?( Adult)?/@@' \
+    | sed -re 's@hCo[vV]-19/@@;' \
+    | sed -re 's@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;' \
+    | sed -re 's@SARS-CoV-2/(environment|ENV)/@env/@;' \
+    | sed -re 's@SARS-CoV-2/Felis catus/@cat/@;' \
+    | sed -re 's@SARS-CoV-2/Panthera leo/@lion/@;' \
+    | sed -re 's@SARS-CoV-2/Panthera tigris/@tiger/@;' \
+    | sed -re 's@SARS-CoV-2/@@;' \
+    | sed -re 's@BetaCoV/@@;' \
+    | sed -re 's@Homo sapines/@@;' \
+    | sed -re 's@ \| @ \|@; s@ $@@; s@  @ @;' \
+    | sed -re 's@ \|@\t@;'
 # Got rid of this:   s/ ([^|])/_\1/g;
 }
 export -f cleanGenbank
 
 cleanCncb () {
     sed -re "s@^BetaCoV/@@;
              s@^hCoV-19/@@;
              s@^SARS-CoV-2/@@;
              s@^human/@@;
              s@ *\| *@\t@;"
 }
 export -f cleanCncb