ee396acb1919923fb78c5b91b6b17789efba3ca3
angie
  Fri Aug 26 11:40:49 2022 -0700
More regex-cleaning of outlandish things that people put in GenBank titles.

diff --git src/hg/utils/otto/sarscov2phylo/util.sh src/hg/utils/otto/sarscov2phylo/util.sh
index 605681b..baf4574 100755
--- src/hg/utils/otto/sarscov2phylo/util.sh
+++ src/hg/utils/otto/sarscov2phylo/util.sh
@@ -1,80 +1,80 @@
 #!/bin/bash
 
 if [ $(hostname) == "hgwdev" ]; then
     export TMPDIR=/data/tmp
 fi
 
 # Define some handy functions for other bash scripts in this directory
 
 xcat () {
     fasta=$1
     if [ "${fasta##*.}" == "xz" ]; then
         xzcat $fasta
     elif [ "${fasta##*.}" == "gz" ]; then
         zcat $fasta
     else
         cat $fasta
     fi
 }
 export -f xcat
 
 fastaNames () {
     xcat $1 \
     | grep ^\> | sed -re 's/^>//;'
 }
 export -f fastaNames
 
 fastaSeqCount () {
     xcat $1 \
     | grep ^\> | wc -l
 }
 export -f fastaSeqCount
 
 cleanGenbank () {
     sed -re 's@Severe acute respiratory syndrome coronavirus 2 SARS-CoV-2/@@;' $* \
     | sed -re 's@Severe acute respiratory syndrome coronavirus 2 isolate SARS[ -]Co[Vv]-2/(human|homo ?sapiens)/@@;' \
     | sed -re 's@Mutant Severe acute respiratory syndrome coronavirus 2 clone SARS-CoV-2[_-]@@;' \
     | sed -re 's@Severe acute respiratory syndrome coronavirus 2( isolate)?( 2019_nCoV)?@@;' \
     | sed -re 's@Enter each isolate name here.*@@;' \
     | sed -re '/^[A-Z]+$/bx; s@[A-Za-z0-9]+ [a-z]*protein.*@@; :x;' \
     | sed -re 's@(( genomic)? RNA)?, ((nearly )?complete|partial) genome$@@;' \
     | sed -re 's@genome assembly(, complete genome)?: monopartite$@@;' \
     | sed -re 's@ (1 |nasopharyngeal )?genome assembly, chromosome: .*$@@;' \
     | sed -re 's@, complete sequence@@;' \
     | sed -re 's@humans, [A-Za-z]+,( [0-9]+ Years old)?( Adult)?/@@' \
     | sed -re 's@hCo[vV]-19/@@;' \
-    | sed -re 's@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)/@@;' \
+    | sed -re 's@SARS?-CoV-?2/([Hh]umai?ns?|[Hh]o[mw]o ?sapiens?)[^/]*/@@;' \
     | sed -re 's@SARS-CoV-2/HUMAN/@@;' \
     | sed -re 's@SARS-CoV-2/([Ee]nvironment|ENV)/@env/@;' \
     | sed -re 's@SARS-CoV-2/Canis lupus familiaris/@dog/@;' \
     | sed -re 's@SARS-CoV-2/Felis [Cc]atus/@cat/@;' \
     | sed -re 's@SARS-CoV-2/Panthera leo/@lion/@;' \
     | sed -re 's@SARS-CoV-2/Panthera tigris/@tiger/@;' \
     | sed -re 's@SARS-CoV-2/@@;' \
     | sed -re 's@BetaCoV/@@;' \
     | sed -re 's@Homo sapines/@@;' \
     | sed -re 's@ \| @ \|@; s@ $@@; s@[;:,]@ @g; s@  @ @g; s@[()]@@g;' \
     | sed -re 's@ \|@\t@;'
 # Got rid of this:   s/ ([^|])/_\1/g;
 }
 export -f cleanGenbank
 
 cleanCncb () {
     sed -re "s@^BetaCoV/@@;
              s@^hCoV-19/@@;
              s@^SARS-CoV-2/@@;
              s@^human/@@;
              s@ *\| *@\t@;"
 }
 export -f cleanCncb
 
 vcfSamples () {
     set +o pipefail
     xcat $1 \
     | head \
     | grep ^#CHROM \
     | sed -re 's/\t/\n/g;' \
     | tail -n+10
     set -o pipefail
 }
 export -f vcfSamples