src/hg/utils/otto/nextstrainNcov/virusNames.py b226cb3af001057dea4635de2d4c6c52f0ebe047

b226cb3af001057dea4635de2d4c6c52f0ebe047
angie
  Sat Jun 20 14:34:08 2020 -0700
New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build.  Updates to other VCF & tree utils.  sorta refs #25278, #25382

diff --git src/hg/utils/otto/nextstrainNcov/virusNames.py src/hg/utils/otto/nextstrainNcov/virusNames.py
index eee7ba1..1d60164 100644
--- src/hg/utils/otto/nextstrainNcov/virusNames.py
+++ src/hg/utils/otto/nextstrainNcov/virusNames.py
@@ -1,37 +1,37 @@
 # Every group uses slightly different names for their sequences/trees,
 # and then we went ahead and made our own gloms for Nextstrain VCF.
 # But there are some common components of SARS-CoV-2 sequence names that
 # can help cross-reference them:
 # * EPI_ISL_\d+
 # * Country/local-ID/20(19|20)
 
 import logging
 import re
 from collections import defaultdict
 
 # Regular expressions for picking out name components
 # GISAID ID:
-epiRe = re.compile('.*?(EPI_ISL_\d+)')
+epiRe = re.compile(r'.*?(EPI_ISL_\d+)')
 # Slash-separated country/localId/year name often shared by GISAID, NCBI, CNCB, COG-UK:
-slashRe = re.compile('.*?(\w+(\/([\w.-]+)\/20\d\d?))')
+slashRe = re.compile(r'.*?(\w+(\/([\w.-]+)\/20\d\d?\b))')
 # Slash-separated but with slashes replaced by underscores:
-undRe = re.compile('.*?([A-Za-z]+)_([\w.-]+?)_(20\d\d?)')
+undRe = re.compile(r'.*?([A-Za-z]+)_([\w.-]+?)_(20\d\d?\b)')
 # Slash-sep with underscores in country name
-slashUndRe = re.compile('.*?([A-Za-z]+_[A-Za-z]+\w+)(\/[\w-]+\/20\d\d?)')
+slashUndRe = re.compile(r'.*?([A-Za-z]+_[A-Za-z]+\w+)(\/[\w-]+\/20\d\d?\b)')
 # AZ-TGEN-TG or just AZ-TG?  (The pattern below matches only names containing USA/AZ-TG<digits>/20<digit(s)>.)
-azTgenRe = re.compile('.*?USA\/AZ-TG\d+\/20\d\d?')
+azTgenRe = re.compile(r'.*?USA\/AZ-TG\d+\/20\d\d?')
 
 def makeIdLookup(seqNames):
     """Return a dict mapping sequence names, and components of those names like
     'EPI_ISL_402121' and 'Wuhan/IVDC-HB-05/2019' from 'EPI_ISL_402121|Wuhan/IVDC-HB-05/2019|Dec30',
     to those sequence names, so that we can attempt to map a different set of names to seqNames
     even if the other names have only a component in common."""
     idLookup = defaultdict(list)
     for seqName in seqNames:
         # Map seqName to itself in case names happen to be identical
         idLookup[seqName].append(seqName)
         # Look for EPI_ISL_ component:
         epiMatch = epiRe.match(seqName)
         if (epiMatch):
             epiId = epiMatch.groups()[0]
             idLookup[epiId].append(seqName)
@@ -39,31 +39,31 @@
         slashMatch = slashRe.match(seqName)
         if (slashMatch):
             slashName, localIdYear, localId = slashMatch.groups()
             idLookup[slashName].append(seqName)
             # Sometimes a different country name is used, but the local IDs tend to be
             # pretty distinctive (except in cases where they're just a number), so
             # add just /localId/year too.
             if (not localId.isdigit()):
                 idLookup[localIdYear].append(seqName)
         else:
             if ('|Wuhan-Hu-1/2019' in seqName):
                 # Nextstrain uses countryless "Wuhan-Hu-1/2019", COG-UK uses "China/Wuhan-Hu-1/2019"
                 idLookup['China/Wuhan-Hu-1/2019'].append(seqName)
                 idLookup['Wuhan-Hu-1'].append(seqName)
             else:
-                logging.warn('No slashMatch for "' + seqName + '"')
+                logging.debug('No slashMatch for "' + seqName + '"')
     return idLookup
 
 def checkEpiIds(resultList, origEpiMatch, label):
     """Watch out for some instances of the same slash-separated names having
     different EPI IDs (same sample, different sequences?)"""
     if (origEpiMatch):
         okResults = []
         for result in resultList:
             resultEpiMatch = epiRe.match(result)
             if (resultEpiMatch and origEpiMatch.groups()[0] != resultEpiMatch.groups()[0]):
                 logging.warn("Tree label '" + label + "' and VCF result '" + result +
                              "' were joined by a component but have different EPI IDs; "
                              "ignoring.");
             else:
                 okResults.append(result)