b226cb3af001057dea4635de2d4c6c52f0ebe047 angie Sat Jun 20 14:34:08 2020 -0700 New scripts & vcf module for working with non-Nextstrain VCF and trees, e.g. Rob Lanfear's 40k sample build. Updates to other VCF & tree utils. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/virusNames.py src/hg/utils/otto/nextstrainNcov/virusNames.py index eee7ba1..1d60164 100644 --- src/hg/utils/otto/nextstrainNcov/virusNames.py +++ src/hg/utils/otto/nextstrainNcov/virusNames.py @@ -1,37 +1,37 @@ # Every group uses slightly different names for their sequences/trees, # and then we went ahead and made our own gloms for Nextstrain VCF. # But there are some common components of SARS-CoV-2 sequence names that # can help cross-reference them: # * EPI_ISL_\d+ # * Country/local-ID/20(19|20) import logging import re from collections import defaultdict # Regular expressions for picking out name components # GISAID ID: -epiRe = re.compile('.*?(EPI_ISL_\d+)') +epiRe = re.compile(r'.*?(EPI_ISL_\d+)') # Slash-separated country/localId/year name often shared by GISAID, NCBI, CNCB, COG-UK: -slashRe = re.compile('.*?(\w+(\/([\w.-]+)\/20\d\d?))') +slashRe = re.compile(r'.*?(\w+(\/([\w.-]+)\/20\d\d?\b))') # Slash-separated but with slashes replaced by underscores: -undRe = re.compile('.*?([A-Za-z]+)_([\w.-]+?)_(20\d\d?)') +undRe = re.compile(r'.*?([A-Za-z]+)_([\w.-]+?)_(20\d\d?\b)') # Slash-sep with underscores in country name -slashUndRe = re.compile('.*?([A-Za-z]+_[A-Za-z]+\w+)(\/[\w-]+\/20\d\d?)') +slashUndRe = re.compile(r'.*?([A-Za-z]+_[A-Za-z]+\w+)(\/[\w-]+\/20\d\d?\b)') # AZ-TGEN-TG or just AZ-TG? -azTgenRe = re.compile('.*?USA\/AZ-TG\d+\/20\d\d?') +azTgenRe = re.compile(r'.*?USA\/AZ-TG\d+\/20\d\d?') def makeIdLookup(seqNames): """Return a dict mapping sequence names, and components of those names like 'EPI_ISL_402121' and 'Wuhan/IVDC-HB-05/2019' from 'EPI_ISL_402121|Wuhan/IVDC-HB-05/2019|Dec30', to those sequence names, so that we can attempt to map a different set of names to seqNames even if the other names have only a component in common.""" idLookup = defaultdict(list) for seqName in seqNames: # Map seqName to itself in case names happen to be identical idLookup[seqName].append(seqName) # Look for EPI_ISL_ component: epiMatch = epiRe.match(seqName) if (epiMatch): epiId = epiMatch.groups()[0] idLookup[epiId].append(seqName) @@ -39,31 +39,31 @@ slashMatch = slashRe.match(seqName) if (slashMatch): slashName, localIdYear, localId = slashMatch.groups() idLookup[slashName].append(seqName) # Sometimes a different country name is used, but the local IDs tend to be # pretty distinctive (except in cases where they're just a number), so # add just /localId/year too. if (not localId.isdigit()): idLookup[localIdYear].append(seqName) else: if ('|Wuhan-Hu-1/2019' in seqName): # Nextstrain uses countryless "Wuhan-Hu-1/2019", COG-UK uses "China/Wuhan-Hu-1/2019" idLookup['China/Wuhan-Hu-1/2019'].append(seqName) idLookup['Wuhan-Hu-1'].append(seqName) else: - logging.warn('No slashMatch for "' + seqName + '"') + logging.debug('No slashMatch for "' + seqName + '"') return idLookup def checkEpiIds(resultList, origEpiMatch, label): """Watch out for some instances of the same slash-separated names having different EPI IDs (same sample, different sequences?)""" if (origEpiMatch): okResults = [] for result in resultList: resultEpiMatch = epiRe.match(result) if (resultEpiMatch and origEpiMatch.groups()[0] != resultEpiMatch.groups()[0]): logging.warn("Tree label '" + label + "' and VCF result '" + result + "' were joined by a component but have different EPI IDs; " "ignoring."); else: okResults.append(result)