2e6ddaec223b5b760cbba134eee52d14a327fda3 angie Fri May 29 15:22:45 2020 -0700 Adding tree manipulation python scripts and modules that I've been working on for David et al. sorta refs #25278, #25382 diff --git src/hg/utils/otto/nextstrainNcov/virusNames.py src/hg/utils/otto/nextstrainNcov/virusNames.py new file mode 100644 index 0000000..eee7ba1 --- /dev/null +++ src/hg/utils/otto/nextstrainNcov/virusNames.py @@ -0,0 +1,132 @@ +# Every group uses slightly different names for their sequences/trees, +# and then we went ahead and made our own gloms for Nextstrain VCF. +# But there are some common components of SARS-CoV-2 sequence names that +# can help cross-reference them: +# * EPI_ISL_\d+ +# * Country/local-ID/20(19|20) + +import logging +import re +from collections import defaultdict + +# Regular expressions for picking out name components +# GISAID ID: +epiRe = re.compile('.*?(EPI_ISL_\d+)') +# Slash-separated country/localId/year name often shared by GISAID, NCBI, CNCB, COG-UK: +slashRe = re.compile('.*?(\w+(\/([\w.-]+)\/20\d\d?))') +# Slash-separated but with slashes replaced by underscores: +undRe = re.compile('.*?([A-Za-z]+)_([\w.-]+?)_(20\d\d?)') +# Slash-sep with underscores in country name +slashUndRe = re.compile('.*?([A-Za-z]+_[A-Za-z]+\w+)(\/[\w-]+\/20\d\d?)') +# AZ-TGEN-TG or just AZ-TG? +azTgenRe = re.compile('.*?USA\/AZ-TG\d+\/20\d\d?') + +def makeIdLookup(seqNames): + """Return a dict mapping sequence names, and components of those names like + 'EPI_ISL_402121' and 'Wuhan/IVDC-HB-05/2019' from 'EPI_ISL_402121|Wuhan/IVDC-HB-05/2019|Dec30', + to those sequence names, so that we can attempt to map a different set of names to seqNames + even if the other names have only a component in common.""" + idLookup = defaultdict(list) + for seqName in seqNames: + # Map seqName to itself in case names happen to be identical + idLookup[seqName].append(seqName) + # Look for EPI_ISL_ component: + epiMatch = epiRe.match(seqName) + if (epiMatch): + epiId = epiMatch.groups()[0] + idLookup[epiId].append(seqName) + # Look for Country/localId/year component: + slashMatch = slashRe.match(seqName) + if (slashMatch): + slashName, localIdYear, localId = slashMatch.groups() + idLookup[slashName].append(seqName) + # Sometimes a different country name is used, but the local IDs tend to be + # pretty distinctive (except in cases where they're just a number), so + # add just /localId/year too. + if (not localId.isdigit()): + idLookup[localIdYear].append(seqName) + else: + if ('|Wuhan-Hu-1/2019' in seqName): + # Nextstrain uses countryless "Wuhan-Hu-1/2019", COG-UK uses "China/Wuhan-Hu-1/2019" + idLookup['China/Wuhan-Hu-1/2019'].append(seqName) + idLookup['Wuhan-Hu-1'].append(seqName) + else: + logging.warn('No slashMatch for "' + seqName + '"') + return idLookup + +def checkEpiIds(resultList, origEpiMatch, label): + """Watch out for some instances of the same slash-separated names having + different EPI IDs (same sample, different sequences?)""" + if (origEpiMatch): + okResults = [] + for result in resultList: + resultEpiMatch = epiRe.match(result) + if (resultEpiMatch and origEpiMatch.groups()[0] != resultEpiMatch.groups()[0]): + logging.warn("Tree label '" + label + "' and VCF result '" + result + + "' were joined by a component but have different EPI IDs; " + "ignoring."); + else: + okResults.append(result) + if (len(okResults)): + resultList = okResults + else: + resultList = None + return resultList + +def lookupSeqName(seqName, idLookup): + """If seqName, or a component of seqName, is in idLookup, then return list of + associated name(s).""" + resultList = idLookup[seqName] + if (not resultList): + # EPI_ISL_ ID is most unambiguous + epiMatch = epiRe.match(seqName) + if (epiMatch): + epiId = epiMatch.groups()[0] + resultList = idLookup[epiId] + # Failing that, try country/localId/year + if (not resultList): + slashMatch = slashRe.match(seqName) + if (not slashMatch): + # If slashes were replaced with underscores, swap them back and try again + if (seqName == 'Wuhan-Hu-1_2019'): + resultList = idLookup['Wuhan-Hu-1'] + elif (seqName == 'canine_HongKong_20-02756_2020'): + resultList = idLookup['HongKong/20-02756/2020'] + else: + undMatch = undRe.match(seqName) + if (undMatch): + country, localId, year = undMatch.groups() + seqNameSlash = '/'.join([country, localId, year]) + slashMatch = slashRe.match(seqNameSlash) + if (slashMatch): + slashName, localIdYear, localId = slashMatch.groups() + resultList = idLookup[slashName] + if (not resultList): + resultList = idLookup[localIdYear] + resultList = checkEpiIds(resultList, epiMatch, seqName) + if (not resultList): + # Try removing underscores in the country name + slashUndMatch = slashUndRe.match(seqName) + if (slashUndMatch): + countryUnd, rest = slashUndMatch.groups() + country = countryUnd.replace('_', '') + seqNameNoUnd = country + rest + resultList = idLookup[seqNameNoUnd] + resultList = checkEpiIds(resultList, epiMatch, seqNameNoUnd) + if (not resultList): + azTgMatch = azTgenRe.match(seqName) + if (azTgMatch): + seqNameTgen = seqName.replace('AZ-TG', 'AZ-TGEN-TG') + resultList = idLookup[seqNameTgen] + resultList = checkEpiIds(resultList, epiMatch, seqNameTgen) + return resultList + +def maybeLookupSeqName(seqName, idLookup): + """If seqName, or a component of seqName, is in idLookup, then return the first + associated name; otherwise return the original name.""" + resultList = lookupSeqName(seqName, idLookup) + if (len(resultList)): + return resultList[0] + else: + return seqName +