src/hg/utils/otto/nextstrainNcov/nextstrain.py abf5dba0ce1f1cf2d3f15624272bde0854aec681

abf5dba0ce1f1cf2d3f15624272bde0854aec681
angie
  Sat Jun 20 14:24:44 2020 -0700
Don't ignore the 'mutations' that are changes between [ACGT] and '-' (missing info), because they can occasionally happen in the path of a nucleotide change, for example T28144- to -T28144C.  fixes #25751

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index a49cef1..620af5b 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -14,32 +14,32 @@
 genePos = {}
 geneBed = []
 for gene,anno in ncov['meta']['genome_annotations'].items():
     if (gene != 'nuc'):
         genePos[gene] = anno['start'] - 1
         geneBed.append([chrom, anno['start']-1, anno['end'], gene])
 def bedStart(bed):
     return bed[1]
 geneBed.sort(key=bedStart)
 with open('nextstrainGene.bed', 'w') as outG:
     for bed in geneBed:
         outG.write('\t'.join(map(str, bed)) + '\n')
 
 # Variants and "clades"
 
-snvRe = re.compile('^([ACGT])([0-9]+)([ACGT])$')
-snvAaRe = re.compile('^([A-Z*])([0-9]+)([A-Z*])$')
+snvRe = re.compile('^([ACGT-])([0-9]+)([ACGT-])$')
+snvAaRe = re.compile('^([A-Z*-])([0-9]+)([A-Z*-])$')
 
 newClades = {}
 oldClades = {}
 variantCounts = {}
 variantAaChanges = {}
 samples = []
 
 # Clades from March 15th, 2020 to early morning of June 2nd, 2020:
 oldCladeColors = { 'A1a': '73,75,225', 'A2': '75,131,233', 'A2a': '92,173,207',
                    'A3': '119,199,164', 'A6': '154,212,122', 'A7': '173,189,81',
                    'B': '233,205,74', 'B1': '255,176,65', 'B2': '255,122,53',
                    'B4': '249,53,41' }
 
 # Clades from late morning of June 2nd, 2020:
 newCladeColors = { '19A': '76,135,232',
@@ -67,30 +67,32 @@
         m = snvRe.match(varName)
         if (m):
             ref, pos, alt = m.groups()
             prevMut = changesByPos[pos]
             if (prevMut):
                 # If multi-allelic, leave the position in the list; but if back-mutation,
                 # remove the original mutation.  In either case, don't add this pos again.
                 prevIx, prevRef, prevAlt = prevMut
                 if (prevAlt == ref and prevRef == alt):
                     ixsToRemove.append(prevIx)
                     changesByPos[pos] = []
             else:
                 ix = len(snvEnds)
                 changesByPos[pos] = (ix, ref, alt)
                 snvEnds.append(int(pos))
+        else:
+            warn("cladeFromVariants: no match for SNV '%s'" % (varName))
     if ixsToRemove:
         ixsToRemove.sort(reverse=True)
         for ix in ixsToRemove:
             del snvEnds[ix]
     if snvEnds:
         snvEnds.sort()
         snvStarts = [ e-1 for e in snvEnds ]
         snvSizes = [ 1 for e in snvEnds ]
         clade['thickStart'] = min(snvStarts)
         clade['thickEnd'] = max(snvEnds)
         clade['name'] = name
         clade['varSizes'] = snvSizes
         clade['varStarts'] = snvStarts
         clade['varNames'] = varStr
     return clade
@@ -177,30 +179,32 @@
     else:
         return ''
 
 def rUnpackNextstrainTree(branch, parentVariants, parentVarStr):
     """Recursively descend ncov.tree and build data structures for genome browser tracks"""
     # Gather variants specific to this node/branch (if any)
     localVariants = []
     if (branch.get('branch_attrs') and branch['branch_attrs'].get('mutations') and
         branch['branch_attrs']['mutations'].get('nuc')):
         # Nucleotide variants specific to this branch
         for varName in branch['branch_attrs']['mutations']['nuc']:
             if (snvRe.match(varName)):
                 localVariants.append(varName)
                 if (not variantCounts.get(varName)):
                     variantCounts[varName] = 0;
+            else:
+                warn("rUnpackNextstrainTree: no snvRe match for '%s'" % (varName))
         # Amino acid variants: figure out which nucleotide variant goes with each
         for geneName in branch['branch_attrs']['mutations'].keys():
             if (geneName != 'nuc'):
                 for change in branch['branch_attrs']['mutations'][geneName]:
                     # Get nucleotide coords & figure out which nuc var this aa change corresponds to
                     aaM = snvAaRe.match(change)
                     if (aaM):
                         aaRef, aaPos, aaAlt = aaM.groups()
                         varStartMin = (int(aaPos) - 1) * 3
                         if (genePos.get(geneName)):
                             cdsStart = genePos.get(geneName)
                             varStartMin += cdsStart
                             varStartMax = varStartMin + 2
                             for varName in localVariants:
                                 ref, pos, alt = snvRe.match(varName).groups()
@@ -273,79 +277,93 @@
     m = snvRe.match(varName)
     if (m):
         ref, pos, alt = m.groups()
         parsedVars.append([int(pos), varName, ref, alt])
     else:
         warn("Can't match " + varName)
 # Sort by position
 def parsedVarPos(pv):
     return pv[0]
 parsedVars.sort(key=parsedVarPos)
 
 def parsedVarAlleleCount(pv):
     return variantCounts[pv[1]]
 
 def mergeVariants(pvList):
-    """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual allele at each sample, handling back-mutations and serial mutations."""
+    """Given a list of parsedVars [pos, varName, ref, alt] at the same pos, resolve the actual
+    allele at each sample, handling back-mutations and serial mutations."""
     # Sort by descending allele count, assuming that the ref allele of the most frequently
     # observed variant is the true ref allele.  For back-muts, ref and alt are swapped;
     # for serial muts, alt of the first is ref of the second.
     pvList.sort(key=parsedVarAlleleCount)
     pvList.reverse()
     pos, varName, trueRef, firstAlt = pvList[0]
     alts = []
     sampleAlleles = [ 0 for sample in samples ]
     backMutSamples = []
     for pv in pvList:
         thisPos, thisName, thisRef, thisAlt = pv
         if (thisPos != pos):
             warn("mergeVariants: inconsistent pos " + pos + " and " + thisPos)
         if (thisAlt == trueRef):
             # Back-mutation, not a true alt
             alIx = 0
+        elif (thisAlt == '-'):
+            # No-call, not a true alt
+            alIx = -1
         else:
             # Add to list of alts - unless it's an alt we've already seen, but from a different
             # serial mutation.  For example, there might be T>A but also T>G+G>A; don't add A twice.
             if (not thisAlt in alts):
                 alts.append(thisAlt)
                 if (thisName != varName):
                     varName += "," + thisName
             alIx = alts.index(thisAlt) + 1
         for ix, sample in enumerate(samples):
             if (sample['variants'].get(thisName)):
                 sampleAlleles[ix] = alIx
-                if (alIx == 0):
+                if (alIx == 0 and thisRef != '-'):
                     backMutSamples.append(sampleName(sample))
     # After handling back- and serial mutations, figure out true counts of each alt allele:
     altCounts = [ 0 for alt in alts ]
     for alIx in sampleAlleles:
         if (alIx > 0):
             altCounts[alIx - 1] += 1
+    startDash = trueRef + str(pos) + '-,'
+    dashToDash = startDash + '-'
+    if varName.startswith(dashToDash):
+        varName = trueRef + varName[len(dashToDash):]
+    elif varName.startswith(startDash):
+        varName = varName[len(startDash):]
     return [ [pos, varName, trueRef, ','.join(alts)],
              alts, altCounts, sampleAlleles, backMutSamples ]
 
 mergedVars = []
 
 variantsAtPos = []
 for pv in parsedVars:
     pos = pv[0]
     if (len(variantsAtPos) == 0 or pos == variantsAtPos[0][0]):
         variantsAtPos.append(pv)
     else:
-        mergedVars.append(mergeVariants(variantsAtPos))
+        mv = mergeVariants(variantsAtPos)
+        if (mv[1]):
+            mergedVars.append(mv)
         variantsAtPos = [pv]
-mergedVars.append(mergeVariants(variantsAtPos))
+mv = mergeVariants(variantsAtPos)
+if (mv[1]):
+    mergedVars.append(mv)
 
 def writeVcfHeaderExceptSamples(outF):
     """Write VCF header lines -- except for sample names (this ends with a \t not a \n)."""
     outF.write('##fileformat=VCFv4.3\n')
     outF.write('##source=nextstrain.org\n')
     outF.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) +
                '\t');
 
 def tallyAaChanges(varNameMerged):
     """If any of the merged variants cause a coding change, then produce a comma-sep string in same order as variants with corresponding change(s) or '-' if a variant does not cause a coding change.  If none of the variants cause a coding change, return the empty string."""
     varNames = varNameMerged.split(',')
     gotAaChange = 0
     aaChanges = []
     for varName in varNames:
         aaChange = variantAaChanges.get(varName)
@@ -364,31 +382,31 @@
 def vcfForAll(fileName, cladeTag):
     with open(fileName, 'w') as outC:
         writeVcfHeaderExceptSamples(outC)
         outC.write('\t'.join(sampleNames) + '\n')
         for mv in mergedVars:
             pv, alts, altCounts, sampleAlleles, backMutSamples = mv
             info = 'AC=' + ','.join(map(str, altCounts)) + ';AN=' + str(sampleCount)
             varNameMerged = pv[1]
             aaChange = tallyAaChanges(varNameMerged)
             if (len(aaChange)):
                 info += ';AACHANGE=' + aaChange
             if (len(backMutSamples)):
                 info += ';BACKMUTS=' + ','.join(backMutSamples)
             genotypes = []
             for sample, alIx in zip(samples, sampleAlleles):
-                gt = str(alIx)
+                gt = '.' if alIx < 0 else str(alIx)
                 genotypes.append(gt + ':' + sample[cladeTag])
             outC.write('\t'.join([ chrom,
                                    '\t'.join(map(str, pv)),
                                    '\t'.join(['.', 'PASS', info, 'GT:CLADE']),
                                    '\t'.join(genotypes) ]) + '\n')
 
 vcfForAll('nextstrainSamples.vcf', 'clade')
 # I'll skip writing an enormous file with wasteful clades-in-genotypes... really need
 # a sample metadata file!
 
 # Assign samples to clades; a sample can appear in multiple clades if they are nested.
 
 def sampleIdsFromNode(node, cladeTops=()):
     """Return a list of IDs of all samples found under node."""
     kids = node.get('children')
@@ -415,31 +433,31 @@
         cladeSampleIds = set(sampleIdsFromNode(node, cladeTops))
         cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ]
         cladeSampleCounts[cladeName] = len(cladeSampleList)
         cladeSampleNames[cladeName] = [ sampleName(sample) for sample in cladeSampleList ]
         with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV:
             writeVcfHeaderExceptSamples(outV)
             outV.write('\t'.join(cladeSampleNames[cladeName]) + '\n')
             for mv in mergedVars:
                 pv, alts, overallAltCounts, sampleAlleles, backMutSamples = mv
                 varNameMerged = pv[1]
                 genotypes = []
                 altCounts = [ 0 for alt in alts ]
                 acTotal=0
                 for sample, alIx in zip(samples, sampleAlleles):
                     if (sample['id'] in cladeSampleIds):
-                        gt = str(alIx)
+                        gt = '.' if (alIx < 0) else str(alIx)
                         genotypes.append(gt + ':' + cladeName)
                         if (alIx > 0):
                             altCounts[alIx - 1] += 1
                             acTotal += 1
                 if (acTotal > 0):
                     info = 'AC=' + ','.join(map(str, altCounts))
                     info += ';AN=' + str(cladeSampleCounts[cladeName])
                     aaChange = tallyAaChanges(varNameMerged)
                     if (len(aaChange)):
                         info += ';AACHANGE=' + aaChange
                     cladeBackMuts = [ sampleName for sampleName in backMutSamples
                                       if sampleName in cladeSampleNames[cladeName] ]
                     if (len(cladeBackMuts)):
                         info += ';BACKMUTS=' + ','.join(cladeBackMuts)
                     outV.write('\t'.join([ chrom,
@@ -490,69 +508,115 @@
     newCladeTops = [ newClades[cladeName]['topNode'] for cladeName in newClades ]
 vcfForClades(newClades, newCladeTops)
 bedForClades('nextstrainClade.bed', newClades, newCladeColors)
 if (len(oldClades)):
     vcfForClades(oldClades)
     bedForClades('nextstrainOldClade.bed', oldClades, oldCladeColors)
 
 # Newick-formatted tree of samples for VCF display
 def cladeRgbFromName(cladeName, cladeColors):
     """Look up the r,g,b string color for clade; convert to int RGB."""
     rgbCommaStr = cladeColorFromName(cladeName, cladeColors)
     r, g, b = [ int(x) for x in rgbCommaStr.split(',') ]
     rgb = (r << 16) | (g << 8) | b
     return rgb
 
+def resolveVarStrDashMuts(varStr):
+    """If there is a sequence of mutations like T28144- followed by -28144C, get rid of the T>-
+    and replace the ->C with T>C.  Get rid of solo basePos- because they can occur in very long
+    lists when samples have long stretches of Ns, not helpful."""
+    if (not varStr):
+        return varStr
+    nodes = [ node.split('+') for node in varStr.split('$') ]
+    posWithDash = defaultdict(list)
+    for node in nodes:
+        for varName in node:
+            (ref, pos, alt) = snvRe.match(varName).groups()
+            if ref == '-' or alt == '-':
+                posWithDash[pos].append([ref, alt])
+    if (len(posWithDash)):
+        removals = set()
+        replacements = {}
+        for pos, changes in posWithDash.items():
+            firstRef, firstAlt = changes[0]
+            if (len(changes) == 1):
+                if (firstAlt == '-'):
+                    removals.add(firstRef + pos + firstAlt)
+            elif (len(changes) == 2):
+                secondRef, secondAlt = changes[1]
+                if (firstAlt == '-' and secondRef == '-'):
+                    removals.add(firstRef + pos + firstAlt)
+                    replacements[secondRef + pos + secondAlt] = firstRef + pos + secondAlt
+                else:
+                    warn("resolveVarStrDashMuts: what do I do with %s%s%s, %s%s%s in %s?" %
+                         (firstRef, pos, firstAlt, secondRef, pos, secondAlt, varStr))
+            else:
+                warn("resolveVarStrDashMuts: more changes at %s than expected: %s" %
+                     (pos, varStr))
+        if (removals or replacements):
+            newNodes = []
+            for node in nodes:
+                newVars = []
+                for varName in node:
+                    if varName in replacements:
+                        newVars.append(replacements[varName])
+                    elif varName not in removals:
+                        newVars.append(varName)
+                if (newVars):
+                    newNodes.append(newVars)
+            varStr = '$'.join([ '+'.join(node) for node in newNodes ])
+    return varStr
+
 def rNextstrainToNewick(node, cladeColors, cladeTops=(), parentClade=None, parentVarStr=''):
     """Recursively descend ncov.tree and build Newick tree string of samples to file.
     Exclude nodes in cladeTops."""
     kids = node.get('children')
     if (kids):
         # Make a more concise variant path string than the one we make for the clade track,
         # to embed in internal node labels for Yatish's tree explorations.
         localVariants = []
         if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
             node['branch_attrs']['mutations'].get('nuc')):
             # Nucleotide variants specific to this branch
             for varName in node['branch_attrs']['mutations']['nuc']:
                 if (snvRe.match(varName)):
                     localVariants.append(varName)
         varStr = '+'.join(localVariants)
         if (len(parentVarStr) and len(varStr)):
             varStr = '$'.join([parentVarStr, varStr])
         elif (not len(varStr)):
             varStr = parentVarStr
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
         elif (parentClade):
             cladeName = parentClade
         else:
             cladeName = 'unassigned'
         color = str(cladeRgbFromName(cladeName, cladeColors))
         descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName,
                                                      varStr)
                                  for child in kids if child not in cladeTops ])
-        label = '#'.join([cladeName, varStr])
+        label = '#'.join([cladeName, resolveVarStrDashMuts(varStr)])
         treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
         epiNode = nodeAttrs.get('gisaid_epi_isl')
         gId = epiNode['value'] if epiNode else node['name']
         name = node['name']
         numDateNode = nodeAttrs.get('num_date')
-        date = numDateNode['value'] if numDateNode else ''
+        date = numDateToMonthDay(numDateNode['value'] if numDateNode else '')
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName, cladeColors))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n')
 
 if (len(oldClades)):
     with open('nextstrainOldCladeColors.nh', 'w') as outF:
         outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n')
 
 def newickForClades(clades, cladeColors, cladeTops=()):
     for cladeName in clades:
         filename = 'nextstrain' + cladeName + '.nh'
@@ -627,30 +691,32 @@
 def refAltCosts(node, pos, ref, alt, parentValue):
     nodeValue = parentValue
     mutCount = 0
     # If there are mutations on this branch, and one of them is at pos and is ref or alt
     # (i.e. not a discarded allele that we're ignoring), then change nodeValue.
     if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
         node['branch_attrs']['mutations'].get('nuc')):
         for varName in node['branch_attrs']['mutations']['nuc']:
             m = snvRe.match(varName)
             if (m):
                 mRef, mPos, mAlt = m.groups()
                 mPos = int(mPos)
                 if (mPos == pos and (mAlt == ref or mAlt == alt)):
                     nodeValue = mAlt
                     mutCount += 1
+            else:
+                warn("refAltCosts: no snvRe match for '%s'" % (varName))
     kids = node.get('children')
     if (kids):
         refCost, altCost = [0, 0]
         for kid in kids:
             kidRefCost, kidAltCost, kidMutCount = refAltCosts(kid, pos, ref, alt, nodeValue)
             refCost += min(kidRefCost, kidAltCost+1)
             altCost += min(kidRefCost+1, kidAltCost)
             mutCount += kidMutCount
     else:
         refCost = 0 if (nodeValue == ref) else 1
         altCost = 1 if (nodeValue == ref) else 0
     if ((refCost < altCost and nodeValue == alt) or
         (altCost < refCost and nodeValue == ref)):
         anyDiscordantVariants.append('\t'.join([ref + str(pos) + alt, node['name']]))
     return refCost, altCost, mutCount
@@ -704,20 +770,20 @@
         if (variantCounts.get(backMutVarName)):
             variants.append([ pos, backMutVarName, alt, ref ])
         pv, alts, altCounts, sampleAlleles, backMutSamples = mergeVariants(variants)
         if (len(alts) != 1):
             warn('Expected exactly one alt from merging ' + varName + ' and ' + backMutVarName +
                  ', but got [' + ', '.join(alts) + ']')
         info = 'AC=' + str(altCounts[0])
         info += ';AN=' + str(sampleCount)
         aaChange = tallyAaChanges(varName)
         if (len(aaChange)):
             info += ';AACHANGE=' + aaChange
         if (len(backMutSamples)):
             info += ';BACKMUTS=' + ','.join(backMutSamples)
         genotypes = []
         for sample, alIx in zip(samples, sampleAlleles):
-            gt = str(alIx)
+            gt = '.' if (alIx < 0) else str(alIx)
             genotypes.append(gt + ':' + sample['clade'])
         outF.write('\t'.join([ '\t'.join([ chrom, str(pos), varName, ref, alt,
                                            '.', 'PASS', info, 'GT:CLADE']),
                                '\t'.join(genotypes) ]) + '\n')