b22c1c39b539a7e6167cf41983d09ce6cbf243e8
angie
  Mon Apr 20 21:16:43 2020 -0700
Add submitting_lab to .varPaths file.  Compute tree parsimony score for each recurrent biallelic variant for David.  refs #25188

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 85347af..8269bd9 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -200,30 +200,31 @@
                 addCountryToClade(clades[cladeName], nodeAttrs['division'])
             else:
                 warn('No country or division for new clade ' + cladeName)
             cladeNodes[cladeName] = branch
     kids = branch.get('children')
     if (kids):
         for child in kids:
             rUnpackNextstrainTree(child, branchVariants, branchVarStr);
     else:
         for varName in branchVariants:
             variantCounts[varName] += 1
         samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'],
                          'name': branch['name'],
                          'clade': nodeAttrs['clade_membership']['value'],
                          'date': numDateToMonthDay(nodeAttrs['num_date']['value']),
+                         'lab': nodeAttrs['submitting_lab']['value'],
                          'variants': branchVariants,
                          'varStr': branchVarStr })
 
 rUnpackNextstrainTree(ncov['tree'], {}, '')
 
 def sampleName(sample):
     return '|'.join([sample['id'], sample['name'], sample['date']])
 
 sampleCount = len(samples)
 sampleNames = [ sampleName(sample)  for sample in samples ]
 
 # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order
 parsedVars = []
 for varName in variantCounts.keys():
     m = snvRe.match(varName)
@@ -446,20 +447,148 @@
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree']) + ';\n')
     outF.close()
 
 for cladeName, node in cladeNodes.items():
     filename = 'nextstrain' + cladeName + '.nh'
     with open(filename, 'w') as outF:
         outF.write(rNextstrainToNewick(node) + ';\n')
         outF.close()
 
-# File with samples and their variant paths
+# File with samples and their clades, labs and variant paths
+
+apostropheSRe = re.compile("'s");
+firstLetterRe = re.compile('(\w)\w+');
+spacePunctRe = re.compile('\W');
+
+def abbreviateLab(lab):
+    """Lab names are very long and sometimes differ by punctuation or typos.  Abbreviate for easier comparison."""
+    labAbbrev = apostropheSRe.sub('', lab)
+    labAbbrev = firstLetterRe.sub(r'\1', labAbbrev, count=0)
+    labAbbrev = spacePunctRe.sub('', labAbbrev, count=0)
+    return labAbbrev
+
 with open('nextstrainSamples.varPaths', 'w') as outF:
     for sample in samples:
-        outF.write('\t'.join([sampleName(sample), sample['clade'], sample['varStr']]) + '\n');
+        lab = sample['lab']
+        labAbbrev = abbreviateLab(lab)
+        outF.write('\t'.join([sampleName(sample), sample['clade'], labAbbrev, lab,
+                              sample['varStr']]) + '\n');
+    outF.close()
+
+# Narrow down variants to "informative" set (bi-allelic, each allele supported by
+# sufficient number of samples):
+minSamples = 2
+discardedAlleles = []
+blacklist = []
+informativeVariants = []
+
+for mv in mergedVars:
+    pv, alts, altCounts, sampleAlleles, backMutSamples = mv
+    pos, varNameMerged, ref, altStr = pv
+    recurrentAlts = []
+    for alt, altCount in zip(alts, altCounts):
+        if (altCount < minSamples):
+            discardedAlleles.append([ chrom, pos-1, pos, ref + str(pos) + alt ])
+        else:
+            recurrentAlts.append(alt)
+    if (len(recurrentAlts) > 1):
+        multiRecurrentName = ','.join([ ref + str(pos) + alt for alt in recurrentAlts ])
+        blacklist.append([ chrom, pos-1, pos, multiRecurrentName ])
+    elif (len(recurrentAlts) == 1):
+        informativeVariants.append([ pos, ref, recurrentAlts[0] ])
+
+# Dump out BED files for the categories:
+with open('nextstrainDiscarded.bed', 'w') as outF:
+    for da in discardedAlleles:
+        outF.write('\t'.join(map(str, da)) + '\n');
+    outF.close()
+
+with open('nextstrainBlacklisted.bed', 'w') as outF:
+    for bl in blacklist:
+        outF.write('\t'.join(map(str, bl)) + '\n');
+    outF.close()
+
+with open('nextstrainInformative.bed', 'w') as outF:
+    for iv in informativeVariants:
+        pos, ref, alt = iv
+        outF.write('\t'.join(map(str, [ chrom, pos-1, pos, ref + str(pos) + alt ])) + '\n')
+    outF.close()
+
+# Compute parsimony score for tree at each informative variant.
+
+anyDiscordantVariants = []
+
+def refAltCosts(node, pos, ref, alt, parentValue):
+    nodeValue = parentValue
+    mutCount = 0
+    # If there are mutations on this branch, and one of them is at pos and is ref or alt
+    # (i.e. not a discarded allele that we're ignoring), then change nodeValue.
+    if (node.get('branch_attrs') and node['branch_attrs'].get('mutations') and
+        node['branch_attrs']['mutations'].get('nuc')):
+        for varName in node['branch_attrs']['mutations']['nuc']:
+            m = snvRe.match(varName)
+            if (m):
+                mRef, mPos, mAlt = m.groups()
+                mPos = int(mPos)
+                if (mPos == pos and (mAlt == ref or mAlt == alt)):
+                    nodeValue = mAlt
+                    mutCount += 1
+    kids = node.get('children')
+    if (kids):
+        refCost, altCost = [0, 0]
+        for kid in kids:
+            kidRefCost, kidAltCost, kidMutCount = refAltCosts(kid, pos, ref, alt, nodeValue)
+            refCost += min(kidRefCost, kidAltCost+1)
+            altCost += min(kidRefCost+1, kidAltCost)
+            mutCount += kidMutCount
+    else:
+        refCost = 0 if (nodeValue == ref) else 1
+        altCost = 1 if (nodeValue == ref) else 0
+    if ((refCost < altCost and nodeValue == alt) or
+        (altCost < refCost and nodeValue == ref)):
+        anyDiscordantVariants.append('\t'.join([ref + str(pos) + alt, node['name']]))
+    return refCost, altCost, mutCount
+
+parsimonyScores = []
+mutationCounts = []
+rootDiscordantVariants = []
+
+for iv in informativeVariants:
+    pos, ref, alt = iv
+    varName = ref + str(pos) + alt
+    rootRefCost, rootAltCost, mutCount = refAltCosts(ncov['tree'], pos, ref, alt, ref)
+    parsimonyScores.append([pos, min(rootRefCost, rootAltCost)])
+    mutationCounts.append([pos, mutCount])
+    rootLabel = 0 if (rootRefCost <= rootAltCost) else 1
+    if (rootLabel != 0):
+        # Note: so far this has not happened.  Seems like it would be pretty lame if it did.
+        rootDiscordantVariants.append([chrom, pos-1, pos, varName])
+
+# Write out files with discordant nodes, parsimony/mutation counts (identical!):
+with open('nextstrainRootDiscordant.bed', 'w') as outF:
+    for dv in rootDiscordantVariants:
+        outF.write('\t'.join(dv) + '\n');
+    outF.close();
+
+with open('nextstrainAnyDiscordant.txt', 'w') as outF:
+    for varName in anyDiscordantVariants:
+        outF.write(varName + '\n');
+    outF.close()
+
+# bedGraph for parsimony scores and mutation counts
+with open('nextstrainParsimony.bedGraph', 'w') as outF:
+    for ps in parsimonyScores:
+        pos, score = ps
+        outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
+    outF.close()
+
+with open('nextstrainMutCounts.bedGraph', 'w') as outF:
+    for ps in mutationCounts:
+        pos, score = ps
+        outF.write('\t'.join(map(str, [ chrom, pos-1, pos, score ])) + '\n')
     outF.close()