a113d7fcbb275fcfda2b51c2bc359900531a8c0d
angie
  Tue Jun 9 13:26:57 2020 -0700
Adding more tolerance for missing data in ncov.json so I can run on a dev build of Nextstrain.

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index 710773c..a49cef1 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -85,49 +85,52 @@
             del snvEnds[ix]
     if snvEnds:
         snvEnds.sort()
         snvStarts = [ e-1 for e in snvEnds ]
         snvSizes = [ 1 for e in snvEnds ]
         clade['thickStart'] = min(snvStarts)
         clade['thickEnd'] = max(snvEnds)
         clade['name'] = name
         clade['varSizes'] = snvSizes
         clade['varStarts'] = snvStarts
         clade['varNames'] = varStr
     return clade
 
 def addDatesToClade(clade, numDateAttrs):
     """Add the numeric dates from ncov.json node_attrs.num_date to clade record"""
+    if (numDateAttrs):
         clade['dateInferred'] = numDateAttrs['value']
         clade['dateConfMin'] = numDateAttrs['confidence'][0]
         clade['dateConfMax'] = numDateAttrs['confidence'][1]
+    else:
+        clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = ''
 
 def addCountryToClade(clade, countryAttrs):
     """Add country data from ncov.json node_attrs.country to clade"""
     clade['countryInferred'] = countryAttrs['value']
     conf = countryAttrs.get('confidence')
     clade['countryConf'] = ', '.join([ "%s: %0.5f" % (country, conf)
                                        for country, conf in conf.items()]) if conf else ''
 
 def processClade(branch, tag, branchVariants, branchVarStr, clades):
     """If this is the first time we've seen (old or new) clade, add it to clades"""
     nodeAttrs = branch['node_attrs']
     if (nodeAttrs.get(tag)):
         cladeName = nodeAttrs[tag]['value']
         if (cladeName != 'unassigned' and not cladeName in clades):
             clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr)
-            addDatesToClade(clades[cladeName], nodeAttrs['num_date'])
+            addDatesToClade(clades[cladeName], nodeAttrs.get('num_date'))
             if (nodeAttrs.get('country')):
                 addCountryToClade(clades[cladeName], nodeAttrs['country'])
             elif (nodeAttrs.get('division')):
                 addCountryToClade(clades[cladeName], nodeAttrs['division'])
             clades[cladeName]['topNode'] = branch
 
 def numDateToYmd(numDate):
     """Convert numeric date (decimal year) to integer year, month, day"""
     year = int(numDate)
     isLeapYear = 1 if (year % 4 == 0) else 0
     # Get rid of the year
     numDate -= year
     # Convert to Julian day
     daysInYear = 366 if isLeapYear else 365
     jDay = int(numDate * daysInYear) + 1
@@ -149,30 +152,32 @@
         month, day = 4, (jDay - 120 - isLeapYear)
     elif (jDay > 90 + isLeapYear):
         month, day = 3, (jDay - 90 - isLeapYear)
     elif (jDay > 59 + isLeapYear):
         month, day = 2, (jDay - 59 - isLeapYear)
     elif (jDay > 31):
         month, day = 1, (jDay - 31)
     else:
         month, day = 0, jDay
     return year, month, day
 
 months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
 
 def numDateToMonthDay(numDate):
     """Transform decimal year timestamp to string with only month and day"""
+    if not numDate:
+        return '?'
     year, month, day = numDateToYmd(numDate)
     return months[month] + str(day)
 
 def numDateToYmdStr(numDate):
     """Convert decimal year to YY-MM-DD string"""
     if numDate:
         year, month, day = numDateToYmd(numDate)
         return "%02d-%02d-%02d" % (year, month+1, day)
     else:
         return ''
 
 def rUnpackNextstrainTree(branch, parentVariants, parentVarStr):
     """Recursively descend ncov.tree and build data structures for genome browser tracks"""
     # Gather variants specific to this node/branch (if any)
     localVariants = []
@@ -229,35 +234,39 @@
     if (kids):
         for child in kids:
             rUnpackNextstrainTree(child, branchVariants, branchVarStr);
     else:
         for varName in branchVariants:
             variantCounts[varName] += 1
         nodeAttrs = branch['node_attrs']
         if (nodeAttrs.get('submitting_lab')):
             lab = nodeAttrs['submitting_lab']['value']
         else:
             lab = ''
         if (nodeAttrs.get('legacy_clade_membership')):
             oldClade = nodeAttrs['legacy_clade_membership']['value']
         else:
             oldClade = ''
-        samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'],
+        epiNode = nodeAttrs.get('gisaid_epi_isl')
+        epiId = epiNode['value'] if epiNode else branch['name']
+        numDateNode = nodeAttrs.get('num_date')
+        numDate = numDateNode['value'] if numDateNode else ''
+        samples.append({ 'id': epiId,
                          'name': branch['name'],
                          'clade': nodeAttrs['clade_membership']['value'],
                          'oldClade': oldClade,
-                         'date': numDateToMonthDay(nodeAttrs['num_date']['value']),
+                         'date': numDateToMonthDay(numDate),
                          'lab': lab,
                          'variants': branchVariants,
                          'varStr': branchVarStr })
 
 rUnpackNextstrainTree(ncov['tree'], {}, '')
 
 def sampleName(sample):
     return '|'.join([sample['id'], sample['name'], sample['date']])
 
 sampleCount = len(samples)
 sampleNames = [ sampleName(sample)  for sample in samples ]
 
 # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order
 parsedVars = []
 for varName in variantCounts.keys():
@@ -377,31 +386,32 @@
 vcfForAll('nextstrainSamples.vcf', 'clade')
 # I'll skip writing an enormous file with wasteful clades-in-genotypes... really need
 # a sample metadata file!
 
 # Assign samples to clades; a sample can appear in multiple clades if they are nested.
 
 def sampleIdsFromNode(node, cladeTops=()):
     """Return a list of IDs of all samples found under node."""
     kids = node.get('children')
     if (kids):
         sampleIds = []
         for kid in kids:
             if (kid not in cladeTops):
                 sampleIds += sampleIdsFromNode(kid, cladeTops)
     else:
-        sampleId = node['node_attrs']['gisaid_epi_isl']['value']
+        epiNode = node['node_attrs'].get('gisaid_epi_isl')
+        sampleId = epiNode['value'] if epiNode else node['name']
         sampleIds = [sampleId]
     return sampleIds
 
 cladeSampleCounts = {}
 cladeSampleNames = {}
 
 def vcfForClades(clades, cladeTops=()):
     """Given a set of clades (old or new), dump out VCF for each clade.
     Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully
     contains 20B and 20C, etc."""
     for cladeName in clades:
         node = clades[cladeName]['topNode']
         cladeSampleIds = set(sampleIdsFromNode(node, cladeTops))
         cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ]
         cladeSampleCounts[cladeName] = len(cladeSampleList)
@@ -514,33 +524,35 @@
         nodeAttrs = node['node_attrs']
         if (nodeAttrs.get('clade_membership')):
             cladeName = nodeAttrs['clade_membership']['value']
         elif (parentClade):
             cladeName = parentClade
         else:
             cladeName = 'unassigned'
         color = str(cladeRgbFromName(cladeName, cladeColors))
         descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName,
                                                      varStr)
                                  for child in kids if child not in cladeTops ])
         label = '#'.join([cladeName, varStr])
         treeString = '(' + descendants + ')' + label + ':' + color
     else:
         nodeAttrs = node['node_attrs']
-        gId = nodeAttrs['gisaid_epi_isl']['value']
+        epiNode = nodeAttrs.get('gisaid_epi_isl')
+        gId = epiNode['value'] if epiNode else node['name']
         name = node['name']
-        date = numDateToMonthDay(nodeAttrs['num_date']['value'])
+        numDateNode = nodeAttrs.get('num_date')
+        date = numDateNode['value'] if numDateNode else ''
         cladeName = nodeAttrs['clade_membership']['value']
         color = str(cladeRgbFromName(cladeName, cladeColors))
         treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color
     return treeString
 
 with open('nextstrain.nh', 'w') as outF:
     outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n')
 
 if (len(oldClades)):
     with open('nextstrainOldCladeColors.nh', 'w') as outF:
         outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n')
 
 def newickForClades(clades, cladeColors, cladeTops=()):
     for cladeName in clades:
         filename = 'nextstrain' + cladeName + '.nh'