39a6a0080ab48990a02c59d21a1566d25a0103fe
angie
  Wed Mar 25 15:28:33 2020 -0700
Add sample date (just month and day; GISAID sample name ends in year) to sample labels in VCF (David request).  refs #25188

diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py
index ceaad22..8a239b5 100755
--- src/hg/utils/otto/nextstrainNcov/nextstrain.py
+++ src/hg/utils/otto/nextstrainNcov/nextstrain.py
@@ -59,30 +59,64 @@
             snvEnds.append(int(m.group(2)))
             varNames.append(varName)
     if snvEnds:
         snvEnds.sort()
         snvStarts = list(map(lambda x: x-1, snvEnds))
         snvSizes = list(map(lambda x: 1, snvEnds))
         clade['thickStart'] = min(snvStarts)
         clade['thickEnd'] = max(snvEnds)
         clade['name'] = name
         clade['color'] = cladeColorFromName(name)
         clade['varSizes'] = snvSizes
         clade['varStarts'] = snvStarts
         clade['varNames'] = varNames
     return clade
 
+def numDateToMonthDay(numDate):
+    """Transform decimal year timestamp to string with only month and day"""
+    year = int(numDate)
+    isLeapYear = 1 if (year % 4 == 0) else 0
+    # Get rid of the year
+    numDate -= year
+    # Convert to Julian day
+    jDay = int(numDate * 365) + 1
+    if (jDay > 334 + isLeapYear):
+        monthDay ="Dec" + str(jDay - 334 - isLeapYear)
+    elif (jDay > 304 + isLeapYear):
+        monthDay ="Nov" + str(jDay - 304 - isLeapYear)
+    elif (jDay > 273 + isLeapYear):
+        monthDay ="Oct" + str(jDay - 273 - isLeapYear)
+    elif (jDay > 243 + isLeapYear):
+        monthDay ="Sep" + str(jDay - 243 - isLeapYear)
+    elif (jDay > 212 + isLeapYear):
+        monthDay ="Aug" + str(jDay - 212 - isLeapYear)
+    elif (jDay > 181 + isLeapYear):
+        monthDay ="Jul" + str(jDay - 181 - isLeapYear)
+    elif (jDay > 151 + isLeapYear):
+        monthDay ="Jun" + str(jDay - 151 - isLeapYear)
+    elif (jDay > 120 + isLeapYear):
+        monthDay ="May" + str(jDay - 120 - isLeapYear)
+    elif (jDay > 90 + isLeapYear):
+        monthDay ="Apr" + str(jDay - 90 - isLeapYear)
+    elif (jDay > 59 + isLeapYear):
+        monthDay ="Mar" + str(jDay - 59 - isLeapYear)
+    elif (jDay > 31):
+        monthDay ="Feb" + str(jDay - 31)
+    else:
+        monthDay ="Jan" + str(jDay)
+    return monthDay
+
 def rUnpackNextstrainTree(branch, parentVariants):
     """Recursively descend ncov.tree and build data structures for genome browser tracks"""
     # Inherit parent variants
     branchVariants = parentVariants.copy()
     # Add variants specific to this branch (if any)
     try:
         # Nucleotide variants specific to this branch
         for varName in branch['branch_attrs']['mutations']['nuc']:
             if (snvRe.match(varName)):
                 branchVariants[varName] = 1
                 if (not variantCounts.get(varName)):
                     variantCounts[varName] = 0;
         # Amino acid variants specific to this branch
         for geneName in branch['branch_attrs']['mutations'].keys():
             if (geneName != 'nuc'):
@@ -106,48 +140,50 @@
                     else:
                         warn("Can't match amino acid change" + change)
     except KeyError:
         pass
     if (branch['node_attrs'].get('clade_membership')):
         cladeName = branch['node_attrs']['clade_membership']['value']
         if (not cladeName in clades):
             clades[cladeName] = cladeFromVariants(cladeName, branchVariants)
     kids = branch.get('children')
     if (kids):
         for child in kids:
             rUnpackNextstrainTree(child, branchVariants);
     else:
         for varName in branchVariants:
             variantCounts[varName] += 1
+        date = numDateToMonthDay(branch['node_attrs']['num_date']['value'])
         samples.append({ 'id': branch['node_attrs']['gisaid_epi_isl']['value'],
                          'name': branch['name'],
                          'clade': branch['node_attrs']['clade_membership']['value'],
+                         'date': date,
                          'variants': branchVariants })
         if (cladeName):
             if (clades[cladeName].get('sampleCount')):
                 clades[cladeName]['sampleCount'] += 1
             else:
                 clades[cladeName]['sampleCount'] = 1
             if (clades[cladeName].get('samples')):
                 clades[cladeName]['samples'].append(branch['name'])
             else:
                 clades[cladeName]['samples'] = [ branch['name'] ]
 
 rUnpackNextstrainTree(ncov['tree'], {})
 
 sampleCount = len(samples)
-sampleNames = [ sample['id'] + ':' + sample['name'] for sample in samples ]
+sampleNames = [ ':'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ]
 
 # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order
 parsedVars = []
 for varName in variantCounts.keys():
     m = snvRe.match(varName)
     if (m):
         ref, pos, alt = m.groups()
         parsedVars.append([int(pos), varName, ref, alt])
     else:
         warn("Can't match " + varName)
 # Sort by position
 def parsedVarPos(pv):
     return pv[0]
 parsedVars.sort(key=parsedVarPos)