39a6a0080ab48990a02c59d21a1566d25a0103fe angie Wed Mar 25 15:28:33 2020 -0700 Add sample date (just month and day; GISAID sample name ends in year) to sample labels in VCF (David request). refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index ceaad22..8a239b5 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -59,30 +59,64 @@ snvEnds.append(int(m.group(2))) varNames.append(varName) if snvEnds: snvEnds.sort() snvStarts = list(map(lambda x: x-1, snvEnds)) snvSizes = list(map(lambda x: 1, snvEnds)) clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['color'] = cladeColorFromName(name) clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varNames return clade +def numDateToMonthDay(numDate): + """Transform decimal year timestamp to string with only month and day""" + year = int(numDate) + isLeapYear = 1 if (year % 4 == 0) else 0 + # Get rid of the year + numDate -= year + # Convert to Julian day + jDay = int(numDate * 365) + 1 + if (jDay > 334 + isLeapYear): + monthDay ="Dec" + str(jDay - 334 - isLeapYear) + elif (jDay > 304 + isLeapYear): + monthDay ="Nov" + str(jDay - 304 - isLeapYear) + elif (jDay > 273 + isLeapYear): + monthDay ="Oct" + str(jDay - 273 - isLeapYear) + elif (jDay > 243 + isLeapYear): + monthDay ="Sep" + str(jDay - 243 - isLeapYear) + elif (jDay > 212 + isLeapYear): + monthDay ="Aug" + str(jDay - 212 - isLeapYear) + elif (jDay > 181 + isLeapYear): + monthDay ="Jul" + str(jDay - 181 - isLeapYear) + elif (jDay > 151 + isLeapYear): + monthDay ="Jun" + str(jDay - 151 - isLeapYear) + elif (jDay > 120 + isLeapYear): + monthDay ="May" + str(jDay - 120 - isLeapYear) + elif (jDay > 90 + isLeapYear): + monthDay ="Apr" + str(jDay - 90 - isLeapYear) + elif (jDay > 59 + isLeapYear): + monthDay ="Mar" + str(jDay - 59 - isLeapYear) + elif (jDay > 31): + monthDay ="Feb" + str(jDay - 31) + else: + monthDay ="Jan" + str(jDay) + return monthDay + def rUnpackNextstrainTree(branch, parentVariants): """Recursively descend ncov.tree and build data structures for genome browser tracks""" # Inherit parent variants branchVariants = parentVariants.copy() # Add variants specific to this branch (if any) try: # Nucleotide variants specific to this branch for varName in branch['branch_attrs']['mutations']['nuc']: if (snvRe.match(varName)): branchVariants[varName] = 1 if (not variantCounts.get(varName)): variantCounts[varName] = 0; # Amino acid variants specific to this branch for geneName in branch['branch_attrs']['mutations'].keys(): if (geneName != 'nuc'): @@ -106,48 +140,50 @@ else: warn("Can't match amino acid change" + change) except KeyError: pass if (branch['node_attrs'].get('clade_membership')): cladeName = branch['node_attrs']['clade_membership']['value'] if (not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants) kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants); else: for varName in branchVariants: variantCounts[varName] += 1 + date = numDateToMonthDay(branch['node_attrs']['num_date']['value']) samples.append({ 'id': branch['node_attrs']['gisaid_epi_isl']['value'], 'name': branch['name'], 'clade': branch['node_attrs']['clade_membership']['value'], + 'date': date, 'variants': branchVariants }) if (cladeName): if (clades[cladeName].get('sampleCount')): clades[cladeName]['sampleCount'] += 1 else: clades[cladeName]['sampleCount'] = 1 if (clades[cladeName].get('samples')): clades[cladeName]['samples'].append(branch['name']) else: clades[cladeName]['samples'] = [ branch['name'] ] rUnpackNextstrainTree(ncov['tree'], {}) sampleCount = len(samples) -sampleNames = [ sample['id'] + ':' + sample['name'] for sample in samples ] +sampleNames = [ ':'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ] # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) # Sort by position def parsedVarPos(pv): return pv[0] parsedVars.sort(key=parsedVarPos)