fc7f1b9f5e16f4128f56760ac87a5649ee422b9d angie Thu Mar 26 15:07:25 2020 -0700 Add TreeTime date and administrative division (location) info for clades to nextstrainClade. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index e3cd141..373713d 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -59,63 +59,91 @@ snvEnds.append(int(m.group(2))) varNames.append(varName) if snvEnds: snvEnds.sort() snvStarts = list(map(lambda x: x-1, snvEnds)) snvSizes = list(map(lambda x: 1, snvEnds)) clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['color'] = cladeColorFromName(name) clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varStr return clade -def numDateToMonthDay(numDate): - """Transform decimal year timestamp to string with only month and day""" +def addDatesToClade(clade, numDateAttrs): + """Add the numeric dates from ncov.json node_attrs.num_date to clade record""" + clade['dateInferred'] = numDateAttrs['value'] + clade['dateConfMin'] = numDateAttrs['confidence'][0] + clade['dateConfMax'] = numDateAttrs['confidence'][1] + +def addDivisionToClade(clade, divisionAttrs): + """Add the administrative division (location) data from ncov.json node_attrs.division to clade""" + clade['divInferred'] = divisionAttrs['value'] + confString = '' + for div, conf in divisionAttrs['confidence'].items(): + if (len(confString)): + confString += ', ' + confString += "%s: %0.5f" % (div, conf) + clade['divConf'] = confString + +def numDateToYmd(numDate): + """Convert numeric date (decimal year) to integer year, month, day""" year = int(numDate) isLeapYear = 1 if (year % 4 == 0) else 0 # Get rid of the year numDate -= year # Convert to Julian day jDay = int(numDate * 365) + 1 if (jDay > 334 + isLeapYear): - monthDay ="Dec" + str(jDay - 334 - isLeapYear) + month, day = 11, (jDay - 334 - isLeapYear) elif (jDay > 304 + isLeapYear): - monthDay ="Nov" + str(jDay - 304 - isLeapYear) + month, day = 10, (jDay - 304 - isLeapYear) elif (jDay > 273 + isLeapYear): - monthDay ="Oct" + str(jDay - 273 - isLeapYear) + month, day = 9, (jDay - 273 - isLeapYear) elif (jDay > 243 + isLeapYear): - monthDay ="Sep" + str(jDay - 243 - isLeapYear) + month, day = 8, (jDay - 243 - isLeapYear) elif (jDay > 212 + isLeapYear): - monthDay ="Aug" + str(jDay - 212 - isLeapYear) + month, day = 7, (jDay - 212 - isLeapYear) elif (jDay > 181 + isLeapYear): - monthDay ="Jul" + str(jDay - 181 - isLeapYear) + month, day = 6, (jDay - 181 - isLeapYear) elif (jDay > 151 + isLeapYear): - monthDay ="Jun" + str(jDay - 151 - isLeapYear) + month, day = 5, (jDay - 151 - isLeapYear) elif (jDay > 120 + isLeapYear): - monthDay ="May" + str(jDay - 120 - isLeapYear) + month, day = 4, (jDay - 120 - isLeapYear) elif (jDay > 90 + isLeapYear): - monthDay ="Apr" + str(jDay - 90 - isLeapYear) + month, day = 3, (jDay - 90 - isLeapYear) elif (jDay > 59 + isLeapYear): - monthDay ="Mar" + str(jDay - 59 - isLeapYear) + month, day = 2, (jDay - 59 - isLeapYear) elif (jDay > 31): - monthDay ="Feb" + str(jDay - 31) + month, day = 1, (jDay - 31) else: - monthDay ="Jan" + str(jDay) - return monthDay + month, day = 0, jDay + return year, month, day + +months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']; + +def numDateToMonthDay(numDate): + """Transform decimal year timestamp to string with only month and day""" + year, month, day = numDateToYmd(numDate) + return months[month] + str(day) + +def numDateToYmdStr(numDate): + """Convert decimal year to YY-MM-DD string""" + year, month, day = numDateToYmd(numDate) + return "%02d-%02d-%02d" % (year, month+1, day) def rUnpackNextstrainTree(branch, parentVariants, parentVarStr): """Recursively descend ncov.tree and build data structures for genome browser tracks""" # Gather variants specific to this node/branch (if any) localVariants = [] if (branch.get('branch_attrs') and branch['branch_attrs'].get('mutations') and branch['branch_attrs']['mutations'].get('nuc')): # Nucleotide variants specific to this branch for varName in branch['branch_attrs']['mutations']['nuc']: if (snvRe.match(varName)): localVariants.append(varName) if (not variantCounts.get(varName)): variantCounts[varName] = 0; # Amino acid variants: figure out which nucleotide variant goes with each for geneName in branch['branch_attrs']['mutations'].keys(): @@ -144,46 +172,48 @@ # Add variants specific to this branch (if any) for varName in localVariants: branchVariants[varName] = 1 # Make an ordered variant string as David requested: semicolons between nodes, # comma-separated within a node: branchVarStr = '' for varName in localVariants: if (len(branchVarStr)): branchVarStr += ', ' branchVarStr += varName aaVar = variantAaChanges.get(varName) if (aaVar): branchVarStr += ' (' + aaVar + ')' if (len(parentVarStr) and len(branchVarStr)): branchVarStr = parentVarStr + '; ' + branchVarStr - if (branch['node_attrs'].get('clade_membership')): - cladeName = branch['node_attrs']['clade_membership']['value'] + nodeAttrs = branch['node_attrs'] + if (nodeAttrs.get('clade_membership')): + cladeName = nodeAttrs['clade_membership']['value'] if (not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr) + addDatesToClade(clades[cladeName], nodeAttrs['num_date']) + addDivisionToClade(clades[cladeName], nodeAttrs['division']) kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 - date = numDateToMonthDay(branch['node_attrs']['num_date']['value']) - samples.append({ 'id': branch['node_attrs']['gisaid_epi_isl']['value'], + samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'], 'name': branch['name'], - 'clade': branch['node_attrs']['clade_membership']['value'], - 'date': date, + 'clade': nodeAttrs['clade_membership']['value'], + 'date': numDateToMonthDay(nodeAttrs['num_date']['value']), 'variants': branchVariants }) if (cladeName): if (clades[cladeName].get('sampleCount')): clades[cladeName]['sampleCount'] += 1 else: clades[cladeName]['sampleCount'] = 1 if (clades[cladeName].get('samples')): clades[cladeName]['samples'].append(branch['name']) else: clades[cladeName]['samples'] = [ branch['name'] ] rUnpackNextstrainTree(ncov['tree'], {}, '') sampleCount = len(samples) sampleNames = [ ':'.join([sample['id'], sample['name'], sample['date']]) for sample in samples ] @@ -227,18 +257,23 @@ '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') outC.close() with open('nextstrainClade.bed', 'w') as outC: for name, clade in clades.items(): if (clade.get('thickStart')): outC.write('\t'.join(map(str, [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], clade['color'], len(clade['varSizes']) + 2, '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', clade['varNames'], + numDateToYmdStr(clade['dateInferred']), + numDateToYmdStr(clade['dateConfMin']), + numDateToYmdStr(clade['dateConfMax']), + clade['divInferred'], + clade['divConf'], clade['sampleCount'], ', '.join(clade['samples']) ])) + '\n') outC.close()