a113d7fcbb275fcfda2b51c2bc359900531a8c0d angie Tue Jun 9 13:26:57 2020 -0700 Adding more tolerance for missing data in ncov.json so I can run on a dev build of Nextstrain. diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 710773c..a49cef1 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -85,49 +85,52 @@ del snvEnds[ix] if snvEnds: snvEnds.sort() snvStarts = [ e-1 for e in snvEnds ] snvSizes = [ 1 for e in snvEnds ] clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varStr return clade def addDatesToClade(clade, numDateAttrs): """Add the numeric dates from ncov.json node_attrs.num_date to clade record""" + if (numDateAttrs): clade['dateInferred'] = numDateAttrs['value'] clade['dateConfMin'] = numDateAttrs['confidence'][0] clade['dateConfMax'] = numDateAttrs['confidence'][1] + else: + clade['dateInferred'] = clade['dateConfMin'] = clade['dateConfMax'] = '' def addCountryToClade(clade, countryAttrs): """Add country data from ncov.json node_attrs.country to clade""" clade['countryInferred'] = countryAttrs['value'] conf = countryAttrs.get('confidence') clade['countryConf'] = ', '.join([ "%s: %0.5f" % (country, conf) for country, conf in conf.items()]) if conf else '' def processClade(branch, tag, branchVariants, branchVarStr, clades): """If this is the first time we've seen (old or new) clade, add it to clades""" nodeAttrs = branch['node_attrs'] if (nodeAttrs.get(tag)): cladeName = nodeAttrs[tag]['value'] if (cladeName != 'unassigned' and not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr) - addDatesToClade(clades[cladeName], nodeAttrs['num_date']) + addDatesToClade(clades[cladeName], nodeAttrs.get('num_date')) if (nodeAttrs.get('country')): addCountryToClade(clades[cladeName], nodeAttrs['country']) elif (nodeAttrs.get('division')): addCountryToClade(clades[cladeName], nodeAttrs['division']) clades[cladeName]['topNode'] = branch def numDateToYmd(numDate): """Convert numeric date (decimal year) to integer year, month, day""" year = int(numDate) isLeapYear = 1 if (year % 4 == 0) else 0 # Get rid of the year numDate -= year # Convert to Julian day daysInYear = 366 if isLeapYear else 365 jDay = int(numDate * daysInYear) + 1 @@ -149,30 +152,32 @@ month, day = 4, (jDay - 120 - isLeapYear) elif (jDay > 90 + isLeapYear): month, day = 3, (jDay - 90 - isLeapYear) elif (jDay > 59 + isLeapYear): month, day = 2, (jDay - 59 - isLeapYear) elif (jDay > 31): month, day = 1, (jDay - 31) else: month, day = 0, jDay return year, month, day months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']; def numDateToMonthDay(numDate): """Transform decimal year timestamp to string with only month and day""" + if not numDate: + return '?' year, month, day = numDateToYmd(numDate) return months[month] + str(day) def numDateToYmdStr(numDate): """Convert decimal year to YY-MM-DD string""" if numDate: year, month, day = numDateToYmd(numDate) return "%02d-%02d-%02d" % (year, month+1, day) else: return '' def rUnpackNextstrainTree(branch, parentVariants, parentVarStr): """Recursively descend ncov.tree and build data structures for genome browser tracks""" # Gather variants specific to this node/branch (if any) localVariants = [] @@ -229,35 +234,39 @@ if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 nodeAttrs = branch['node_attrs'] if (nodeAttrs.get('submitting_lab')): lab = nodeAttrs['submitting_lab']['value'] else: lab = '' if (nodeAttrs.get('legacy_clade_membership')): oldClade = nodeAttrs['legacy_clade_membership']['value'] else: oldClade = '' - samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'], + epiNode = nodeAttrs.get('gisaid_epi_isl') + epiId = epiNode['value'] if epiNode else branch['name'] + numDateNode = nodeAttrs.get('num_date') + numDate = numDateNode['value'] if numDateNode else '' + samples.append({ 'id': epiId, 'name': branch['name'], 'clade': nodeAttrs['clade_membership']['value'], 'oldClade': oldClade, - 'date': numDateToMonthDay(nodeAttrs['num_date']['value']), + 'date': numDateToMonthDay(numDate), 'lab': lab, 'variants': branchVariants, 'varStr': branchVarStr }) rUnpackNextstrainTree(ncov['tree'], {}, '') def sampleName(sample): return '|'.join([sample['id'], sample['name'], sample['date']]) sampleCount = len(samples) sampleNames = [ sampleName(sample) for sample in samples ] # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): @@ -377,31 +386,32 @@ vcfForAll('nextstrainSamples.vcf', 'clade') # I'll skip writing an enormous file with wasteful clades-in-genotypes... really need # a sample metadata file! # Assign samples to clades; a sample can appear in multiple clades if they are nested. def sampleIdsFromNode(node, cladeTops=()): """Return a list of IDs of all samples found under node.""" kids = node.get('children') if (kids): sampleIds = [] for kid in kids: if (kid not in cladeTops): sampleIds += sampleIdsFromNode(kid, cladeTops) else: - sampleId = node['node_attrs']['gisaid_epi_isl']['value'] + epiNode = node['node_attrs'].get('gisaid_epi_isl') + sampleId = epiNode['value'] if epiNode else node['name'] sampleIds = [sampleId] return sampleIds cladeSampleCounts = {} cladeSampleNames = {} def vcfForClades(clades, cladeTops=()): """Given a set of clades (old or new), dump out VCF for each clade. Stop at nodes in cladeTops (for Nextstrain's new clade scheme where 19A is root, 20A fully contains 20B and 20C, etc.""" for cladeName in clades: node = clades[cladeName]['topNode'] cladeSampleIds = set(sampleIdsFromNode(node, cladeTops)) cladeSampleList = [ sample for sample in samples if sample['id'] in cladeSampleIds ] cladeSampleCounts[cladeName] = len(cladeSampleList) @@ -514,33 +524,35 @@ nodeAttrs = node['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] elif (parentClade): cladeName = parentClade else: cladeName = 'unassigned' color = str(cladeRgbFromName(cladeName, cladeColors)) descendants = ','.join([ rNextstrainToNewick(child, cladeColors, cladeTops, cladeName, varStr) for child in kids if child not in cladeTops ]) label = '#'.join([cladeName, varStr]) treeString = '(' + descendants + ')' + label + ':' + color else: nodeAttrs = node['node_attrs'] - gId = nodeAttrs['gisaid_epi_isl']['value'] + epiNode = nodeAttrs.get('gisaid_epi_isl') + gId = epiNode['value'] if epiNode else node['name'] name = node['name'] - date = numDateToMonthDay(nodeAttrs['num_date']['value']) + numDateNode = nodeAttrs.get('num_date') + date = numDateNode['value'] if numDateNode else '' cladeName = nodeAttrs['clade_membership']['value'] color = str(cladeRgbFromName(cladeName, cladeColors)) treeString = sampleName({ 'id': gId, 'name': name, 'date': date }) + ':' + color return treeString with open('nextstrain.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], newCladeColors) + ';\n') if (len(oldClades)): with open('nextstrainOldCladeColors.nh', 'w') as outF: outF.write(rNextstrainToNewick(ncov['tree'], oldCladeColors) + ';\n') def newickForClades(clades, cladeColors, cladeTops=()): for cladeName in clades: filename = 'nextstrain' + cladeName + '.nh'