ac7cce26f5877b75218e08eec52eca476ddc33cc angie Mon Mar 30 14:52:32 2020 -0700 Add per-clade subtracks to Nextstrain Vars for David. Also, Nextstrain changed ncov.json to use country instead of division for location. refs #25188 diff --git src/hg/utils/otto/nextstrainNcov/nextstrain.py src/hg/utils/otto/nextstrainNcov/nextstrain.py index 373713d..4c3971e 100755 --- src/hg/utils/otto/nextstrainNcov/nextstrain.py +++ src/hg/utils/otto/nextstrainNcov/nextstrain.py @@ -65,39 +65,39 @@ clade['thickStart'] = min(snvStarts) clade['thickEnd'] = max(snvEnds) clade['name'] = name clade['color'] = cladeColorFromName(name) clade['varSizes'] = snvSizes clade['varStarts'] = snvStarts clade['varNames'] = varStr return clade def addDatesToClade(clade, numDateAttrs): """Add the numeric dates from ncov.json node_attrs.num_date to clade record""" clade['dateInferred'] = numDateAttrs['value'] clade['dateConfMin'] = numDateAttrs['confidence'][0] clade['dateConfMax'] = numDateAttrs['confidence'][1] -def addDivisionToClade(clade, divisionAttrs): - """Add the administrative division (location) data from ncov.json node_attrs.division to clade""" - clade['divInferred'] = divisionAttrs['value'] +def addCountryToClade(clade, countryAttrs): + """Add country data from ncov.json node_attrs.country to clade""" + clade['countryInferred'] = countryAttrs['value'] confString = '' - for div, conf in divisionAttrs['confidence'].items(): + for country, conf in countryAttrs['confidence'].items(): if (len(confString)): confString += ', ' - confString += "%s: %0.5f" % (div, conf) - clade['divConf'] = confString + confString += "%s: %0.5f" % (country, conf) + clade['countryConf'] = confString def numDateToYmd(numDate): """Convert numeric date (decimal year) to integer year, month, day""" year = int(numDate) isLeapYear = 1 if (year % 4 == 0) else 0 # Get rid of the year numDate -= year # Convert to Julian day jDay = int(numDate * 365) + 1 if (jDay > 334 + isLeapYear): month, day = 11, (jDay - 334 - isLeapYear) elif (jDay > 304 + isLeapYear): month, day = 10, (jDay - 304 - isLeapYear) elif (jDay > 273 + isLeapYear): month, day = 9, (jDay - 273 - isLeapYear) @@ -178,31 +178,36 @@ for varName in localVariants: if (len(branchVarStr)): branchVarStr += ', ' branchVarStr += varName aaVar = variantAaChanges.get(varName) if (aaVar): branchVarStr += ' (' + aaVar + ')' if (len(parentVarStr) and len(branchVarStr)): branchVarStr = parentVarStr + '; ' + branchVarStr nodeAttrs = branch['node_attrs'] if (nodeAttrs.get('clade_membership')): cladeName = nodeAttrs['clade_membership']['value'] if (not cladeName in clades): clades[cladeName] = cladeFromVariants(cladeName, branchVariants, branchVarStr) addDatesToClade(clades[cladeName], nodeAttrs['num_date']) - addDivisionToClade(clades[cladeName], nodeAttrs['division']) + if (nodeAttrs.get('country')): + addCountryToClade(clades[cladeName], nodeAttrs['country']) + elif (nodeAttrs.get('division')): + addCountryToClade(clades[cladeName], nodeAttrs['division']) + else: + warn('No country or division for new clade ' + cladeName) kids = branch.get('children') if (kids): for child in kids: rUnpackNextstrainTree(child, branchVariants, branchVarStr); else: for varName in branchVariants: variantCounts[varName] += 1 samples.append({ 'id': nodeAttrs['gisaid_epi_isl']['value'], 'name': branch['name'], 'clade': nodeAttrs['clade_membership']['value'], 'date': numDateToMonthDay(nodeAttrs['num_date']['value']), 'variants': branchVariants }) if (cladeName): if (clades[cladeName].get('sampleCount')): clades[cladeName]['sampleCount'] += 1 @@ -221,59 +226,96 @@ # Parse variant names like 'G11083T' into pos and alleles; bundle in VCF column order parsedVars = [] for varName in variantCounts.keys(): m = snvRe.match(varName) if (m): ref, pos, alt = m.groups() parsedVars.append([int(pos), varName, ref, alt]) else: warn("Can't match " + varName) # Sort by position def parsedVarPos(pv): return pv[0] parsedVars.sort(key=parsedVarPos) def boolToStr01(bool): + """Convert boolean to string 1 or 0.""" if (bool): return '1' else: return '0' +def writeVcfHeaderExceptSamples(outF): + """Write VCF header lines -- except for sample names (this ends with a \t not a \n).""" + outF.write('##fileformat=VCFv4.3\n') + outF.write('##source=nextstrain.org\n') + outF.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) + + '\t'); + +# VCF with open('nextstrainSamples.vcf', 'w') as outC: - outC.write('##fileformat=VCFv4.3\n') - outC.write('##source=nextstrain.org\n') - outC.write('\t'.join(['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO', 'FORMAT']) + - '\t' + '\t'.join(sampleNames) + '\n') + writeVcfHeaderExceptSamples(outC) + outC.write('\t'.join(sampleNames) + '\n') for pv in parsedVars: varName = pv[1] info = 'AC=' + str(variantCounts[varName]) + ';AN=' + str(sampleCount) aaChange = variantAaChanges.get(varName) if (aaChange): info += ';AACHANGE=' + aaChange genotypes = [] for sample in samples: gt = boolToStr01(sample['variants'].get(varName)) genotypes.append(gt + ':' + sample['clade']) outC.write('\t'.join([ chrom, '\t'.join(map(str, pv)), '\t'.join(['.', 'PASS', info, 'GT:CLADE']), '\t'.join(genotypes) ]) + '\n') outC.close() +# Per-clade VCF subset +for cladeName in clades: + if cladeName != 'unassigned': + cladeSamples = [ sample for sample in samples if (sample['clade'] == cladeName) ] + cladeSampleCount = len(cladeSamples) + cladeSampleNames = [ sample['name'] for sample in cladeSamples ] + with open('nextstrainSamples' + cladeName + '.vcf', 'w') as outV: + writeVcfHeaderExceptSamples(outV) + outV.write('\t'.join(cladeSampleNames) + '\n') + for pv in parsedVars: + varName = pv[1] + genotypes = [] + ac=0 + for sample in cladeSamples: + gt = boolToStr01(sample['variants'].get(varName)) + genotypes.append(gt) + if (sample['variants'].get(varName)): + ac += 1 + if (ac > 0): + info = 'AC=' + str(ac) + ';AN=' + str(cladeSampleCount) + aaChange = variantAaChanges.get(varName) + if (aaChange): + info += ';AACHANGE=' + aaChange + outV.write('\t'.join([ chrom, + '\t'.join(map(str, pv)), + '\t'.join(['.', 'PASS', info, 'GT']), + '\t'.join(genotypes) ]) + '\n') + outV.close() + +# BED+ file for clades with open('nextstrainClade.bed', 'w') as outC: for name, clade in clades.items(): if (clade.get('thickStart')): outC.write('\t'.join(map(str, [ chrom, 0, 29903, name, 0, '.', clade['thickStart'], clade['thickEnd'], clade['color'], len(clade['varSizes']) + 2, '1,' + ','.join(map(str, clade['varSizes'])) + ',1,', '0,' + ','.join(map(str, clade['varStarts'])) + ',29902,', clade['varNames'], numDateToYmdStr(clade['dateInferred']), numDateToYmdStr(clade['dateConfMin']), numDateToYmdStr(clade['dateConfMax']), - clade['divInferred'], - clade['divConf'], + clade['countryInferred'], + clade['countryConf'], clade['sampleCount'], ', '.join(clade['samples']) ])) + '\n') outC.close()